In this script, we perform the topic model and Structure plots on the GTEX V6 data across all tissues and then exclusively on the brain tissues. We fit a topic model with \(K=15\) topics over all tissues in the GTEX study and then plot the Structure plot. We also fit a topic model with \(4\) topics on the brain data exclusively and then plot the corresponding Structure plot as well.
rm(list=ls())
library(data.table)
#install_github('kkdey/maptpx')
library(maptpx)
library(CountClust)
library(data.table)
data <- data.frame(fread('../external_data/GTEX_V6/cis_gene_expression.txt'));
##
Read 0.0% of 16069 rows
Read 62.2% of 16069 rows
Read 16069 rows and 8557 (of 8557) columns from 0.532 GB file in 00:00:09
matdata <- data[,-(1:2)];
samples_id=read.table("../external_data/GTEX_V6/samples_id.txt")[,3];
We fit a topic model for \(K=20\) over all the tissues in the GTEX V6 data.
Topic_Clus=topics(t(matdata),15,kill=0,tol=0.1);
docweights=Topic_Clus$omega;
gom_model_fit <- get(load("../external_data/GTEX_V6/gtexv6fit.k.20.master.rda"))
omega <- gom_model_fit$omega
colnames(omega) <- c(1:NCOL(omega))
# make cell sample labels
# want a version consistent with majority of the literature
sample_labels <- read.table("../external_data/GTEX_V6/samples_id.txt",
header = TRUE, sep = " ",
stringsAsFactors = FALSE)
tissue_labels <- vector("numeric", NROW(sample_labels))
tissue_labels <- sample_labels[ ,3]
# clean labels
tissue_labels[grep("Nucleus", tissue_labels)] <- "Brain -N. accumbens"
tissue_labels[grep("Putamen", tissue_labels)] <- "Brain -Putamen"
tissue_labels[grep("Caudate", tissue_labels)] <- "Brain -Caudate"
tissue_labels[grep("Gastroe", tissue_labels)] <- "Esophagus -Gastroesophageal Jn."
tissue_labels[grep("cingulate", tissue_labels)] <- "Brain - Anterior cortex (BA24)."
tissue_labels[grep("EBV", tissue_labels)] <- "Cells -EBV-lymphocytes"
tissue_labels[grep("Suprapubic", tissue_labels)] <- "Skin - Unexposed (Suprapubic)"
tissue_labels[grep("Lower Leg", tissue_labels)] <- "Skin - Sun Exposed (Lower Leg)"
# find sample orders in hierarchical clustering
docweights_per_tissue_mean <- apply(omega, 2,
function(x) { tapply(x, tissue_labels, mean) })
ordering <- heatmap(docweights_per_tissue_mean)$rowInd
# order tissue by hierarhical clustering results
tissue_levels_reordered <- unique(tissue_labels)[ordering]
annotation <- data.frame(
sample_id = paste0("X", 1:length(tissue_labels)),
tissue_label = factor(tissue_labels,
levels = rev(tissue_levels_reordered ) ) )
cols1 <- c(rev(RColorBrewer::brewer.pal(12, "Paired"))[c(3,4,7,8,11,12,5,6,9,10)],
RColorBrewer::brewer.pal(12, "Set3"))
StructureGGplot(omega = omega,
annotation= annotation,
palette = cols1,
yaxis_label = "",
order_sample = TRUE,
split_line = list(split_lwd = .1,
split_col = "white"),
axis_tick = list(axis_ticks_length = .1,
axis_ticks_lwd_y = .1,
axis_ticks_lwd_x = .1,
axis_label_size = 5,
axis_label_face="bold"))
gom_model_fit <- get(load("../external_data/GTEX_V6/gtexv6fit.k20.thin.0001.rda"))
omega_thin <- gom_model_fit$omega
colnames(omega_thin) <- c(1:NCOL(omega_thin))
sample_labels <- read.table("../external_data/GTEX_V6/samples_id.txt",
header = TRUE, sep = " ",
stringsAsFactors = FALSE)
tissue_labels <- vector("numeric", NROW(sample_labels))
tissue_labels <- sample_labels[ ,3]
annotation_thinned <- data.frame(
sample_id = paste0("X", 1:length(tissue_labels)),
tissue_label = annotation$tissue_label )
# cols_thinned <- cols1
# cols_thinned[15] <- cols1[14];
# cols_thinned[14] <- cols1[13];
# cols_thinned[10] <- cols1[10];
# cols_thinned[3] <- cols1[4];
# cols_thinned[7] <- cols1[7];
# cols_thinned[8] <- cols1[9];
# cols_thinned[11] <- cols1[12];
# cols_thinned[12] <- cols1[11];
# cols_thinned[1] <- cols1[1];
# cols_thinned[2] <- cols1[2];
# cols_thinned[4] <- cols1[3];
# cols_thinned[5] <- cols1[8];
# cols_thinned[6] <- cols1[6];
# cols_thinned[9] <- cols1[5];
# cols_thinned[13] <- cols1[15];
StructureGGplot(omega = omega_thin,
annotation= annotation,
palette = cols1,
yaxis_label = "",
order_sample = TRUE,
split_line = list(split_lwd = .1,
split_col = "white"),
axis_tick = list(axis_ticks_length = .1,
axis_ticks_lwd_y = .1,
axis_ticks_lwd_x = .1,
axis_label_size = 5,
axis_label_face="bold"))
sample_labels <- read.table("../external_data/GTEX_V6/samples_id.txt",
header = TRUE, sep = " ",
stringsAsFactors = FALSE)
brain_indices <- grep("Brain", sample_labels[,3]);
brain_data <- data[,brain_indices];
Topic_clus_brain <- topics(t(brain_data), K=4, tol=100);
write.table(Topic_clus_brain$omega, "../external_data/GTEX_V6/admix_out_GTEX_V6/omega_cis_genes_brain_2.txt")
write.table(Topic_clus_brain$theta, "../external_data/GTEX_V6/admix_out_GTEX_V6/theta_cis_genes_brain_2.txt")
omega_brain <- read.table("../external_data/GTEX_V6/admix_out_GTEX_V6/omega_cis_genes_brain_2.txt",
header = TRUE, sep = " ",
stringsAsFactors = FALSE)
dim(omega_brain)
## [1] 1259 4
colnames(omega_brain) <- c(1:NCOL(omega_brain))
head(omega_brain)
## 1 2 3 4
## V1423 0.8387476 1.880549e-07 1.605299e-01 0.0007222382
## V1424 0.9352019 7.753218e-08 6.455504e-02 0.0002429765
## V1425 0.6590782 8.951338e-02 3.115989e-07 0.2514081048
## V1426 0.6329390 5.261889e-02 2.086520e-02 0.2935769207
## V1427 0.5519008 6.069850e-02 4.256315e-02 0.3448375859
## V1428 0.5104048 3.246305e-01 1.795718e-02 0.1470075448
sample_labels <- read.table("../external_data/GTEX_V6/samples_id.txt",
header = TRUE, sep = " ",
stringsAsFactors = FALSE)
brain_labels <- sample_labels[grep("Brain", sample_labels[,3]), 3]
rownames(omega_brain) <- paste0("X", 1:length(brain_labels))
annotation <- data.frame(
sample_id = paste0("X", 1:length(brain_labels)),
tissue_label = factor(brain_labels,
levels = rev(c("Brain - Cerebellar Hemisphere",
"Brain - Cerebellum",
"Brain - Spinal cord (cervical c-1)",
"Brain - Anterior cingulate cortex (BA24)",
"Brain - Frontal Cortex (BA9)",
"Brain - Cortex",
"Brain - Hippocampus",
"Brain - Substantia nigra",
"Brain - Amygdala",
"Brain - Putamen (basal ganglia)",
"Brain - Caudate (basal ganglia)",
"Brain - Hypothalamus",
"Brain - Nucleus accumbens (basal ganglia)") ) ) )
# define colors of the clusers
cols <- c("blue", "darkgoldenrod1", "cyan", "red")
StructureGGplot(omega = omega_brain,
annotation= annotation,
palette = cols,
yaxis_label = "",
order_sample = TRUE,
split_line = list(split_lwd = .4,
split_col = "white"),
axis_tick = list(axis_ticks_length = .1,
axis_ticks_lwd_y = .1,
axis_ticks_lwd_x = .1,
axis_label_size = 3,
axis_label_face = "bold"))