Objective

In this script, we perform the topic model and Structure plots on the GTEX V6 data across all tissues and then exclusively on the brain tissues. We fit a topic model with \(K=15\) topics over all tissues in the GTEX study and then plot the Structure plot. We also fit a topic model with \(4\) topics on the brain data exclusively and then plot the corresponding Structure plot as well.

rm(list=ls())
library(data.table)
#install_github('kkdey/maptpx') 
library(maptpx)
library(CountClust)
library(data.table)

Data preprocessing

data <- data.frame(fread('../external_data/GTEX_V6/cis_gene_expression.txt'));
## 
Read 0.0% of 16069 rows
Read 62.2% of 16069 rows
Read 16069 rows and 8557 (of 8557) columns from 0.532 GB file in 00:00:09
matdata <- data[,-(1:2)];
samples_id=read.table("../external_data/GTEX_V6/samples_id.txt")[,3];

We fit a topic model for \(K=20\) over all the tissues in the GTEX V6 data.

Topic_Clus=topics(t(matdata),15,kill=0,tol=0.1);
docweights=Topic_Clus$omega;

Structure Plot (All tissues)

gom_model_fit <- get(load("../external_data/GTEX_V6/gtexv6fit.k.20.master.rda"))
omega <- gom_model_fit$omega
colnames(omega) <- c(1:NCOL(omega))

# make cell sample labels
# want a version consistent with majority of the literature
sample_labels <- read.table("../external_data/GTEX_V6/samples_id.txt",
                            header = TRUE, sep = " ",
                            stringsAsFactors = FALSE)
tissue_labels <- vector("numeric", NROW(sample_labels))
tissue_labels <- sample_labels[ ,3]

# clean labels
tissue_labels[grep("Nucleus", tissue_labels)] <- "Brain -N. accumbens"
tissue_labels[grep("Putamen", tissue_labels)] <- "Brain -Putamen"
tissue_labels[grep("Caudate", tissue_labels)] <- "Brain -Caudate"
tissue_labels[grep("Gastroe", tissue_labels)] <- "Esophagus -Gastroesophageal Jn."
tissue_labels[grep("cingulate", tissue_labels)] <- "Brain - Anterior cortex (BA24)."
tissue_labels[grep("EBV", tissue_labels)] <- "Cells -EBV-lymphocytes"
tissue_labels[grep("Suprapubic", tissue_labels)] <- "Skin - Unexposed (Suprapubic)"
tissue_labels[grep("Lower Leg", tissue_labels)] <- "Skin - Sun Exposed (Lower Leg)"

# find sample orders in hierarchical clustering
docweights_per_tissue_mean <- apply(omega, 2,
                                    function(x) { tapply(x, tissue_labels, mean) })
ordering <- heatmap(docweights_per_tissue_mean)$rowInd

# order tissue by hierarhical clustering results
tissue_levels_reordered <- unique(tissue_labels)[ordering]


annotation <- data.frame(
    sample_id = paste0("X", 1:length(tissue_labels)),
    tissue_label = factor(tissue_labels,
                          levels = rev(tissue_levels_reordered ) ) )


cols1 <- c(rev(RColorBrewer::brewer.pal(12, "Paired"))[c(3,4,7,8,11,12,5,6,9,10)],
           RColorBrewer::brewer.pal(12, "Set3"))
StructureGGplot(omega = omega,
                annotation= annotation,
                palette = cols1,
                yaxis_label = "",
                order_sample = TRUE,
                split_line = list(split_lwd = .1,
                                  split_col = "white"),
                axis_tick = list(axis_ticks_length = .1,
                                 axis_ticks_lwd_y = .1,
                                 axis_ticks_lwd_x = .1,
                                 axis_label_size = 5,
                                 axis_label_face="bold"))

Structure Plot (thinned GTEx data)

gom_model_fit <- get(load("../external_data/GTEX_V6/gtexv6fit.k20.thin.0001.rda"))
omega_thin <- gom_model_fit$omega
colnames(omega_thin) <- c(1:NCOL(omega_thin))

sample_labels <- read.table("../external_data/GTEX_V6/samples_id.txt",
                            header = TRUE, sep = " ",
                            stringsAsFactors = FALSE)
tissue_labels <- vector("numeric", NROW(sample_labels))
tissue_labels <- sample_labels[ ,3]

annotation_thinned <- data.frame(
    sample_id = paste0("X", 1:length(tissue_labels)),
    tissue_label = annotation$tissue_label )

# cols_thinned <- cols1
# cols_thinned[15] <- cols1[14];
# cols_thinned[14] <- cols1[13];
# cols_thinned[10] <- cols1[10];
# cols_thinned[3] <- cols1[4];
# cols_thinned[7] <- cols1[7];
# cols_thinned[8] <- cols1[9];
# cols_thinned[11] <- cols1[12];
# cols_thinned[12] <- cols1[11];
# cols_thinned[1] <- cols1[1];
# cols_thinned[2] <- cols1[2];
# cols_thinned[4] <- cols1[3];
# cols_thinned[5] <- cols1[8];
# cols_thinned[6] <- cols1[6];
# cols_thinned[9] <- cols1[5];
# cols_thinned[13] <- cols1[15];
StructureGGplot(omega = omega_thin,
                annotation= annotation,
                palette = cols1,
                yaxis_label = "",
                order_sample = TRUE,
                split_line = list(split_lwd = .1,
                                  split_col = "white"),
                axis_tick = list(axis_ticks_length = .1,
                                 axis_ticks_lwd_y = .1,
                                 axis_ticks_lwd_x = .1,
                                 axis_label_size = 5,
                                 axis_label_face="bold"))

Structure plot of Brain data

sample_labels <- read.table("../external_data/GTEX_V6/samples_id.txt",
                            header = TRUE, sep = " ",
                            stringsAsFactors = FALSE)
brain_indices <- grep("Brain", sample_labels[,3]);
brain_data <- data[,brain_indices];
Topic_clus_brain <- topics(t(brain_data), K=4, tol=100);

write.table(Topic_clus_brain$omega, "../external_data/GTEX_V6/admix_out_GTEX_V6/omega_cis_genes_brain_2.txt")
write.table(Topic_clus_brain$theta, "../external_data/GTEX_V6/admix_out_GTEX_V6/theta_cis_genes_brain_2.txt")
omega_brain <- read.table("../external_data/GTEX_V6/admix_out_GTEX_V6/omega_cis_genes_brain_2.txt",
                    header = TRUE, sep = " ",
                    stringsAsFactors = FALSE)
dim(omega_brain)
## [1] 1259    4
colnames(omega_brain) <- c(1:NCOL(omega_brain))
head(omega_brain)
##               1            2            3            4
## V1423 0.8387476 1.880549e-07 1.605299e-01 0.0007222382
## V1424 0.9352019 7.753218e-08 6.455504e-02 0.0002429765
## V1425 0.6590782 8.951338e-02 3.115989e-07 0.2514081048
## V1426 0.6329390 5.261889e-02 2.086520e-02 0.2935769207
## V1427 0.5519008 6.069850e-02 4.256315e-02 0.3448375859
## V1428 0.5104048 3.246305e-01 1.795718e-02 0.1470075448
sample_labels <- read.table("../external_data/GTEX_V6/samples_id.txt",
                            header = TRUE, sep = " ",
                            stringsAsFactors = FALSE)
brain_labels <- sample_labels[grep("Brain", sample_labels[,3]), 3]

rownames(omega_brain) <- paste0("X", 1:length(brain_labels))
annotation <- data.frame(
    sample_id = paste0("X", 1:length(brain_labels)),
    tissue_label = factor(brain_labels,
                          levels = rev(c("Brain - Cerebellar Hemisphere",
                                     "Brain - Cerebellum",
                                     "Brain - Spinal cord (cervical c-1)",
                                     "Brain - Anterior cingulate cortex (BA24)",
                                     "Brain - Frontal Cortex (BA9)",
                                     "Brain - Cortex",
                                     "Brain - Hippocampus",
                                     "Brain - Substantia nigra",
                                     "Brain - Amygdala",
                                     "Brain - Putamen (basal ganglia)",
                                     "Brain - Caudate (basal ganglia)",
                                     "Brain - Hypothalamus",
                                     "Brain - Nucleus accumbens (basal ganglia)") ) ) )

# define colors of the clusers
cols <- c("blue", "darkgoldenrod1", "cyan", "red")
StructureGGplot(omega = omega_brain,
                annotation= annotation,
                palette = cols,
                yaxis_label = "",
                order_sample = TRUE,
                split_line = list(split_lwd = .4,
                                  split_col = "white"),
                axis_tick = list(axis_ticks_length = .1,
                                 axis_ticks_lwd_y = .1,
                                 axis_ticks_lwd_x = .1,
                                 axis_label_size = 3,
                                 axis_label_face = "bold"))