GTEX V6 analysis

Objective
Data preprocessing
Structure Plot (All tissues)
Structure Plot (thinned GTEx data)
Structure plot of Brain data

Objective

In this script, we perform the topic model and Structure plots on the GTEX V6 data across all tissues and then exclusively on the brain tissues. We fit a topic model with \(K=15\) topics over all tissues in the GTEX study and then plot the Structure plot. We also fit a topic model with \(4\) topics on the brain data exclusively and then plot the corresponding Structure plot as well.

rm(list=ls())
library(data.table)
#install_github('kkdey/maptpx') 
library(maptpx)
library(CountClust)
library(data.table)

Data preprocessing

data <- data.frame(fread('../external_data/GTEX_V6/cis_gene_expression.txt'));

## 
Read 0.0% of 16069 rows
Read 62.2% of 16069 rows
Read 16069 rows and 8557 (of 8557) columns from 0.532 GB file in 00:00:09

matdata <- data[,-(1:2)];
samples_id=read.table("../external_data/GTEX_V6/samples_id.txt")[,3];

We fit a topic model for \(K=20\) over all the tissues in the GTEX V6 data.

Topic_Clus=topics(t(matdata),15,kill=0,tol=0.1);
docweights=Topic_Clus$omega;

Structure Plot (All tissues)

gom_model_fit <- get(load("../external_data/GTEX_V6/gtexv6fit.k.20.master.rda"))
omega <- gom_model_fit$omega
colnames(omega) <- c(1:NCOL(omega))

# make cell sample labels
# want a version consistent with majority of the literature
sample_labels <- read.table("../external_data/GTEX_V6/samples_id.txt",
                            header = TRUE, sep = " ",
                            stringsAsFactors = FALSE)
tissue_labels <- vector("numeric", NROW(sample_labels))
tissue_labels <- sample_labels[ ,3]

# clean labels
tissue_labels[grep("Nucleus", tissue_labels)] <- "Brain -N. accumbens"
tissue_labels[grep("Putamen", tissue_labels)] <- "Brain -Putamen"
tissue_labels[grep("Caudate", tissue_labels)] <- "Brain -Caudate"
tissue_labels[grep("Gastroe", tissue_labels)] <- "Esophagus -Gastroesophageal Jn."
tissue_labels[grep("cingulate", tissue_labels)] <- "Brain - Anterior cortex (BA24)."
tissue_labels[grep("EBV", tissue_labels)] <- "Cells -EBV-lymphocytes"
tissue_labels[grep("Suprapubic", tissue_labels)] <- "Skin - Unexposed (Suprapubic)"
tissue_labels[grep("Lower Leg", tissue_labels)] <- "Skin - Sun Exposed (Lower Leg)"

# find sample orders in hierarchical clustering
docweights_per_tissue_mean <- apply(omega, 2,
                                    function(x) { tapply(x, tissue_labels, mean) })
ordering <- heatmap(docweights_per_tissue_mean)$rowInd

# order tissue by hierarhical clustering results
tissue_levels_reordered <- unique(tissue_labels)[ordering]


annotation <- data.frame(
    sample_id = paste0("X", 1:length(tissue_labels)),
    tissue_label = factor(tissue_labels,
                          levels = rev(tissue_levels_reordered ) ) )


cols1 <- c(rev(RColorBrewer::brewer.pal(12, "Paired"))[c(3,4,7,8,11,12,5,6,9,10)],
           RColorBrewer::brewer.pal(12, "Set3"))

StructureGGplot(omega = omega,
                annotation= annotation,
                palette = cols1,
                yaxis_label = "",
                order_sample = TRUE,
                split_line = list(split_lwd = .1,
                                  split_col = "white"),
                axis_tick = list(axis_ticks_length = .1,
                                 axis_ticks_lwd_y = .1,
                                 axis_ticks_lwd_x = .1,
                                 axis_label_size = 5,
                                 axis_label_face="bold"))

Structure Plot (thinned GTEx data)

gom_model_fit <- get(load("../external_data/GTEX_V6/gtexv6fit.k20.thin.0001.rda"))
omega_thin <- gom_model_fit$omega
colnames(omega_thin) <- c(1:NCOL(omega_thin))

sample_labels <- read.table("../external_data/GTEX_V6/samples_id.txt",
                            header = TRUE, sep = " ",
                            stringsAsFactors = FALSE)
tissue_labels <- vector("numeric", NROW(sample_labels))
tissue_labels <- sample_labels[ ,3]

annotation_thinned <- data.frame(
    sample_id = paste0("X", 1:length(tissue_labels)),
    tissue_label = annotation$tissue_label )

# cols_thinned <- cols1
# cols_thinned[15] <- cols1[14];
# cols_thinned[14] <- cols1[13];
# cols_thinned[10] <- cols1[10];
# cols_thinned[3] <- cols1[4];
# cols_thinned[7] <- cols1[7];
# cols_thinned[8] <- cols1[9];
# cols_thinned[11] <- cols1[12];
# cols_thinned[12] <- cols1[11];
# cols_thinned[1] <- cols1[1];
# cols_thinned[2] <- cols1[2];
# cols_thinned[4] <- cols1[3];
# cols_thinned[5] <- cols1[8];
# cols_thinned[6] <- cols1[6];
# cols_thinned[9] <- cols1[5];
# cols_thinned[13] <- cols1[15];

StructureGGplot(omega = omega_thin,
                annotation= annotation,
                palette = cols1,
                yaxis_label = "",
                order_sample = TRUE,
                split_line = list(split_lwd = .1,
                                  split_col = "white"),
                axis_tick = list(axis_ticks_length = .1,
                                 axis_ticks_lwd_y = .1,
                                 axis_ticks_lwd_x = .1,
                                 axis_label_size = 5,
                                 axis_label_face="bold"))

Structure plot of Brain data

sample_labels <- read.table("../external_data/GTEX_V6/samples_id.txt",
                            header = TRUE, sep = " ",
                            stringsAsFactors = FALSE)
brain_indices <- grep("Brain", sample_labels[,3]);
brain_data <- data[,brain_indices];
Topic_clus_brain <- topics(t(brain_data), K=4, tol=100);

write.table(Topic_clus_brain$omega, "../external_data/GTEX_V6/admix_out_GTEX_V6/omega_cis_genes_brain_2.txt")
write.table(Topic_clus_brain$theta, "../external_data/GTEX_V6/admix_out_GTEX_V6/theta_cis_genes_brain_2.txt")

omega_brain <- read.table("../external_data/GTEX_V6/admix_out_GTEX_V6/omega_cis_genes_brain_2.txt",
                    header = TRUE, sep = " ",
                    stringsAsFactors = FALSE)
dim(omega_brain)

## [1] 1259    4

colnames(omega_brain) <- c(1:NCOL(omega_brain))
head(omega_brain)

##               1            2            3            4
## V1423 0.8387476 1.880549e-07 1.605299e-01 0.0007222382
## V1424 0.9352019 7.753218e-08 6.455504e-02 0.0002429765
## V1425 0.6590782 8.951338e-02 3.115989e-07 0.2514081048
## V1426 0.6329390 5.261889e-02 2.086520e-02 0.2935769207
## V1427 0.5519008 6.069850e-02 4.256315e-02 0.3448375859
## V1428 0.5104048 3.246305e-01 1.795718e-02 0.1470075448

sample_labels <- read.table("../external_data/GTEX_V6/samples_id.txt",
                            header = TRUE, sep = " ",
                            stringsAsFactors = FALSE)
brain_labels <- sample_labels[grep("Brain", sample_labels[,3]), 3]

rownames(omega_brain) <- paste0("X", 1:length(brain_labels))
annotation <- data.frame(
    sample_id = paste0("X", 1:length(brain_labels)),
    tissue_label = factor(brain_labels,
                          levels = rev(c("Brain - Cerebellar Hemisphere",
                                     "Brain - Cerebellum",
                                     "Brain - Spinal cord (cervical c-1)",
                                     "Brain - Anterior cingulate cortex (BA24)",
                                     "Brain - Frontal Cortex (BA9)",
                                     "Brain - Cortex",
                                     "Brain - Hippocampus",
                                     "Brain - Substantia nigra",
                                     "Brain - Amygdala",
                                     "Brain - Putamen (basal ganglia)",
                                     "Brain - Caudate (basal ganglia)",
                                     "Brain - Hypothalamus",
                                     "Brain - Nucleus accumbens (basal ganglia)") ) ) )

# define colors of the clusers
cols <- c("blue", "darkgoldenrod1", "cyan", "red")

StructureGGplot(omega = omega_brain,
                annotation= annotation,
                palette = cols,
                yaxis_label = "",
                order_sample = TRUE,
                split_line = list(split_lwd = .4,
                                  split_col = "white"),
                axis_tick = list(axis_ticks_length = .1,
                                 axis_ticks_lwd_y = .1,
                                 axis_ticks_lwd_x = .1,
                                 axis_label_size = 3,
                                 axis_label_face = "bold"))