In this script, we perform the topic model and Structure plots on the GTEX V6 data across all tissues and then exclusively on the brain tissues. We fit a topic model with \(K=15\) topics over all tissues in the GTEX study and then plot the Structure plot. We also fit a topic model with \(4\) topics on the brain data exclusively and then plot the corresponding Structure plot as well.
rm(list=ls())
library(data.table)
#install_github('kkdey/maptpx')
library(maptpx)
library(CountClust)
library(data.table)
data <- data.frame(fread('../external_data/GTEX_V6/cis_gene_expression.txt'));
##
Read 0.0% of 16069 rows
Read 62.2% of 16069 rows
Read 16069 rows and 8557 (of 8557) columns from 0.532 GB file in 00:00:09
matdata <- data[,-(1:2)];
samples_id=read.table("../external_data/GTEX_V6/samples_id.txt")[,3];
We fit a topic model for \(K=20\) over all the tissues in the GTEX V6 data.
Topic_Clus=topics(t(matdata),15,kill=0,tol=0.1);
docweights=Topic_Clus$omega;
gom_model_fit <- get(load("../external_data/GTEX_V6/gtexv6fit.k.20.master.rda"))
omega <- gom_model_fit$omega
colnames(omega) <- c(1:NCOL(omega))
# make cell sample labels
# want a version consistent with majority of the literature
sample_labels <- read.table("../external_data/GTEX_V6/samples_id.txt",
header = TRUE, sep = " ",
stringsAsFactors = FALSE)
tissue_labels <- vector("numeric", NROW(sample_labels))
tissue_labels <- sample_labels[ ,3]
# clean labels
tissue_labels[grep("Nucleus", tissue_labels)] <- "Brain -N. accumbens"
tissue_labels[grep("Putamen", tissue_labels)] <- "Brain -Putamen"
tissue_labels[grep("Caudate", tissue_labels)] <- "Brain -Caudate"
tissue_labels[grep("Gastroe", tissue_labels)] <- "Esophagus -Gastroesophageal Jn."
tissue_labels[grep("cingulate", tissue_labels)] <- "Brain - Anterior cortex (BA24)."
tissue_labels[grep("EBV", tissue_labels)] <- "Cells -EBV-lymphocytes"
tissue_labels[grep("Suprapubic", tissue_labels)] <- "Skin - Unexposed (Suprapubic)"
tissue_labels[grep("Lower Leg", tissue_labels)] <- "Skin - Sun Exposed (Lower Leg)"
# find sample orders in hierarchical clustering
docweights_per_tissue_mean <- apply(omega, 2,
function(x) { tapply(x, tissue_labels, mean) })
ordering <- heatmap(docweights_per_tissue_mean)$rowInd
# order tissue by hierarhical clustering results
tissue_levels_reordered <- unique(tissue_labels)[ordering]
annotation <- data.frame(
sample_id = paste0("X", 1:length(tissue_labels)),
tissue_label = factor(tissue_labels,
levels = rev(tissue_levels_reordered ) ) )
cols1 <- c(rev(RColorBrewer::brewer.pal(12, "Paired"))[c(3,4,7,8,11,12,5,6,9,10)],
RColorBrewer::brewer.pal(12, "Set3"))
StructureGGplot(omega = omega,
annotation= annotation,
palette = cols1,
yaxis_label = "",
order_sample = TRUE,
split_line = list(split_lwd = .1,
split_col = "white"),
axis_tick = list(axis_ticks_length = .1,
axis_ticks_lwd_y = .1,
axis_ticks_lwd_x = .1,
axis_label_size = 5,
axis_label_face="bold"))