In this script, we perform the Grade of Membership model (GoM) on the dataset obtained from the study due to Deng . Check the paper.
The dataset has single cells sequenced starting from zygote through the 2 cell, 4 cell all the way up to blastocyst. We want to cluster the cells by development phase, also taking into account the information about the embryo of the origin of the cells and try to see which genes are actually driving the clusters in the development phase. These would be the genes that would also play a significant part in differentiating the various stages of the development.
rm(list=ls())
library(data.table)
#install_github('kkdey/maptpx')
library(maptpx)
library(CountClust)
library(data.table)
files <- list.files("../external_data/Deng_Data/Deng_files/");
num <- 1
temp_data <- data.frame(fread(paste0('../external_data/Deng_Data/Deng_files/',files[num])));
gene_names <- temp_data$X.Gene_symbol;
reads_mat <- cbind.data.frame(gene_names);
for(num in 1:length(files))
{
temp_data <- data.frame(fread(paste0('../data/Deng_data/Deng_files/',files[num])));
reads_mat <- cbind.data.frame(reads_mat,temp_data$reads);
}
cell_meta <- unlist(lapply(files, function(x) strsplit(x,"_")[[1]][2]));
colnames(reads_mat) <- c("gene_names",files);
reads_no_dups <- reads_mat %>%
group_by(gene_names) %>%
summarise_each(funs(sum))
reads_no_dups <- data.frame(reads_no_dups)
gene_names_new <- reads_no_dups[,1]
reads_no_dups <- reads_no_dups[,-1];
rownames(reads_no_dups) <- gene_names_new;
colnames(reads_no_dups) <- cell_meta;
dim(reads_no_dups);
write.table(reads_no_dups,"../external_data/Deng_cell_data.txt");
Now we load the data.
reads <- data.frame(fread('../external_data/Deng_Data/Deng_cell_data.txt'),row.names=1);
files <- list.files("../external_data/Deng_Data/Deng_files/");
cell_meta <- unlist(lapply(files, function(x) strsplit(x,"_")[[1]][2]));
embryo_id <- unlist(lapply(files, function(x) strsplit(strsplit(x,"_")[[1]][3],"-")[[1]][1]));
cell_meta[grep("zy",cell_meta)]="zy";
cell_meta[grep("smartseq2", files)]="8cell_nd";
cell_meta[grep("8cell_2pooled", files)]="8cell_nd";
cell_meta[grep("8cell_split", files)]="8cell_nd";
cell_meta[grep("16cell_2pooled", files)]="16cell_nd";
cell_meta[grep("16cell_split", files)]="16cell_nd";
indices_not_reqd <- which(cell_meta=="BXC" | cell_meta=="C57twocell" | cell_meta=="fibroblast" | cell_meta =="8cell_nd" | cell_meta == "16cell_nd");
cell_meta <- cell_meta[-indices_not_reqd];
embryo_id <- embryo_id[-indices_not_reqd];
embryo_id[which(embryo_id == "expression.txt")]="."
cell_embryo <- paste0(cell_meta,"_", embryo_id);
reads <- reads[,-indices_not_reqd];
cell_meta_unique <- c("zy","early2cell","mid2cell","late2cell","4cell","8cell","16cell","earlyblast","midblast","lateblast") ;
order_of_development <- order(match(cell_meta,cell_meta_unique))
reads <- reads[,order_of_development];
cell_meta <- cell_meta[order_of_development]
cell_embryo <- cell_embryo[order_of_development];
colnames(reads) <- cell_meta;
load("../rdas/deng_topic_fit.rda")
# extract the omega matrix: membership weights of each cell
names(Topic_clus_list)
## [1] "clust_2" "clust_3" "clust_4" "clust_5" "clust_6" "clust_7"
str(Topic_clus_list$clust_6)
## List of 6
## $ K : int 6
## $ theta: num [1:22431, 1:6] 1.32e-05 7.60e-05 3.60e-05 1.19e-05 6.47e-05 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ phrase: chr [1:22431] "0610005C13Rik" "0610007C21Rik" "0610007L01Rik" "0610007P08Rik" ...
## .. ..$ topic : chr [1:6] "1" "2" "3" "4" ...
## $ omega: num [1:259, 1:6] 1.26e-06 2.47e-04 1.88e-04 9.90e-06 3.47e-07 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ document: chr [1:259] "zy" "zy" "zy" "zy" ...
## .. ..$ topic : chr [1:6] "1" "2" "3" "4" ...
## $ BF : NULL
## $ D :List of 3
## ..$ dispersion: num 380
## ..$ pvalue : num 0
## ..$ df : num 4668431
## $ X :List of 6
## ..$ i : int [1:2307465] 5 13 14 15 16 18 19 20 21 22 ...
## ..$ j : int [1:2307465] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ v : int [1:2307465] 2 46 376 130 84 48 77 147 48 16 ...
## ..$ nrow : int 259
## ..$ ncol : int 22431
## ..$ dimnames:List of 2
## .. ..$ : chr [1:259] "zy" "zy" "zy" "zy" ...
## .. ..$ : chr [1:22431] "0610005C13Rik" "0610007C21Rik" "0610007L01Rik" "0610007P08Rik" ...
## ..- attr(*, "class")= chr "simple_triplet_matrix"
## - attr(*, "class")= chr "topics"
omega <- Topic_clus_list$clust_6$omega
# import embryonl labels
embryo_label <- read.table("../external_data/Deng_Data/cell_labels_phase_embryo.txt",
quote = "\"",
header = TRUE,
stringsAsFactors = FALSE)$x
head(embryo_label, 20)
## [1] "zy_." "zy_." "zy_." "zy_."
## [5] "early2cell_0r" "early2cell_0r" "early2cell_1" "early2cell_1"
## [9] "early2cell_2" "early2cell_2" "early2cell_3" "early2cell_3"
## [13] "mid2cell_0r" "mid2cell_0r" "mid2cell_3" "mid2cell_3"
## [17] "mid2cell_4" "mid2cell_4" "mid2cell_5" "mid2cell_5"
table(embryo_label)
## embryo_label
## 16cell_1 16cell_4 16cell_5 16cell_6 4cell_1
## 14 11 13 12 3
## 4cell_2 4cell_3 4cell_4 8cell_1 8cell_2
## 4 3 4 7 7
## 8cell_5 8cell_8 early2cell_0r early2cell_1 early2cell_2
## 7 7 2 2 2
## early2cell_3 earlyblast_2 earlyblast_3 earlyblast_4 late2cell_5
## 2 15 15 13 2
## late2cell_6 late2cell_7 late2cell_8 late2cell_9 lateblast_1
## 2 2 2 2 19
## lateblast_2 mid2cell_0r mid2cell_3 mid2cell_4 mid2cell_5
## 11 2 2 2 2
## mid2cell_6 mid2cell_7 midblast_1 midblast_2 midblast_3
## 2 2 22 20 18
## zy_.
## 4
stopifnot(length(embryo_label) == NROW(omega))
# make annotation matrix for the plot of all tissues
# sample_id has to be unique
annotation <- data.frame(
sample_id = paste0("X", c(1:NROW(omega))),
tissue_label = factor(rownames(omega),
levels = rev( c("zy", "early2cell", "mid2cell", "late2cell",
"4cell", "8cell", "16cell", "earlyblast",
"midblast", "lateblast") ) ) )
# make annotation for early stage plot
# sample_id has to be unique
annotation_embryo <- data.frame(
sample_id = paste0("X", c(1:NROW(omega))),
tissue_label = factor(embryo_label,
levels = rev( c("zy_.",
paste("early2cell",c("0r", c(1:3)), sep = "_"),
paste("mid2cell",c("0r", c(3:7)), sep = "_"),
paste("late2cell",c("0r", c(5:9)), sep = "_"),
paste("4cell",c("0r", c(1:4)), sep = "_"),
paste("8cell",c("0r", c(1,2,5,8)), sep = "_"),
paste("16cell",c("0r", c(1,4,5,6)), sep = "_"),
paste("earlyblast",c("0r", c(2:4)), sep = "_"),
paste("midblast",c("0r", c(1:3)), sep = "_"),
paste("lateblast",c("0r", c(1:3)), sep = "_") ) ) ) )
# after extracting tissue type of each sample
# recode each sample to have unique rownames
rownames(omega) <- paste0("X", annotation$sample_id)
StructureGGplot(omega = omega,
annotation = annotation,
palette = RColorBrewer::brewer.pal(8, "Accent"),
figure_title = "",
yaxis_label = "Cell type",
sample_order_decreasing = FALSE,
axis_tick = list(axis_ticks_length = .1,
axis_ticks_lwd_y = .1,
axis_ticks_lwd_x = .1,
axis_label_size = 7,
axis_label_face = "bold"))