Objective

In this script, we perform the Grade of Membership model (GoM) on the dataset obtained from the study due to Deng . Check the paper.

The dataset has single cells sequenced starting from zygote through the 2 cell, 4 cell all the way up to blastocyst. We want to cluster the cells by development phase, also taking into account the information about the embryo of the origin of the cells and try to see which genes are actually driving the clusters in the development phase. These would be the genes that would also play a significant part in differentiating the various stages of the development.

rm(list=ls())
library(data.table)
#install_github('kkdey/maptpx') 
library(maptpx)
library(CountClust)
library(data.table)

Prepare the Data

files <- list.files("../external_data/Deng_Data/Deng_files/");

num <- 1
temp_data <- data.frame(fread(paste0('../external_data/Deng_Data/Deng_files/',files[num])));
gene_names <- temp_data$X.Gene_symbol;

reads_mat <- cbind.data.frame(gene_names);

for(num in 1:length(files))
{
  temp_data <- data.frame(fread(paste0('../data/Deng_data/Deng_files/',files[num])));
  reads_mat <- cbind.data.frame(reads_mat,temp_data$reads);
}

cell_meta <- unlist(lapply(files, function(x) strsplit(x,"_")[[1]][2]));
colnames(reads_mat) <- c("gene_names",files);
reads_no_dups <- reads_mat %>%
                  group_by(gene_names) %>%
                  summarise_each(funs(sum))

reads_no_dups <- data.frame(reads_no_dups)

gene_names_new <- reads_no_dups[,1]
reads_no_dups <- reads_no_dups[,-1];
rownames(reads_no_dups) <- gene_names_new;
colnames(reads_no_dups) <- cell_meta;
dim(reads_no_dups);

write.table(reads_no_dups,"../external_data/Deng_cell_data.txt");

Now we load the data.

Data and Meta-data Processing

reads <- data.frame(fread('../external_data/Deng_Data/Deng_cell_data.txt'),row.names=1);
files <- list.files("../external_data/Deng_Data/Deng_files/");
cell_meta <- unlist(lapply(files, function(x) strsplit(x,"_")[[1]][2]));
embryo_id <- unlist(lapply(files, function(x) strsplit(strsplit(x,"_")[[1]][3],"-")[[1]][1]));
cell_meta[grep("zy",cell_meta)]="zy";
cell_meta[grep("smartseq2", files)]="8cell_nd";
cell_meta[grep("8cell_2pooled", files)]="8cell_nd";
cell_meta[grep("8cell_split", files)]="8cell_nd";
cell_meta[grep("16cell_2pooled", files)]="16cell_nd";
cell_meta[grep("16cell_split", files)]="16cell_nd";
indices_not_reqd <- which(cell_meta=="BXC"   | cell_meta=="C57twocell" | cell_meta=="fibroblast" | cell_meta =="8cell_nd" | cell_meta == "16cell_nd");
cell_meta <- cell_meta[-indices_not_reqd];
embryo_id <- embryo_id[-indices_not_reqd];
embryo_id[which(embryo_id == "expression.txt")]="."
cell_embryo <- paste0(cell_meta,"_", embryo_id);
reads <- reads[,-indices_not_reqd];
cell_meta_unique <- c("zy","early2cell","mid2cell","late2cell","4cell","8cell","16cell","earlyblast","midblast","lateblast") ;
order_of_development <- order(match(cell_meta,cell_meta_unique))
reads <- reads[,order_of_development];
cell_meta <- cell_meta[order_of_development]
cell_embryo <- cell_embryo[order_of_development];
colnames(reads) <- cell_meta;
load("../rdas/deng_topic_fit.rda")

# extract the omega matrix: membership weights of each cell
names(Topic_clus_list)
## [1] "clust_2" "clust_3" "clust_4" "clust_5" "clust_6" "clust_7"
str(Topic_clus_list$clust_6)
## List of 6
##  $ K    : int 6
##  $ theta: num [1:22431, 1:6] 1.32e-05 7.60e-05 3.60e-05 1.19e-05 6.47e-05 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ phrase: chr [1:22431] "0610005C13Rik" "0610007C21Rik" "0610007L01Rik" "0610007P08Rik" ...
##   .. ..$ topic : chr [1:6] "1" "2" "3" "4" ...
##  $ omega: num [1:259, 1:6] 1.26e-06 2.47e-04 1.88e-04 9.90e-06 3.47e-07 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ document: chr [1:259] "zy" "zy" "zy" "zy" ...
##   .. ..$ topic   : chr [1:6] "1" "2" "3" "4" ...
##  $ BF   : NULL
##  $ D    :List of 3
##   ..$ dispersion: num 380
##   ..$ pvalue    : num 0
##   ..$ df        : num 4668431
##  $ X    :List of 6
##   ..$ i       : int [1:2307465] 5 13 14 15 16 18 19 20 21 22 ...
##   ..$ j       : int [1:2307465] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ v       : int [1:2307465] 2 46 376 130 84 48 77 147 48 16 ...
##   ..$ nrow    : int 259
##   ..$ ncol    : int 22431
##   ..$ dimnames:List of 2
##   .. ..$ : chr [1:259] "zy" "zy" "zy" "zy" ...
##   .. ..$ : chr [1:22431] "0610005C13Rik" "0610007C21Rik" "0610007L01Rik" "0610007P08Rik" ...
##   ..- attr(*, "class")= chr "simple_triplet_matrix"
##  - attr(*, "class")= chr "topics"
omega <- Topic_clus_list$clust_6$omega

# import embryonl labels
embryo_label <- read.table("../external_data/Deng_Data/cell_labels_phase_embryo.txt",
                           quote = "\"",
                           header = TRUE,
                           stringsAsFactors = FALSE)$x
head(embryo_label, 20)
##  [1] "zy_."          "zy_."          "zy_."          "zy_."         
##  [5] "early2cell_0r" "early2cell_0r" "early2cell_1"  "early2cell_1" 
##  [9] "early2cell_2"  "early2cell_2"  "early2cell_3"  "early2cell_3" 
## [13] "mid2cell_0r"   "mid2cell_0r"   "mid2cell_3"    "mid2cell_3"   
## [17] "mid2cell_4"    "mid2cell_4"    "mid2cell_5"    "mid2cell_5"
table(embryo_label)
## embryo_label
##      16cell_1      16cell_4      16cell_5      16cell_6       4cell_1 
##            14            11            13            12             3 
##       4cell_2       4cell_3       4cell_4       8cell_1       8cell_2 
##             4             3             4             7             7 
##       8cell_5       8cell_8 early2cell_0r  early2cell_1  early2cell_2 
##             7             7             2             2             2 
##  early2cell_3  earlyblast_2  earlyblast_3  earlyblast_4   late2cell_5 
##             2            15            15            13             2 
##   late2cell_6   late2cell_7   late2cell_8   late2cell_9   lateblast_1 
##             2             2             2             2            19 
##   lateblast_2   mid2cell_0r    mid2cell_3    mid2cell_4    mid2cell_5 
##            11             2             2             2             2 
##    mid2cell_6    mid2cell_7    midblast_1    midblast_2    midblast_3 
##             2             2            22            20            18 
##          zy_. 
##             4
stopifnot(length(embryo_label) == NROW(omega))


# make annotation matrix for the plot of all tissues
# sample_id has to be unique
annotation <- data.frame(
  sample_id = paste0("X", c(1:NROW(omega))),
  tissue_label = factor(rownames(omega),
                        levels = rev( c("zy", "early2cell", "mid2cell", "late2cell",
                                       "4cell", "8cell", "16cell", "earlyblast",
                                       "midblast", "lateblast") ) ) )

# make annotation for early stage plot
# sample_id has to be unique
annotation_embryo <- data.frame(
    sample_id = paste0("X", c(1:NROW(omega))),
    tissue_label = factor(embryo_label,
      levels = rev( c("zy_.",
                      paste("early2cell",c("0r", c(1:3)), sep = "_"),
                      paste("mid2cell",c("0r", c(3:7)), sep = "_"),
                      paste("late2cell",c("0r", c(5:9)), sep = "_"),
                      paste("4cell",c("0r", c(1:4)), sep = "_"),
                      paste("8cell",c("0r", c(1,2,5,8)), sep = "_"),
                      paste("16cell",c("0r", c(1,4,5,6)), sep = "_"),
                      paste("earlyblast",c("0r", c(2:4)), sep = "_"),
                      paste("midblast",c("0r", c(1:3)), sep = "_"),
                      paste("lateblast",c("0r", c(1:3)), sep = "_") ) ) ) )




# after extracting tissue type of each sample
# recode each sample to have unique rownames
rownames(omega) <- paste0("X", annotation$sample_id)

Structure plot of Deng data(K=6)

StructureGGplot(omega = omega,
               annotation = annotation,
               palette = RColorBrewer::brewer.pal(8, "Accent"),
               figure_title = "",
               yaxis_label = "Cell type",
               sample_order_decreasing = FALSE,
               axis_tick = list(axis_ticks_length = .1,
                                 axis_ticks_lwd_y = .1,
                                 axis_ticks_lwd_x = .1,
                                 axis_label_size = 7,
                                 axis_label_face = "bold"))