Batch effects exploration

Overview

In this script, we explore the effectiveness of removing batch effects by the BatchCorrectedCounts() function of the package CountClust. We consider three simulation scenarios and then fit the topic model before and after removing the batch effects.

library(CountClust)
library(maptpx)

## Loading required package: slam

Simulation Scenario 1

We present the simulation mechanism for scenario 1.

K=4;
G=100;
N=200;
alpha_true=matrix(rnorm((K)*G,0.5,1),nrow=(K)); ### the matrix of fixed effects
Label.Batch=c(rep(1,N/2),rep(2,N/2));
B=max(Label.Batch);
sigmab_true=2;
beta_true=matrix(0,B,G);       ###  the matrix of the random effect
for(g in 1:G)
{
  beta_true[,g]=rnorm(B,mean=0,sd=sigmab_true);
}
library(gtools)

## 
## Attaching package: 'gtools'
## 
## The following object is masked from 'package:maptpx':
## 
##     logit

T=10;
omega_true=matrix(rbind(rdirichlet(T*4,c(3,4,2,6)),rdirichlet(T*4,c(1,4,6,3)),
                        rdirichlet(T*4,c(4,1,2,2)),rdirichlet(T*4,c(2,6,3,2)),
                        rdirichlet(T*4,c(3,3,5,4))), nrow=N);

over_dis=0.5;
noise_true=matrix(0,N,G);
for(n in 1:N)
{
  noise_true[n,]=over_dis*rnorm(G,0,1);
}

###  generating the table
read_counts=matrix(0,N,G);
for(n in 1:N)
{
  for(g in 1:G)
  {
    mean=exp(omega_true[n,]%*%alpha_true[,g] +beta_true[Label.Batch[n],g]+noise_true[n,g]);
    read_counts[n,g]=rpois(1,mean);
  }
}

True Structure Plot

barplot(t(omega_true),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

Topic model (Without Batch Correction)

Topic_clus <- topics(read_counts, K, tol=0.01);

## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 1987.11, 178.42, 75.88, 23.55, 15.87, 8.59, 5.06, 10.65, 36.37, 17.17, 10.32, 0.31, 0.17, 0.12, 0.08, 0.07, 0.06, 0.05, 0.04, 0.04, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.02, 0.01, 0.01, 0.01, done.

docweights <- Topic_clus$omega;

Estimated Structure plot (Without Batch correction)

barplot(t(docweights),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

We perform the batch correction

Batch corrected topic model fit

batch_counts <- BatchCorrectedCounts(read_counts, batch_lab = Label.Batch);

Topic_clus_2 <- topics(batch_counts, K, tol=0.01);

## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 12216.86, 5752.5, 3411.88, 1818.31, 1045.38, 715.9, 553.12, 467.42, 410.68, 364.63, 321.2, 277.67, 234.67, 194.5, 158.99, 128.91, 104.26, 84.51, 68.9, 56.63, 14.7, 2.6, 4.05, 7.92, 5.06, 3.5, 2.42, 1.69, 1.21, 0.89, 0.68, 0.53, 0.42, 0.34, 0.28, 0.24, 0.21, 0.19, 0.17, 0.15, 0.15, 0.14, 0.14, 0.14, 0.14, 0.13, 0.12, 0.11, 0.1, 0.09, 0.08, 0.08, 0.07, 0.06, 0.06, 0.05, 0.05, 0.04, 0.04, 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.01, done.

docweights_2 <- Topic_clus_2$omega;

Estimated Structure Plot (Batch correction)

library(permute);

## 
## Attaching package: 'permute'
## 
## The following object is masked from 'package:gtools':
## 
##     permute

library("BioPhysConnectoR");

## Loading required package: snow
## Loading required package: matrixcalc

perm_set=rbind(1:K,allPerms(1:K));
diff=array(0,dim(perm_set)[1]);
for (p in 1:dim(perm_set)[1])
{
  temp=docweights_2[,perm_set[p,]];
  diff[p]=fnorm(temp,omega_true);
}

p_star=which(diff==min(diff));
docweights_2=docweights_2[,perm_set[p_star,]];

barplot(t(docweights_2),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

Simulation Scenario 2

We now present simulation scenario 2

K=4;
G=100;
N=200;
alpha_true=matrix(rnorm((K)*G,0.5,1),nrow=(K)); ### the matrix of fixed effects
Label.Batch=c(rep(1,N/2),rep(2,N/2));
B=max(Label.Batch);
sigmab_true=2;
beta_true=matrix(0,B,G);       ###  the matrix of the random effect
for(g in 1:G)
{
  beta_true[,g]=rnorm(B,mean=0,sd=sigmab_true);
}
library(gtools)
T=10;
omega_true=matrix(rbind(rdirichlet(T*4,c(3,4,2,6)),rdirichlet(T*4,c(1,4,6,3)),
                        rdirichlet(T*4,c(4,1,2,2)),rdirichlet(T*4,c(2,6,3,2)),
                        rdirichlet(T*4,c(3,3,5,4))), nrow=N);

over_dis=0.5;
noise_true=matrix(0,N,G);
for(n in 1:N)
{
  noise_true[n,]=over_dis*rnorm(G,0,1);
}

###  generating the table
read_counts=matrix(0,N,G);
for(n in 1:N)
{
  for(g in 1:G)
  {
    mean=omega_true[n,]%*%exp(alpha_true[,g] +beta_true[Label.Batch[n],g]+noise_true[n,g]);
    read_counts[n,g]=rpois(1,mean);
  }
}

True Structure Plot

barplot(t(omega_true),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

Topic model (Without Batch Correction)

Topic_clus <- topics(read_counts, K, tol=0.01);

## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 2259.59, 846.7, 56.75, 14.84, 8.54, 7.46, 7.58, 7.07, 6.99, 6.07, 3.51, 1.1, 0.53, 0.57, 0.83, 1.26, 0.88, 0.56, 0.7, 0.63, 0.24, 0.25, 0.19, 0.08, 0.05, 0.04, 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, done.

docweights <- Topic_clus$omega;

Estimated Structure plot (Without Batch correction)

barplot(t(docweights),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

We perform the batch correction

Batch corrected topic model fit

batch_counts <- BatchCorrectedCounts(read_counts, batch_lab = Label.Batch);

Topic_clus_2 <- topics(batch_counts, K, tol=0.01);

## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 41679.1, 10675.28, 2724.22, 1042.06, 570.95, 394.75, 311.2, 265.74, 238.51, 222.12, 212.22, 206.13, 202.53, 200.41, 198.89, 197.21, 194.7, 190.9, 185.6, 178.39, 168.68, 158.22, 147.56, 137.14, 127.28, 118.27, 110.33, 103.55, 181.9, 45.2, 42, 39.07, 36.35, 33.84, 31.53, 29.42, 26.58, 21.02, 16.95, 13.61, 11, 8.93, 7.24, 5.74, 4.24, 3.06, 2.56, 2.13, 1.74, 1.38, 1.05, 0.76, 0.52, 0.35, 0.23, 0.16, 0.17, 0.13, 0.12, 0.09, 0.07, 0.05, 0.05, 0.04, 0.03, 0.03, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, done.

docweights_2 <- Topic_clus_2$omega;

Estimated Structure Plot (Batch correction)

library(permute);
library("BioPhysConnectoR");
perm_set=rbind(1:K,allPerms(1:K));
diff=array(0,dim(perm_set)[1]);
for (p in 1:dim(perm_set)[1])
{
  temp=docweights_2[,perm_set[p,]];
  diff[p]=fnorm(temp,omega_true);
}

p_star=which(diff==min(diff));
docweights_2=docweights_2[,perm_set[p_star,]];

barplot(t(docweights_2),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

Simualtion Scenario 3

We present the simulation scenario 3.

K=4;
G=100;
N=200;

Label.Batch=c(rep(1,N/2),rep(2,N/2));
B=max(Label.Batch);
sigmab_true=2;
beta_true=matrix(0,B,G);       ###  the matrix of the random effect
for(g in 1:G)
{
  beta_true[,g]=rnorm(B,mean=0,sd=sigmab_true);
}
library(gtools)
T=10;
omega_true=matrix(rbind(rdirichlet(T*4,c(3,4,2,6)),rdirichlet(T*4,c(1,4,6,3)),
                        rdirichlet(T*4,c(4,1,2,2)),rdirichlet(T*4,c(2,6,3,2)),
                        rdirichlet(T*4,c(3,3,5,4))), nrow=N);

theta_b_true <- rbind(rdirichlet(2, c(0.2, 0.2, rep(0.6/G,G-2))),
                           rdirichlet(2, c(rep(0.5/G,G-3),0.3,0.1,0.1)),
                           rdirichlet(2, c(rep(0.4/G, G/2 -2),0.2,0.1,rep(0.3/G, G/2))),
                           rdirichlet(2, c(rep(0.6/G,G-2), 0.2, 0.2)));

read_counts <- t(do.call(cbind,lapply(1:dim(omega_true)[1], function(x)
                                                        {
                                                            if(Label.Batch[x]==1)
                                                              out <- rmultinom(1,1000,prob=omega_true[x,]%*%theta_b_true[c(2,4,6,8),]);
                                                            if(Label.Batch[x]==2)
                                                              out <- rmultinom(1,1000,prob=omega_true[x,]%*%theta_b_true[c(1,3,5,7),]);
                                                            return(out)
                                                             }  )));

True Structure Plot

barplot(t(omega_true),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

Topic model (Without Batch Correction)

Topic_clus <- topics(read_counts, K, tol=0.01);

## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 3854.71, 39.41, 10.6, 4.79, 2.65, 0.93, 0.25, 0.2, 0.19, 0.22, 0.26, 0.21, 0.12, 0.07, 0.04, 0.03, 0.03, 0.03, done.

docweights <- Topic_clus$omega;

Estimated Structure plot (Without Batch correction)

barplot(t(docweights),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

We perform the batch correction

Batch corrected topic model fit

batch_counts <- BatchCorrectedCounts(read_counts, batch_lab = Label.Batch);

Topic_clus_2 <- topics(batch_counts, K, tol=0.01);

## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 10926.46, 5781.6, 1333.56, 597.02, 361.76, 263.73, 214.12, 184.62, 165.47, 152.31, 142.65, 134.91, 128.81, 123.88, 119.64, 115.9, 112.65, 109.78, 107.13, 104.56, 101.93, 99.09, 95.96, 92.47, 88.66, 84.71, 61.36, 58.05, 54.9, 51.9, 49.07, 29.44, 28.2, 26.44, 24.73, 23.08, 12.08, 11.96, 12.67, 16.33, 15.51, 13.07, 16.29, 19.42, 22.29, 24.86, 27.65, 31.7, 38.62, 52.54, 77.61, 76.84, 49.99, 37.38, 37.16, 42.43, 41.83, 41.97, 43.28, 44.79, 44.24, 38.33, 28.24, 19.53, 14.47, 15.23, 11.52, 11, 7.55, 6.05, 5.18, 4.67, 4.36, 4, 3.5, 2.99, 2.6, 2.3, 2.06, 1.88, 1.74, 1.65, 1.58, 1.53, 1.48, 1.42, 1.36, 1.3, 1.22, 1.15, 1.07, 0.99, 0.92, 0.84, 0.78, 0.73, 0.69, 0.67, 0.66, 0.66, p 100 iter 1000 diff 1
## 0.67, 0.68, 0.7, 0.71, 0.73, 0.74, 0.74, 0.73, 0.72, 0.7, 0.67, 0.63, 0.59, 0.55, 0.5, 0.46, 0.42, 0.39, 0.37, 0.35, 0.34, 0.35, 0.35, 0.36, 0.34, 0.31, 0.27, 0.24, 0.23, 0.24, 0.25, 0.26, 0.27, 0.29, 0.36, 0.63, 0.73, 0.51, 0.38, 0.3, 0.24, 0.2, 0.17, 0.15, 0.13, 0.12, 0.11, 0.1, 0.09, 0.08, 0.07, 0.07, 0.06, 0.06, 0.05, 0.05, 0.05, 0.04, 0.04, 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, done.

docweights_2 <- Topic_clus_2$omega;

Estimated Structure Plot (Batch correction)

library(permute);
library("BioPhysConnectoR");
perm_set=rbind(1:K,allPerms(1:K));
diff=array(0,dim(perm_set)[1]);
for (p in 1:dim(perm_set)[1])
{
  temp=docweights_2[,perm_set[p,]];
  diff[p]=fnorm(temp,omega_true);
}

p_star=which(diff==min(diff));
docweights_2=docweights_2[,perm_set[p_star,]];

barplot(t(docweights_2),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

Conclusions

It seems that although the batch correction does improve on model fitting and seem to remove the batch effects problem, but the model fit after batch correction and corresponding Structure plot representation does not correspond very accurately to the true Structure plot representation.

Batch effects exploration

Kushal K Dey

February 4, 2016

Overview

Simulation Scenario 1

True Structure Plot

Topic model (Without Batch Correction)

Estimated Structure plot (Without Batch correction)

Batch corrected topic model fit

Estimated Structure Plot (Batch correction)

Simulation Scenario 2

True Structure Plot

Topic model (Without Batch Correction)

Estimated Structure plot (Without Batch correction)

Batch corrected topic model fit

Estimated Structure Plot (Batch correction)

Simualtion Scenario 3

True Structure Plot

Topic model (Without Batch Correction)

Estimated Structure plot (Without Batch correction)

Batch corrected topic model fit

Estimated Structure Plot (Batch correction)

Conclusions