Overview

In this script, we explore the effectiveness of removing batch effects by the BatchCorrectedCounts() function of the package CountClust. We consider three simulation scenarios and then fit the topic model before and after removing the batch effects.

library(CountClust)
library(maptpx)
## Loading required package: slam

Simulation Scenario 1

We present the simulation mechanism for scenario 1.

K=4;
G=100;
N=200;
alpha_true=matrix(rnorm((K)*G,0.5,1),nrow=(K)); ### the matrix of fixed effects
Label.Batch=c(rep(1,N/2),rep(2,N/2));
B=max(Label.Batch);
sigmab_true=2;
beta_true=matrix(0,B,G);       ###  the matrix of the random effect
for(g in 1:G)
{
  beta_true[,g]=rnorm(B,mean=0,sd=sigmab_true);
}
library(gtools)
## 
## Attaching package: 'gtools'
## 
## The following object is masked from 'package:maptpx':
## 
##     logit
T=10;
omega_true=matrix(rbind(rdirichlet(T*4,c(3,4,2,6)),rdirichlet(T*4,c(1,4,6,3)),
                        rdirichlet(T*4,c(4,1,2,2)),rdirichlet(T*4,c(2,6,3,2)),
                        rdirichlet(T*4,c(3,3,5,4))), nrow=N);

over_dis=0.5;
noise_true=matrix(0,N,G);
for(n in 1:N)
{
  noise_true[n,]=over_dis*rnorm(G,0,1);
}

###  generating the table
read_counts=matrix(0,N,G);
for(n in 1:N)
{
  for(g in 1:G)
  {
    mean=exp(omega_true[n,]%*%alpha_true[,g] +beta_true[Label.Batch[n],g]+noise_true[n,g]);
    read_counts[n,g]=rpois(1,mean);
  }
}

True Structure Plot

barplot(t(omega_true),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

Topic model (Without Batch Correction)

Topic_clus <- topics(read_counts, K, tol=0.01);
## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 1987.11, 178.42, 75.88, 23.55, 15.87, 8.59, 5.06, 10.65, 36.37, 17.17, 10.32, 0.31, 0.17, 0.12, 0.08, 0.07, 0.06, 0.05, 0.04, 0.04, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.02, 0.01, 0.01, 0.01, done.
docweights <- Topic_clus$omega;

Estimated Structure plot (Without Batch correction)

barplot(t(docweights),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

We perform the batch correction

Batch corrected topic model fit

batch_counts <- BatchCorrectedCounts(read_counts, batch_lab = Label.Batch);

Topic_clus_2 <- topics(batch_counts, K, tol=0.01);
## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 12216.86, 5752.5, 3411.88, 1818.31, 1045.38, 715.9, 553.12, 467.42, 410.68, 364.63, 321.2, 277.67, 234.67, 194.5, 158.99, 128.91, 104.26, 84.51, 68.9, 56.63, 14.7, 2.6, 4.05, 7.92, 5.06, 3.5, 2.42, 1.69, 1.21, 0.89, 0.68, 0.53, 0.42, 0.34, 0.28, 0.24, 0.21, 0.19, 0.17, 0.15, 0.15, 0.14, 0.14, 0.14, 0.14, 0.13, 0.12, 0.11, 0.1, 0.09, 0.08, 0.08, 0.07, 0.06, 0.06, 0.05, 0.05, 0.04, 0.04, 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.01, done.
docweights_2 <- Topic_clus_2$omega;

Estimated Structure Plot (Batch correction)

library(permute);
## 
## Attaching package: 'permute'
## 
## The following object is masked from 'package:gtools':
## 
##     permute
library("BioPhysConnectoR");
## Loading required package: snow
## Loading required package: matrixcalc
perm_set=rbind(1:K,allPerms(1:K));
diff=array(0,dim(perm_set)[1]);
for (p in 1:dim(perm_set)[1])
{
  temp=docweights_2[,perm_set[p,]];
  diff[p]=fnorm(temp,omega_true);
}

p_star=which(diff==min(diff));
docweights_2=docweights_2[,perm_set[p_star,]];

barplot(t(docweights_2),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

Simulation Scenario 2

We now present simulation scenario 2

K=4;
G=100;
N=200;
alpha_true=matrix(rnorm((K)*G,0.5,1),nrow=(K)); ### the matrix of fixed effects
Label.Batch=c(rep(1,N/2),rep(2,N/2));
B=max(Label.Batch);
sigmab_true=2;
beta_true=matrix(0,B,G);       ###  the matrix of the random effect
for(g in 1:G)
{
  beta_true[,g]=rnorm(B,mean=0,sd=sigmab_true);
}
library(gtools)
T=10;
omega_true=matrix(rbind(rdirichlet(T*4,c(3,4,2,6)),rdirichlet(T*4,c(1,4,6,3)),
                        rdirichlet(T*4,c(4,1,2,2)),rdirichlet(T*4,c(2,6,3,2)),
                        rdirichlet(T*4,c(3,3,5,4))), nrow=N);

over_dis=0.5;
noise_true=matrix(0,N,G);
for(n in 1:N)
{
  noise_true[n,]=over_dis*rnorm(G,0,1);
}

###  generating the table
read_counts=matrix(0,N,G);
for(n in 1:N)
{
  for(g in 1:G)
  {
    mean=omega_true[n,]%*%exp(alpha_true[,g] +beta_true[Label.Batch[n],g]+noise_true[n,g]);
    read_counts[n,g]=rpois(1,mean);
  }
}

True Structure Plot

barplot(t(omega_true),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

Topic model (Without Batch Correction)

Topic_clus <- topics(read_counts, K, tol=0.01);
## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 2259.59, 846.7, 56.75, 14.84, 8.54, 7.46, 7.58, 7.07, 6.99, 6.07, 3.51, 1.1, 0.53, 0.57, 0.83, 1.26, 0.88, 0.56, 0.7, 0.63, 0.24, 0.25, 0.19, 0.08, 0.05, 0.04, 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, done.
docweights <- Topic_clus$omega;

Estimated Structure plot (Without Batch correction)

barplot(t(docweights),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

We perform the batch correction

Batch corrected topic model fit

batch_counts <- BatchCorrectedCounts(read_counts, batch_lab = Label.Batch);

Topic_clus_2 <- topics(batch_counts, K, tol=0.01);
## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 41679.1, 10675.28, 2724.22, 1042.06, 570.95, 394.75, 311.2, 265.74, 238.51, 222.12, 212.22, 206.13, 202.53, 200.41, 198.89, 197.21, 194.7, 190.9, 185.6, 178.39, 168.68, 158.22, 147.56, 137.14, 127.28, 118.27, 110.33, 103.55, 181.9, 45.2, 42, 39.07, 36.35, 33.84, 31.53, 29.42, 26.58, 21.02, 16.95, 13.61, 11, 8.93, 7.24, 5.74, 4.24, 3.06, 2.56, 2.13, 1.74, 1.38, 1.05, 0.76, 0.52, 0.35, 0.23, 0.16, 0.17, 0.13, 0.12, 0.09, 0.07, 0.05, 0.05, 0.04, 0.03, 0.03, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, done.
docweights_2 <- Topic_clus_2$omega;

Estimated Structure Plot (Batch correction)

library(permute);
library("BioPhysConnectoR");
perm_set=rbind(1:K,allPerms(1:K));
diff=array(0,dim(perm_set)[1]);
for (p in 1:dim(perm_set)[1])
{
  temp=docweights_2[,perm_set[p,]];
  diff[p]=fnorm(temp,omega_true);
}

p_star=which(diff==min(diff));
docweights_2=docweights_2[,perm_set[p_star,]];

barplot(t(docweights_2),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))