Overview

In this script, we explore the effectiveness of removing batch effects by the BatchCorrectedCounts() function of the package CountClust. We consider three simulation scenarios and then fit the topic model before and after removing the batch effects.

library(CountClust)
library(maptpx)
## Loading required package: slam

Simulation Scenario 1

We present the simulation mechanism for scenario 1.

K=4;
G=100;
N=200;
alpha_true=matrix(rnorm((K)*G,0.5,1),nrow=(K)); ### the matrix of fixed effects
Label.Batch=c(rep(1,N/2),rep(2,N/2));
B=max(Label.Batch);
sigmab_true=2;
beta_true=matrix(0,B,G);       ###  the matrix of the random effect
for(g in 1:G)
{
  beta_true[,g]=rnorm(B,mean=0,sd=sigmab_true);
}
library(gtools)
## 
## Attaching package: 'gtools'
## 
## The following object is masked from 'package:maptpx':
## 
##     logit
T=10;
omega_true=matrix(rbind(rdirichlet(T*4,c(3,4,2,6)),rdirichlet(T*4,c(1,4,6,3)),
                        rdirichlet(T*4,c(4,1,2,2)),rdirichlet(T*4,c(2,6,3,2)),
                        rdirichlet(T*4,c(3,3,5,4))), nrow=N);

over_dis=0.5;
noise_true=matrix(0,N,G);
for(n in 1:N)
{
  noise_true[n,]=over_dis*rnorm(G,0,1);
}

###  generating the table
read_counts=matrix(0,N,G);
for(n in 1:N)
{
  for(g in 1:G)
  {
    mean=exp(omega_true[n,]%*%alpha_true[,g] +beta_true[Label.Batch[n],g]+noise_true[n,g]);
    read_counts[n,g]=rpois(1,mean);
  }
}

True Structure Plot

barplot(t(omega_true),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

Topic model (Without Batch Correction)

Topic_clus <- topics(read_counts, K, tol=0.01);
## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 1987.11, 178.42, 75.88, 23.55, 15.87, 8.59, 5.06, 10.65, 36.37, 17.17, 10.32, 0.31, 0.17, 0.12, 0.08, 0.07, 0.06, 0.05, 0.04, 0.04, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.02, 0.01, 0.01, 0.01, done.
docweights <- Topic_clus$omega;

Estimated Structure plot (Without Batch correction)

barplot(t(docweights),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

We perform the batch correction

Batch corrected topic model fit

batch_counts <- BatchCorrectedCounts(read_counts, batch_lab = Label.Batch);

Topic_clus_2 <- topics(batch_counts, K, tol=0.01);
## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 12216.86, 5752.5, 3411.88, 1818.31, 1045.38, 715.9, 553.12, 467.42, 410.68, 364.63, 321.2, 277.67, 234.67, 194.5, 158.99, 128.91, 104.26, 84.51, 68.9, 56.63, 14.7, 2.6, 4.05, 7.92, 5.06, 3.5, 2.42, 1.69, 1.21, 0.89, 0.68, 0.53, 0.42, 0.34, 0.28, 0.24, 0.21, 0.19, 0.17, 0.15, 0.15, 0.14, 0.14, 0.14, 0.14, 0.13, 0.12, 0.11, 0.1, 0.09, 0.08, 0.08, 0.07, 0.06, 0.06, 0.05, 0.05, 0.04, 0.04, 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.01, done.
docweights_2 <- Topic_clus_2$omega;

Estimated Structure Plot (Batch correction)

library(permute);
## 
## Attaching package: 'permute'
## 
## The following object is masked from 'package:gtools':
## 
##     permute
library("BioPhysConnectoR");
## Loading required package: snow
## Loading required package: matrixcalc
perm_set=rbind(1:K,allPerms(1:K));
diff=array(0,dim(perm_set)[1]);
for (p in 1:dim(perm_set)[1])
{
  temp=docweights_2[,perm_set[p,]];
  diff[p]=fnorm(temp,omega_true);
}

p_star=which(diff==min(diff));
docweights_2=docweights_2[,perm_set[p_star,]];

barplot(t(docweights_2),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

Simulation Scenario 2

We now present simulation scenario 2

K=4;
G=100;
N=200;
alpha_true=matrix(rnorm((K)*G,0.5,1),nrow=(K)); ### the matrix of fixed effects
Label.Batch=c(rep(1,N/2),rep(2,N/2));
B=max(Label.Batch);
sigmab_true=2;
beta_true=matrix(0,B,G);       ###  the matrix of the random effect
for(g in 1:G)
{
  beta_true[,g]=rnorm(B,mean=0,sd=sigmab_true);
}
library(gtools)
T=10;
omega_true=matrix(rbind(rdirichlet(T*4,c(3,4,2,6)),rdirichlet(T*4,c(1,4,6,3)),
                        rdirichlet(T*4,c(4,1,2,2)),rdirichlet(T*4,c(2,6,3,2)),
                        rdirichlet(T*4,c(3,3,5,4))), nrow=N);

over_dis=0.5;
noise_true=matrix(0,N,G);
for(n in 1:N)
{
  noise_true[n,]=over_dis*rnorm(G,0,1);
}

###  generating the table
read_counts=matrix(0,N,G);
for(n in 1:N)
{
  for(g in 1:G)
  {
    mean=omega_true[n,]%*%exp(alpha_true[,g] +beta_true[Label.Batch[n],g]+noise_true[n,g]);
    read_counts[n,g]=rpois(1,mean);
  }
}

True Structure Plot

barplot(t(omega_true),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

Topic model (Without Batch Correction)

Topic_clus <- topics(read_counts, K, tol=0.01);
## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 2259.59, 846.7, 56.75, 14.84, 8.54, 7.46, 7.58, 7.07, 6.99, 6.07, 3.51, 1.1, 0.53, 0.57, 0.83, 1.26, 0.88, 0.56, 0.7, 0.63, 0.24, 0.25, 0.19, 0.08, 0.05, 0.04, 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, done.
docweights <- Topic_clus$omega;

Estimated Structure plot (Without Batch correction)

barplot(t(docweights),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

We perform the batch correction

Batch corrected topic model fit

batch_counts <- BatchCorrectedCounts(read_counts, batch_lab = Label.Batch);

Topic_clus_2 <- topics(batch_counts, K, tol=0.01);
## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 41679.1, 10675.28, 2724.22, 1042.06, 570.95, 394.75, 311.2, 265.74, 238.51, 222.12, 212.22, 206.13, 202.53, 200.41, 198.89, 197.21, 194.7, 190.9, 185.6, 178.39, 168.68, 158.22, 147.56, 137.14, 127.28, 118.27, 110.33, 103.55, 181.9, 45.2, 42, 39.07, 36.35, 33.84, 31.53, 29.42, 26.58, 21.02, 16.95, 13.61, 11, 8.93, 7.24, 5.74, 4.24, 3.06, 2.56, 2.13, 1.74, 1.38, 1.05, 0.76, 0.52, 0.35, 0.23, 0.16, 0.17, 0.13, 0.12, 0.09, 0.07, 0.05, 0.05, 0.04, 0.03, 0.03, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, done.
docweights_2 <- Topic_clus_2$omega;

Estimated Structure Plot (Batch correction)

library(permute);
library("BioPhysConnectoR");
perm_set=rbind(1:K,allPerms(1:K));
diff=array(0,dim(perm_set)[1]);
for (p in 1:dim(perm_set)[1])
{
  temp=docweights_2[,perm_set[p,]];
  diff[p]=fnorm(temp,omega_true);
}

p_star=which(diff==min(diff));
docweights_2=docweights_2[,perm_set[p_star,]];

barplot(t(docweights_2),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

Simualtion Scenario 3

We present the simulation scenario 3.

K=4;
G=100;
N=200;

Label.Batch=c(rep(1,N/2),rep(2,N/2));
B=max(Label.Batch);
sigmab_true=2;
beta_true=matrix(0,B,G);       ###  the matrix of the random effect
for(g in 1:G)
{
  beta_true[,g]=rnorm(B,mean=0,sd=sigmab_true);
}
library(gtools)
T=10;
omega_true=matrix(rbind(rdirichlet(T*4,c(3,4,2,6)),rdirichlet(T*4,c(1,4,6,3)),
                        rdirichlet(T*4,c(4,1,2,2)),rdirichlet(T*4,c(2,6,3,2)),
                        rdirichlet(T*4,c(3,3,5,4))), nrow=N);

theta_b_true <- rbind(rdirichlet(2, c(0.2, 0.2, rep(0.6/G,G-2))),
                           rdirichlet(2, c(rep(0.5/G,G-3),0.3,0.1,0.1)),
                           rdirichlet(2, c(rep(0.4/G, G/2 -2),0.2,0.1,rep(0.3/G, G/2))),
                           rdirichlet(2, c(rep(0.6/G,G-2), 0.2, 0.2)));

read_counts <- t(do.call(cbind,lapply(1:dim(omega_true)[1], function(x)
                                                        {
                                                            if(Label.Batch[x]==1)
                                                              out <- rmultinom(1,1000,prob=omega_true[x,]%*%theta_b_true[c(2,4,6,8),]);
                                                            if(Label.Batch[x]==2)
                                                              out <- rmultinom(1,1000,prob=omega_true[x,]%*%theta_b_true[c(1,3,5,7),]);
                                                            return(out)
                                                             }  )));

True Structure Plot

barplot(t(omega_true),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

Topic model (Without Batch Correction)

Topic_clus <- topics(read_counts, K, tol=0.01);
## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 3854.71, 39.41, 10.6, 4.79, 2.65, 0.93, 0.25, 0.2, 0.19, 0.22, 0.26, 0.21, 0.12, 0.07, 0.04, 0.03, 0.03, 0.03, done.
docweights <- Topic_clus$omega;

Estimated Structure plot (Without Batch correction)

barplot(t(docweights),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

We perform the batch correction

Batch corrected topic model fit

batch_counts <- BatchCorrectedCounts(read_counts, batch_lab = Label.Batch);

Topic_clus_2 <- topics(batch_counts, K, tol=0.01);
## 
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 10926.46, 5781.6, 1333.56, 597.02, 361.76, 263.73, 214.12, 184.62, 165.47, 152.31, 142.65, 134.91, 128.81, 123.88, 119.64, 115.9, 112.65, 109.78, 107.13, 104.56, 101.93, 99.09, 95.96, 92.47, 88.66, 84.71, 61.36, 58.05, 54.9, 51.9, 49.07, 29.44, 28.2, 26.44, 24.73, 23.08, 12.08, 11.96, 12.67, 16.33, 15.51, 13.07, 16.29, 19.42, 22.29, 24.86, 27.65, 31.7, 38.62, 52.54, 77.61, 76.84, 49.99, 37.38, 37.16, 42.43, 41.83, 41.97, 43.28, 44.79, 44.24, 38.33, 28.24, 19.53, 14.47, 15.23, 11.52, 11, 7.55, 6.05, 5.18, 4.67, 4.36, 4, 3.5, 2.99, 2.6, 2.3, 2.06, 1.88, 1.74, 1.65, 1.58, 1.53, 1.48, 1.42, 1.36, 1.3, 1.22, 1.15, 1.07, 0.99, 0.92, 0.84, 0.78, 0.73, 0.69, 0.67, 0.66, 0.66, p 100 iter 1000 diff 1
## 0.67, 0.68, 0.7, 0.71, 0.73, 0.74, 0.74, 0.73, 0.72, 0.7, 0.67, 0.63, 0.59, 0.55, 0.5, 0.46, 0.42, 0.39, 0.37, 0.35, 0.34, 0.35, 0.35, 0.36, 0.34, 0.31, 0.27, 0.24, 0.23, 0.24, 0.25, 0.26, 0.27, 0.29, 0.36, 0.63, 0.73, 0.51, 0.38, 0.3, 0.24, 0.2, 0.17, 0.15, 0.13, 0.12, 0.11, 0.1, 0.09, 0.08, 0.07, 0.07, 0.06, 0.06, 0.05, 0.05, 0.05, 0.04, 0.04, 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, done.
docweights_2 <- Topic_clus_2$omega;

Estimated Structure Plot (Batch correction)

library(permute);
library("BioPhysConnectoR");
perm_set=rbind(1:K,allPerms(1:K));
diff=array(0,dim(perm_set)[1]);
for (p in 1:dim(perm_set)[1])
{
  temp=docweights_2[,perm_set[p,]];
  diff[p]=fnorm(temp,omega_true);
}

p_star=which(diff==min(diff));
docweights_2=docweights_2[,perm_set[p_star,]];

barplot(t(docweights_2),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))

Conclusions

It seems that although the batch correction does improve on model fitting and seem to remove the batch effects problem, but the model fit after batch correction and corresponding Structure plot representation does not correspond very accurately to the true Structure plot representation.