In this script, we explore the effectiveness of removing batch effects by the BatchCorrectedCounts()
function of the package CountClust
. We consider three simulation scenarios and then fit the topic model before and after removing the batch effects.
library(CountClust)
library(maptpx)
## Loading required package: slam
We present the simulation mechanism for scenario 1.
K=4;
G=100;
N=200;
alpha_true=matrix(rnorm((K)*G,0.5,1),nrow=(K)); ### the matrix of fixed effects
Label.Batch=c(rep(1,N/2),rep(2,N/2));
B=max(Label.Batch);
sigmab_true=2;
beta_true=matrix(0,B,G); ### the matrix of the random effect
for(g in 1:G)
{
beta_true[,g]=rnorm(B,mean=0,sd=sigmab_true);
}
library(gtools)
##
## Attaching package: 'gtools'
##
## The following object is masked from 'package:maptpx':
##
## logit
T=10;
omega_true=matrix(rbind(rdirichlet(T*4,c(3,4,2,6)),rdirichlet(T*4,c(1,4,6,3)),
rdirichlet(T*4,c(4,1,2,2)),rdirichlet(T*4,c(2,6,3,2)),
rdirichlet(T*4,c(3,3,5,4))), nrow=N);
over_dis=0.5;
noise_true=matrix(0,N,G);
for(n in 1:N)
{
noise_true[n,]=over_dis*rnorm(G,0,1);
}
### generating the table
read_counts=matrix(0,N,G);
for(n in 1:N)
{
for(g in 1:G)
{
mean=exp(omega_true[n,]%*%alpha_true[,g] +beta_true[Label.Batch[n],g]+noise_true[n,g]);
read_counts[n,g]=rpois(1,mean);
}
}
barplot(t(omega_true),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))
Topic_clus <- topics(read_counts, K, tol=0.01);
##
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 1987.11, 178.42, 75.88, 23.55, 15.87, 8.59, 5.06, 10.65, 36.37, 17.17, 10.32, 0.31, 0.17, 0.12, 0.08, 0.07, 0.06, 0.05, 0.04, 0.04, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.02, 0.01, 0.01, 0.01, done.
docweights <- Topic_clus$omega;
barplot(t(docweights),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))
We perform the batch correction
batch_counts <- BatchCorrectedCounts(read_counts, batch_lab = Label.Batch);
Topic_clus_2 <- topics(batch_counts, K, tol=0.01);
##
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 12216.86, 5752.5, 3411.88, 1818.31, 1045.38, 715.9, 553.12, 467.42, 410.68, 364.63, 321.2, 277.67, 234.67, 194.5, 158.99, 128.91, 104.26, 84.51, 68.9, 56.63, 14.7, 2.6, 4.05, 7.92, 5.06, 3.5, 2.42, 1.69, 1.21, 0.89, 0.68, 0.53, 0.42, 0.34, 0.28, 0.24, 0.21, 0.19, 0.17, 0.15, 0.15, 0.14, 0.14, 0.14, 0.14, 0.13, 0.12, 0.11, 0.1, 0.09, 0.08, 0.08, 0.07, 0.06, 0.06, 0.05, 0.05, 0.04, 0.04, 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.01, done.
docweights_2 <- Topic_clus_2$omega;
library(permute);
##
## Attaching package: 'permute'
##
## The following object is masked from 'package:gtools':
##
## permute
library("BioPhysConnectoR");
## Loading required package: snow
## Loading required package: matrixcalc
perm_set=rbind(1:K,allPerms(1:K));
diff=array(0,dim(perm_set)[1]);
for (p in 1:dim(perm_set)[1])
{
temp=docweights_2[,perm_set[p,]];
diff[p]=fnorm(temp,omega_true);
}
p_star=which(diff==min(diff));
docweights_2=docweights_2[,perm_set[p_star,]];
barplot(t(docweights_2),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))
We now present simulation scenario 2
K=4;
G=100;
N=200;
alpha_true=matrix(rnorm((K)*G,0.5,1),nrow=(K)); ### the matrix of fixed effects
Label.Batch=c(rep(1,N/2),rep(2,N/2));
B=max(Label.Batch);
sigmab_true=2;
beta_true=matrix(0,B,G); ### the matrix of the random effect
for(g in 1:G)
{
beta_true[,g]=rnorm(B,mean=0,sd=sigmab_true);
}
library(gtools)
T=10;
omega_true=matrix(rbind(rdirichlet(T*4,c(3,4,2,6)),rdirichlet(T*4,c(1,4,6,3)),
rdirichlet(T*4,c(4,1,2,2)),rdirichlet(T*4,c(2,6,3,2)),
rdirichlet(T*4,c(3,3,5,4))), nrow=N);
over_dis=0.5;
noise_true=matrix(0,N,G);
for(n in 1:N)
{
noise_true[n,]=over_dis*rnorm(G,0,1);
}
### generating the table
read_counts=matrix(0,N,G);
for(n in 1:N)
{
for(g in 1:G)
{
mean=omega_true[n,]%*%exp(alpha_true[,g] +beta_true[Label.Batch[n],g]+noise_true[n,g]);
read_counts[n,g]=rpois(1,mean);
}
}
barplot(t(omega_true),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))
Topic_clus <- topics(read_counts, K, tol=0.01);
##
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 2259.59, 846.7, 56.75, 14.84, 8.54, 7.46, 7.58, 7.07, 6.99, 6.07, 3.51, 1.1, 0.53, 0.57, 0.83, 1.26, 0.88, 0.56, 0.7, 0.63, 0.24, 0.25, 0.19, 0.08, 0.05, 0.04, 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, done.
docweights <- Topic_clus$omega;
barplot(t(docweights),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))
We perform the batch correction
batch_counts <- BatchCorrectedCounts(read_counts, batch_lab = Label.Batch);
Topic_clus_2 <- topics(batch_counts, K, tol=0.01);
##
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 41679.1, 10675.28, 2724.22, 1042.06, 570.95, 394.75, 311.2, 265.74, 238.51, 222.12, 212.22, 206.13, 202.53, 200.41, 198.89, 197.21, 194.7, 190.9, 185.6, 178.39, 168.68, 158.22, 147.56, 137.14, 127.28, 118.27, 110.33, 103.55, 181.9, 45.2, 42, 39.07, 36.35, 33.84, 31.53, 29.42, 26.58, 21.02, 16.95, 13.61, 11, 8.93, 7.24, 5.74, 4.24, 3.06, 2.56, 2.13, 1.74, 1.38, 1.05, 0.76, 0.52, 0.35, 0.23, 0.16, 0.17, 0.13, 0.12, 0.09, 0.07, 0.05, 0.05, 0.04, 0.03, 0.03, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, done.
docweights_2 <- Topic_clus_2$omega;
library(permute);
library("BioPhysConnectoR");
perm_set=rbind(1:K,allPerms(1:K));
diff=array(0,dim(perm_set)[1]);
for (p in 1:dim(perm_set)[1])
{
temp=docweights_2[,perm_set[p,]];
diff[p]=fnorm(temp,omega_true);
}
p_star=which(diff==min(diff));
docweights_2=docweights_2[,perm_set[p_star,]];
barplot(t(docweights_2),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))
We present the simulation scenario 3.
K=4;
G=100;
N=200;
Label.Batch=c(rep(1,N/2),rep(2,N/2));
B=max(Label.Batch);
sigmab_true=2;
beta_true=matrix(0,B,G); ### the matrix of the random effect
for(g in 1:G)
{
beta_true[,g]=rnorm(B,mean=0,sd=sigmab_true);
}
library(gtools)
T=10;
omega_true=matrix(rbind(rdirichlet(T*4,c(3,4,2,6)),rdirichlet(T*4,c(1,4,6,3)),
rdirichlet(T*4,c(4,1,2,2)),rdirichlet(T*4,c(2,6,3,2)),
rdirichlet(T*4,c(3,3,5,4))), nrow=N);
theta_b_true <- rbind(rdirichlet(2, c(0.2, 0.2, rep(0.6/G,G-2))),
rdirichlet(2, c(rep(0.5/G,G-3),0.3,0.1,0.1)),
rdirichlet(2, c(rep(0.4/G, G/2 -2),0.2,0.1,rep(0.3/G, G/2))),
rdirichlet(2, c(rep(0.6/G,G-2), 0.2, 0.2)));
read_counts <- t(do.call(cbind,lapply(1:dim(omega_true)[1], function(x)
{
if(Label.Batch[x]==1)
out <- rmultinom(1,1000,prob=omega_true[x,]%*%theta_b_true[c(2,4,6,8),]);
if(Label.Batch[x]==2)
out <- rmultinom(1,1000,prob=omega_true[x,]%*%theta_b_true[c(1,3,5,7),]);
return(out)
} )));
barplot(t(omega_true),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))
Topic_clus <- topics(read_counts, K, tol=0.01);
##
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 3854.71, 39.41, 10.6, 4.79, 2.65, 0.93, 0.25, 0.2, 0.19, 0.22, 0.26, 0.21, 0.12, 0.07, 0.04, 0.03, 0.03, 0.03, done.
docweights <- Topic_clus$omega;
barplot(t(docweights),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))
We perform the batch correction
batch_counts <- BatchCorrectedCounts(read_counts, batch_lab = Label.Batch);
Topic_clus_2 <- topics(batch_counts, K, tol=0.01);
##
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 10926.46, 5781.6, 1333.56, 597.02, 361.76, 263.73, 214.12, 184.62, 165.47, 152.31, 142.65, 134.91, 128.81, 123.88, 119.64, 115.9, 112.65, 109.78, 107.13, 104.56, 101.93, 99.09, 95.96, 92.47, 88.66, 84.71, 61.36, 58.05, 54.9, 51.9, 49.07, 29.44, 28.2, 26.44, 24.73, 23.08, 12.08, 11.96, 12.67, 16.33, 15.51, 13.07, 16.29, 19.42, 22.29, 24.86, 27.65, 31.7, 38.62, 52.54, 77.61, 76.84, 49.99, 37.38, 37.16, 42.43, 41.83, 41.97, 43.28, 44.79, 44.24, 38.33, 28.24, 19.53, 14.47, 15.23, 11.52, 11, 7.55, 6.05, 5.18, 4.67, 4.36, 4, 3.5, 2.99, 2.6, 2.3, 2.06, 1.88, 1.74, 1.65, 1.58, 1.53, 1.48, 1.42, 1.36, 1.3, 1.22, 1.15, 1.07, 0.99, 0.92, 0.84, 0.78, 0.73, 0.69, 0.67, 0.66, 0.66, p 100 iter 1000 diff 1
## 0.67, 0.68, 0.7, 0.71, 0.73, 0.74, 0.74, 0.73, 0.72, 0.7, 0.67, 0.63, 0.59, 0.55, 0.5, 0.46, 0.42, 0.39, 0.37, 0.35, 0.34, 0.35, 0.35, 0.36, 0.34, 0.31, 0.27, 0.24, 0.23, 0.24, 0.25, 0.26, 0.27, 0.29, 0.36, 0.63, 0.73, 0.51, 0.38, 0.3, 0.24, 0.2, 0.17, 0.15, 0.13, 0.12, 0.11, 0.1, 0.09, 0.08, 0.07, 0.07, 0.06, 0.06, 0.05, 0.05, 0.05, 0.04, 0.04, 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, done.
docweights_2 <- Topic_clus_2$omega;
library(permute);
library("BioPhysConnectoR");
perm_set=rbind(1:K,allPerms(1:K));
diff=array(0,dim(perm_set)[1]);
for (p in 1:dim(perm_set)[1])
{
temp=docweights_2[,perm_set[p,]];
diff[p]=fnorm(temp,omega_true);
}
p_star=which(diff==min(diff));
docweights_2=docweights_2[,perm_set[p_star,]];
barplot(t(docweights_2),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))
It seems that although the batch correction does improve on model fitting and seem to remove the batch effects problem, but the model fit after batch correction and corresponding Structure plot representation does not correspond very accurately to the true Structure plot representation.