In this script, we explore the effectiveness of removing batch effects by the BatchCorrectedCounts()
function of the package CountClust
. We consider three simulation scenarios and then fit the topic model before and after removing the batch effects.
library(CountClust)
library(maptpx)
## Loading required package: slam
We present the simulation mechanism for scenario 1.
K=4;
G=100;
N=200;
alpha_true=matrix(rnorm((K)*G,0.5,1),nrow=(K)); ### the matrix of fixed effects
Label.Batch=c(rep(1,N/2),rep(2,N/2));
B=max(Label.Batch);
sigmab_true=2;
beta_true=matrix(0,B,G); ### the matrix of the random effect
for(g in 1:G)
{
beta_true[,g]=rnorm(B,mean=0,sd=sigmab_true);
}
library(gtools)
##
## Attaching package: 'gtools'
##
## The following object is masked from 'package:maptpx':
##
## logit
T=10;
omega_true=matrix(rbind(rdirichlet(T*4,c(3,4,2,6)),rdirichlet(T*4,c(1,4,6,3)),
rdirichlet(T*4,c(4,1,2,2)),rdirichlet(T*4,c(2,6,3,2)),
rdirichlet(T*4,c(3,3,5,4))), nrow=N);
over_dis=0.5;
noise_true=matrix(0,N,G);
for(n in 1:N)
{
noise_true[n,]=over_dis*rnorm(G,0,1);
}
### generating the table
read_counts=matrix(0,N,G);
for(n in 1:N)
{
for(g in 1:G)
{
mean=exp(omega_true[n,]%*%alpha_true[,g] +beta_true[Label.Batch[n],g]+noise_true[n,g]);
read_counts[n,g]=rpois(1,mean);
}
}
barplot(t(omega_true),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))
Topic_clus <- topics(read_counts, K, tol=0.01);
##
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 1987.11, 178.42, 75.88, 23.55, 15.87, 8.59, 5.06, 10.65, 36.37, 17.17, 10.32, 0.31, 0.17, 0.12, 0.08, 0.07, 0.06, 0.05, 0.04, 0.04, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.02, 0.01, 0.01, 0.01, done.
docweights <- Topic_clus$omega;
barplot(t(docweights),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))
We perform the batch correction
batch_counts <- BatchCorrectedCounts(read_counts, batch_lab = Label.Batch);
Topic_clus_2 <- topics(batch_counts, K, tol=0.01);
##
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 12216.86, 5752.5, 3411.88, 1818.31, 1045.38, 715.9, 553.12, 467.42, 410.68, 364.63, 321.2, 277.67, 234.67, 194.5, 158.99, 128.91, 104.26, 84.51, 68.9, 56.63, 14.7, 2.6, 4.05, 7.92, 5.06, 3.5, 2.42, 1.69, 1.21, 0.89, 0.68, 0.53, 0.42, 0.34, 0.28, 0.24, 0.21, 0.19, 0.17, 0.15, 0.15, 0.14, 0.14, 0.14, 0.14, 0.13, 0.12, 0.11, 0.1, 0.09, 0.08, 0.08, 0.07, 0.06, 0.06, 0.05, 0.05, 0.04, 0.04, 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.01, done.
docweights_2 <- Topic_clus_2$omega;
library(permute);
##
## Attaching package: 'permute'
##
## The following object is masked from 'package:gtools':
##
## permute
library("BioPhysConnectoR");
## Loading required package: snow
## Loading required package: matrixcalc
perm_set=rbind(1:K,allPerms(1:K));
diff=array(0,dim(perm_set)[1]);
for (p in 1:dim(perm_set)[1])
{
temp=docweights_2[,perm_set[p,]];
diff[p]=fnorm(temp,omega_true);
}
p_star=which(diff==min(diff));
docweights_2=docweights_2[,perm_set[p_star,]];
barplot(t(docweights_2),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))
We now present simulation scenario 2
K=4;
G=100;
N=200;
alpha_true=matrix(rnorm((K)*G,0.5,1),nrow=(K)); ### the matrix of fixed effects
Label.Batch=c(rep(1,N/2),rep(2,N/2));
B=max(Label.Batch);
sigmab_true=2;
beta_true=matrix(0,B,G); ### the matrix of the random effect
for(g in 1:G)
{
beta_true[,g]=rnorm(B,mean=0,sd=sigmab_true);
}
library(gtools)
T=10;
omega_true=matrix(rbind(rdirichlet(T*4,c(3,4,2,6)),rdirichlet(T*4,c(1,4,6,3)),
rdirichlet(T*4,c(4,1,2,2)),rdirichlet(T*4,c(2,6,3,2)),
rdirichlet(T*4,c(3,3,5,4))), nrow=N);
over_dis=0.5;
noise_true=matrix(0,N,G);
for(n in 1:N)
{
noise_true[n,]=over_dis*rnorm(G,0,1);
}
### generating the table
read_counts=matrix(0,N,G);
for(n in 1:N)
{
for(g in 1:G)
{
mean=omega_true[n,]%*%exp(alpha_true[,g] +beta_true[Label.Batch[n],g]+noise_true[n,g]);
read_counts[n,g]=rpois(1,mean);
}
}
barplot(t(omega_true),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))
Topic_clus <- topics(read_counts, K, tol=0.01);
##
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 2259.59, 846.7, 56.75, 14.84, 8.54, 7.46, 7.58, 7.07, 6.99, 6.07, 3.51, 1.1, 0.53, 0.57, 0.83, 1.26, 0.88, 0.56, 0.7, 0.63, 0.24, 0.25, 0.19, 0.08, 0.05, 0.04, 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, done.
docweights <- Topic_clus$omega;
barplot(t(docweights),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))
We perform the batch correction
batch_counts <- BatchCorrectedCounts(read_counts, batch_lab = Label.Batch);
Topic_clus_2 <- topics(batch_counts, K, tol=0.01);
##
## Estimating on a 200 document collection.
## Fitting the 4 topic model.
## log posterior increase: 41679.1, 10675.28, 2724.22, 1042.06, 570.95, 394.75, 311.2, 265.74, 238.51, 222.12, 212.22, 206.13, 202.53, 200.41, 198.89, 197.21, 194.7, 190.9, 185.6, 178.39, 168.68, 158.22, 147.56, 137.14, 127.28, 118.27, 110.33, 103.55, 181.9, 45.2, 42, 39.07, 36.35, 33.84, 31.53, 29.42, 26.58, 21.02, 16.95, 13.61, 11, 8.93, 7.24, 5.74, 4.24, 3.06, 2.56, 2.13, 1.74, 1.38, 1.05, 0.76, 0.52, 0.35, 0.23, 0.16, 0.17, 0.13, 0.12, 0.09, 0.07, 0.05, 0.05, 0.04, 0.03, 0.03, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, done.
docweights_2 <- Topic_clus_2$omega;
library(permute);
library("BioPhysConnectoR");
perm_set=rbind(1:K,allPerms(1:K));
diff=array(0,dim(perm_set)[1]);
for (p in 1:dim(perm_set)[1])
{
temp=docweights_2[,perm_set[p,]];
diff[p]=fnorm(temp,omega_true);
}
p_star=which(diff==min(diff));
docweights_2=docweights_2[,perm_set[p_star,]];
barplot(t(docweights_2),col=2:(K+1),axisnames=F,space=0,border=NA,main="",las=1,ylim=c(0,1),cex.axis=1.5,cex.main=1.4)
title(main=paste("Structure Plot of initial chosen topic proportions,k=",K))