Make figures for simple simulation with good and poor precision observations

Last updated: 2017-01-15

Code version: b05decc05714ddcb20aa04e56b557d5123d178fb

First, we load the necessary libraries and function definitions.

library(ashr)
library(qvalue)
library(locfdr)
library(mixfdr)
library(ggplot2)
source("../R/plot_FDReg_hist.R")

Create a simple simulated data set to illustrate high and low signal. True values of beta simulated from \((0.5 N(0,2^2) + 0.5 \delta_0)\).

ntest = 10000
set.seed(112)

null_alt  = rbinom(ntest,1,0.5)
beta      = rnorm(ntest,0,sd = 1)
beta      = ifelse(null_alt == 1,beta,0)
GOOD      = 1:(ntest/2)
sebetahat = rep(1,ntest)
sebetahat[-GOOD] = 10

betahat = beta + rnorm(ntest, 0, sebetahat)
zscore  = betahat/sebetahat
pval    = pchisq(zscore^2,df = 1,lower.tail = F)

Show how poor precision observations dilute good ones.

par(mai = c(0.3,0.3,0.2,0.2),mgp = c(3, 0.5, 0))
layout(matrix(1:3,ncol = 3,byrow = TRUE))
plot_FDReg_hist(pval[GOOD],1,type = 1,title = "Good-precision observations", 
                ylab = "",nc = 20,cex.axis = 1,cex.main = 1.2,ylim = c(0,2.5))
plot_FDReg_hist(pval[-GOOD],1,type = 1,
                title = "Poor-precision observations",ylab = "",nc = 20,
                yaxt = 'n',cex.axis = 1,cex.main = 1.2,ylim = c(0,2.5))
axis(side = 2, labels = FALSE,tck = -0.01)
plot_FDReg_hist(pval,1,type = 1,title = "Combined",yaxt = 'n',ylab = "",
                nc = 20,cex.axis = 1,cex.main = 1.2,ylim = c(0,2.5))
axis(side = 2, labels = FALSE,tck = -0.01)

Apply alternative methods to the same data set:

res.qvalue = qvalue(pval)
res.locfdr = locfdr(zscore,nulltype = 0,plot = 0)
res.ash    = ash(betahat,sebetahat,method = "fdr",outputlevel = 4)

res.qvalue.good = qvalue(pval[GOOD])
res.locfdr.good = locfdr(zscore[GOOD],nulltype = 0,plot = 0)
res.ash.good    = ash(betahat[GOOD],sebetahat[GOOD],method = "fdr")

Compare the ash’s accuracy against the alternative approaches.

res = rbind(data.frame(x    = res.qvalue.good$qvalues,
                       y    = res.qvalue$qvalues[GOOD],
                       type = "qvalue"),
            data.frame(x    = res.locfdr.good$fdr,
                       y    = res.locfdr$fdr[GOOD],
                       type = 'locfdr'), 
            data.frame(x    = get_lfsr(res.ash.good),
                       y    = get_lfsr(res.ash)[GOOD],
                       type = "ashr"))

pp = ggplot(data = res,aes(x,y)) + geom_point(shape = 1) +
  facet_grid(. ~ type) +
  geom_abline(colour = "red") +
  xlab("Analysing good-precision data only") +
  ylab("Analysing combined data")
print(pp + scale_y_continuous(limits = c(0,1)) +
      scale_x_continuous(limits = c(0,1))  +
      coord_equal(ratio = 1))

Compare the LFSR against the p-values with different prior choices, and at different levels of precision in the observations.

make_df_for_pval = function(ash,method = "default")
  data.frame(p      = pnorm(-abs(ash$data$x/ash$data$s)),
             lfsr   = get_lfsr(ash),
             s      = ash$data$s,
             method = method)

plot_pval_vs_lfsr=function(df,plot.it=TRUE){
  if (length(unique(df$s)) > 2) {
    df$s = log(df$s)
  } else { 
    df$s = as.factor(df$s)
  }
  
  p = ggplot(df,aes(x = p,y = lfsr,color = s)) + geom_point() +
      facet_grid(. ~ method) + xlim(c(0, 0.025)) + xlab("p value") + 
      ylab("lfsr")
  
  if (length(unique(df$s)) > 2)
    p = p + scale_colour_gradient2(midpoint = 1,low = "blue",mid = "white", 
                                   high = "red",space = "Lab")
  if (plot.it)
    print(p)
  return(p)
}

res.ash.ET = ash(betahat,sebetahat,method = "fdr",alpha = 1,
                 mixcompdist = "normal",outputlevel = 4)
p = plot_pval_vs_lfsr(rbind(
  make_df_for_pval(res.ash,method = "Default prior (alpha=0)"),
  make_df_for_pval(res.ash.ET,method = "p-value prior (alpha=1)")))

Warning: Removed 18452 rows containing missing values (geom_point).

print(p + theme(axis.text.x = element_text(size = 8,angle = 45)) + 
      scale_colour_manual(values = c("blue","red")))

Warning: Removed 18452 rows containing missing values (geom_point).

Now plot one at a time.

p = plot_pval_vs_lfsr(make_df_for_pval(res.ash,method =" ash"))

Warning: Removed 9226 rows containing missing values (geom_point).

print(p + theme(axis.text.x = element_text(size = 8,angle = 45)))

Warning: Removed 9226 rows containing missing values (geom_point).

p = plot_pval_vs_lfsr(make_df_for_pval(res.ash.ET,method = "ash (p-value prior)"))

Warning: Removed 9226 rows containing missing values (geom_point).

print(p + theme(axis.text.x = element_text(size = 8,angle = 45)))

Warning: Removed 9226 rows containing missing values (geom_point).

Compare the log-likelihoods:

res.ash.ET$logLR

[1] 178.7

res.ash$logLR

[1] 295.3

Plot number of true positives against false positives (an “ROC curve”):

df0 = cbind(make_df_for_pval(res.ash,method = "Default prior (alpha=0)"), 
            beta = beta)
df1 = cbind(make_df_for_pval(res.ash.ET,method = "p-value prior (alpha=1)"),
            beta = beta)
  
# False positives (fp) and true positives (tp).
fp = function(df) cumsum((df$beta == 0)[order(df$lfsr,decreasing = FALSE)])
tp = function(df) cumsum((df$beta != 0)[order(df$lfsr,decreasing = FALSE)])
df.fptp = rbind(data.frame(method = "ash lfsr",tp = tp(df0),fp = fp(df0)),
                data.frame(method = "p values",tp = tp(df1),fp = fp(df1)))
ggplot(df.fptp,aes(x = fp,y = tp,col = method)) + geom_line() + 
  xlab("False positives") + ylab("True positives") + xlim(c(0,100)) + 
  ylim(c(0,400)) + scale_colour_manual(values = c("black","green"))

Warning: Removed 19074 rows containing missing values (geom_path).

Session information

sessionInfo()

R version 3.3.2 (2016-10-31)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: macOS Sierra 10.12.2

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] knitr_1.15.1  ggplot2_2.2.0 mixfdr_1.0    locfdr_1.1-8  qvalue_2.4.2 
[6] ashr_2.0.5   

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.8       plyr_1.8.4        iterators_1.0.8  
 [4] tools_3.3.2       digest_0.6.10     evaluate_0.10    
 [7] tibble_1.2        gtable_0.2.0      lattice_0.20-34  
[10] Matrix_1.2-7.1    foreach_1.4.3     yaml_2.1.14      
[13] parallel_3.3.2    stringr_1.1.0     REBayes_0.63     
[16] rprojroot_1.1     grid_3.3.2        rmarkdown_1.3    
[19] reshape2_1.4.2    magrittr_1.5      backports_1.0.4  
[22] scales_0.4.1      codetools_0.2-15  htmltools_0.3.5  
[25] MASS_7.3-45       splines_3.3.2     assertthat_0.1   
[28] colorspace_1.2-6  labeling_0.3      stringi_1.1.2    
[31] Rmosek_7.1.3      lazyeval_0.2.0    pscl_1.4.9       
[34] doParallel_1.0.10 munsell_0.4.3     truncnorm_1.0-7  
[37] SQUAREM_2016.8-2

Make figures for simple simulation with good and poor precision observations

Matthew Stephens

2015-10-26

Session information