Last updated: 2017-01-15
Code version: b05decc05714ddcb20aa04e56b557d5123d178fb
First, we load the necessary libraries and function definitions.
library(ashr)
library(qvalue)
library(locfdr)
library(mixfdr)
library(ggplot2)
source("../R/plot_FDReg_hist.R")
Create a simple simulated data set to illustrate high and low signal. True values of beta simulated from \((0.5 N(0,2^2) + 0.5 \delta_0)\).
ntest = 10000
set.seed(112)
null_alt = rbinom(ntest,1,0.5)
beta = rnorm(ntest,0,sd = 1)
beta = ifelse(null_alt == 1,beta,0)
GOOD = 1:(ntest/2)
sebetahat = rep(1,ntest)
sebetahat[-GOOD] = 10
betahat = beta + rnorm(ntest, 0, sebetahat)
zscore = betahat/sebetahat
pval = pchisq(zscore^2,df = 1,lower.tail = F)
Show how poor precision observations dilute good ones.
par(mai = c(0.3,0.3,0.2,0.2),mgp = c(3, 0.5, 0))
layout(matrix(1:3,ncol = 3,byrow = TRUE))
plot_FDReg_hist(pval[GOOD],1,type = 1,title = "Good-precision observations",
ylab = "",nc = 20,cex.axis = 1,cex.main = 1.2,ylim = c(0,2.5))
plot_FDReg_hist(pval[-GOOD],1,type = 1,
title = "Poor-precision observations",ylab = "",nc = 20,
yaxt = 'n',cex.axis = 1,cex.main = 1.2,ylim = c(0,2.5))
axis(side = 2, labels = FALSE,tck = -0.01)
plot_FDReg_hist(pval,1,type = 1,title = "Combined",yaxt = 'n',ylab = "",
nc = 20,cex.axis = 1,cex.main = 1.2,ylim = c(0,2.5))
axis(side = 2, labels = FALSE,tck = -0.01)
Apply alternative methods to the same data set:
res.qvalue = qvalue(pval)
res.locfdr = locfdr(zscore,nulltype = 0,plot = 0)
res.ash = ash(betahat,sebetahat,method = "fdr",outputlevel = 4)
res.qvalue.good = qvalue(pval[GOOD])
res.locfdr.good = locfdr(zscore[GOOD],nulltype = 0,plot = 0)
res.ash.good = ash(betahat[GOOD],sebetahat[GOOD],method = "fdr")
Compare the ash’s accuracy against the alternative approaches.
res = rbind(data.frame(x = res.qvalue.good$qvalues,
y = res.qvalue$qvalues[GOOD],
type = "qvalue"),
data.frame(x = res.locfdr.good$fdr,
y = res.locfdr$fdr[GOOD],
type = 'locfdr'),
data.frame(x = get_lfsr(res.ash.good),
y = get_lfsr(res.ash)[GOOD],
type = "ashr"))
pp = ggplot(data = res,aes(x,y)) + geom_point(shape = 1) +
facet_grid(. ~ type) +
geom_abline(colour = "red") +
xlab("Analysing good-precision data only") +
ylab("Analysing combined data")
print(pp + scale_y_continuous(limits = c(0,1)) +
scale_x_continuous(limits = c(0,1)) +
coord_equal(ratio = 1))
Compare the LFSR against the p-values with different prior choices, and at different levels of precision in the observations.
make_df_for_pval = function(ash,method = "default")
data.frame(p = pnorm(-abs(ash$data$x/ash$data$s)),
lfsr = get_lfsr(ash),
s = ash$data$s,
method = method)
plot_pval_vs_lfsr=function(df,plot.it=TRUE){
if (length(unique(df$s)) > 2) {
df$s = log(df$s)
} else {
df$s = as.factor(df$s)
}
p = ggplot(df,aes(x = p,y = lfsr,color = s)) + geom_point() +
facet_grid(. ~ method) + xlim(c(0, 0.025)) + xlab("p value") +
ylab("lfsr")
if (length(unique(df$s)) > 2)
p = p + scale_colour_gradient2(midpoint = 1,low = "blue",mid = "white",
high = "red",space = "Lab")
if (plot.it)
print(p)
return(p)
}
res.ash.ET = ash(betahat,sebetahat,method = "fdr",alpha = 1,
mixcompdist = "normal",outputlevel = 4)
p = plot_pval_vs_lfsr(rbind(
make_df_for_pval(res.ash,method = "Default prior (alpha=0)"),
make_df_for_pval(res.ash.ET,method = "p-value prior (alpha=1)")))
Warning: Removed 18452 rows containing missing values (geom_point).
print(p + theme(axis.text.x = element_text(size = 8,angle = 45)) +
scale_colour_manual(values = c("blue","red")))
Warning: Removed 18452 rows containing missing values (geom_point).
Now plot one at a time.
p = plot_pval_vs_lfsr(make_df_for_pval(res.ash,method =" ash"))
Warning: Removed 9226 rows containing missing values (geom_point).
print(p + theme(axis.text.x = element_text(size = 8,angle = 45)))
Warning: Removed 9226 rows containing missing values (geom_point).
p = plot_pval_vs_lfsr(make_df_for_pval(res.ash.ET,method = "ash (p-value prior)"))
Warning: Removed 9226 rows containing missing values (geom_point).
print(p + theme(axis.text.x = element_text(size = 8,angle = 45)))
Warning: Removed 9226 rows containing missing values (geom_point).
Compare the log-likelihoods:
res.ash.ET$logLR
[1] 178.7
res.ash$logLR
[1] 295.3
Plot number of true positives against false positives (an “ROC curve”):
df0 = cbind(make_df_for_pval(res.ash,method = "Default prior (alpha=0)"),
beta = beta)
df1 = cbind(make_df_for_pval(res.ash.ET,method = "p-value prior (alpha=1)"),
beta = beta)
# False positives (fp) and true positives (tp).
fp = function(df) cumsum((df$beta == 0)[order(df$lfsr,decreasing = FALSE)])
tp = function(df) cumsum((df$beta != 0)[order(df$lfsr,decreasing = FALSE)])
df.fptp = rbind(data.frame(method = "ash lfsr",tp = tp(df0),fp = fp(df0)),
data.frame(method = "p values",tp = tp(df1),fp = fp(df1)))
ggplot(df.fptp,aes(x = fp,y = tp,col = method)) + geom_line() +
xlab("False positives") + ylab("True positives") + xlim(c(0,100)) +
ylim(c(0,400)) + scale_colour_manual(values = c("black","green"))
Warning: Removed 19074 rows containing missing values (geom_path).
sessionInfo()
R version 3.3.2 (2016-10-31)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: macOS Sierra 10.12.2
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] knitr_1.15.1 ggplot2_2.2.0 mixfdr_1.0 locfdr_1.1-8 qvalue_2.4.2
[6] ashr_2.0.5
loaded via a namespace (and not attached):
[1] Rcpp_0.12.8 plyr_1.8.4 iterators_1.0.8
[4] tools_3.3.2 digest_0.6.10 evaluate_0.10
[7] tibble_1.2 gtable_0.2.0 lattice_0.20-34
[10] Matrix_1.2-7.1 foreach_1.4.3 yaml_2.1.14
[13] parallel_3.3.2 stringr_1.1.0 REBayes_0.63
[16] rprojroot_1.1 grid_3.3.2 rmarkdown_1.3
[19] reshape2_1.4.2 magrittr_1.5 backports_1.0.4
[22] scales_0.4.1 codetools_0.2-15 htmltools_0.3.5
[25] MASS_7.3-45 splines_3.3.2 assertthat_0.1
[28] colorspace_1.2-6 labeling_0.3 stringi_1.1.2
[31] Rmosek_7.1.3 lazyeval_0.2.0 pscl_1.4.9
[34] doParallel_1.0.10 munsell_0.4.3 truncnorm_1.0-7
[37] SQUAREM_2016.8-2