This page contains information to reproduce the numerical comparisons in SuSiE manuscript.
Note that the DSC benchmark has to be completed at this stage. Please follow these instructions to run the DSC benchmark for SuSiE fine-mapping applications.
%revisions -s --tags --no-walk
The work directory on my computer is ~/GIT/github/mvarbvs/dsc/
. I will navigate to that directory to perform analysis and show the results.
%cd ~/GIT/github/mvarbvs/dsc
%preview susie_comparison/PIP_comparison_1008_calibrated_prior_0p1_null_0p0.png
With null weight penalty,
%preview susie_comparison/PIP_comparison_1008_calibrated_prior_0p1_null_0p9.png
Using estimated prior,
%preview susie_comparison/PIP_comparison_1008_calibrated_prior_0p0_null_0p0.png
%preview susie_comparison/ROC_1008_prior_0p1.png --width 90%
Using estimated prior,
%preview susie_comparison/ROC_1008_prior_0p0.png --width 90%
%preview susie_comparison/hist_1008.png --width 90%
%preview hard_case/hist_1008_L10.png --width 18%
%preview susie_comparison/purity_1007/37.png --width 80%
For "simple case" simulation, number of causal is 1~5 out of 1000 non-causal variants; for the "hard case" number of causal is 10 out of 3000~12000 non-causal variants.
sos run analysis/20180527_PIP_Workflow.ipynb power
sos run analysis/20180615_Power_DAP.ipynb power
for i in 0.0 0.1; do
for j in 0.0 0.5 0.9 0.95; do
sos run ~/GIT/github/susie-paper/manuscript_results/numerical_results.ipynb cs_eval \
--null-weight $j --susie-prior $i -s force
done
done
readRDS('susie_comparison/Power_comparison_1008_cluster_prob_prior_0p1_null_0p0.rds')
CS overlapping status:
readRDS('susie_comparison/Power_comparison_1008_cluster_prob_prior_0p1_null_0p0.overlap_status.rds')
readRDS('hard_case/Power_comparison_1008_cluster_prob_prior_0p1_null_0p0.rds')
CS overlapping status (there are none):
readRDS('hard_case/Power_comparison_1008_cluster_prob_prior_0p1_null_0p0.overlap_status.rds')
Out of 1500 simulations under different scenarios, only one instance we see CS overlaps, that the two CS in question have size 623 and 640 respectively (with purity > 0.5), and the CS with 623 variables is subset of that with 640. This is an edge case. Overall, overlapping CS is not a concern here.
[global]
def fmtP(x):
return str(x).replace(".", "p")
date = '1008'
parameter: susie_prior = 0.0
parameter: null_weight = 0.0
parameter: cwd = path('./')
[cs_eval_1]
depends: R_library('ggplot2'), R_library('cowplot')
input: f'{cwd:a}/susie_comparison/Power_comparison_{date}_cluster_prob_prior_{fmtP(susie_prior)}_null_{fmtP(null_weight)}.rds', f'{cwd:a}/hard_case/Power_comparison_{date}_cluster_prob_prior_{fmtP(susie_prior)}_null_{fmtP(null_weight)}.rds'
output: f'{cwd:a}/susie_comparison/cs_comparison_prior_{fmtP(susie_prior)}_null_{fmtP(null_weight)}.pdf'
R: expand = "${ }", workdir = cwd, stderr = False, stdout = False
library(ggplot2)
library(cowplot)
# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols), byrow=TRUE)
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
d1 = readRDS(${_input[0]:r})
d2 = readRDS(${_input[1]:r})
dat = rbind(d1,d2)
#print(head(dat))
dat$susie_power_se = sqrt(dat$susie_power * (1-dat$susie_power) / dat$expected_discoveries)
dat$dap_power_se = sqrt(dat$dap_power * (1-dat$dap_power) / dat$expected_discoveries)
dat$susie_coverage_se = sqrt(dat$susie_coverage * (1-dat$susie_coverage) / dat$susie_discoveries)
dat$dap_coverage_se = sqrt(dat$dap_coverage * (1-dat$dap_coverage) / dat$dap_discoveries)
susie = cbind(dat[,c("n_signal", "expected_discoveries", "susie_power", "susie_coverage", "susie_power_se", "susie_coverage_se", "susie_median_size", "susie_avg_ld")], "SuSiE")
colnames(susie) = c("n_signal", "expected_discoveries", "power", "coverage", "power_se", "coverage_se", "median_size", "avg_ld", "Method")
dap = cbind(dat[,c("n_signal", "expected_discoveries", "dap_power", "dap_coverage", "dap_power_se", "dap_coverage_se", "dap_median_size", "dap_avg_ld")], "DAP-G")
colnames(dap) = c("n_signal", "expected_discoveries", "power", "coverage", "power_se", "coverage_se", "median_size", "avg_ld", "Method")
dat = rbind(susie, dap)
dat$n_signal = as.factor(dat$n_signal)
plot_panel = function(dat, quantity, legend = TRUE) {
p = ggplot(dat, aes_string(x="n_signal", y=quantity[1], fill="Method")) +
scale_color_manual("Method", values = c("DAP-G" = "#348ABD", "SuSiE" = "#A60628")) +
geom_point(aes(colour=Method), position=position_dodge(.25), size=2.5)
if (quantity[1] == 'power') p = p + geom_errorbar(aes(ymin=power-power_se, ymax=power+power_se, color=Method), width=.2, position=position_dodge(.25))
if (quantity[1] == 'coverage') p = p + geom_errorbar(aes(ymin=coverage-coverage_se, ymax=coverage+coverage_se, color=Method), width=.2, position=position_dodge(.25)) + geom_hline(yintercept = 0.95, colour = 'gray')
p = p + labs(x = "number of effect variables", y = "") + theme_cowplot() + background_grid(major = "x", minor = "none") + ggtitle(quantity[2])
if (!legend) p = p + theme(legend.position="none")
return(p)
}
p1 = plot_panel(dat, c('coverage', 'A. coverage'), legend = FALSE)
p2 = plot_panel(dat, c('power', 'B. power'), legend = FALSE)
p3 = plot_panel(dat, c('median_size', 'C. median number of variables'), legend = FALSE)
p4 = plot_panel(dat, c('avg_ld', 'D. average r2'), legend = FALSE)
pdf('${_output:n}.pdf', width=12, height=3)
print(multiplot(p1, p2, p3, p4, cols=4))
dev.off()
%preview susie_comparison/cs_comparison_prior_0p1_null_0p0.pdf -s png --dpi 120
Use estimated prior,
%preview susie_comparison/cs_comparison_prior_0p0_null_0p0.pdf -s png --dpi 120
%preview susie_comparison/PIP_comparison_1008_prior_0p1_null_0p0.susie_vs_dap.png --width 90%