This notebook is adapted from the 2SNP.ipynb workflow for MASH paper.
We are interested in evaluating change in effect size estimate from 1 SNP vs 2 SNP model. Specifically for given gene we want to take 2 SNPs: one with smallest p-value in brain tissues and the other with smallest p-value in non-brain tissues. Then for each tissue in GTEx we perform the following regression analyses (with covariates adjusted for):
We plan to plot for selected analysis the estimated effect sizes and standard error to show how effect estimates may change.
Analysis seems to work, but I cannot reproduce results for MASH paper Figure S4 --- see the end of this document. Notice here that Figure S4 was using GTEx V6 data and here I use GTEx V8 post-processed and formatted by myself, with univariate summary statistics computed on the fly.
Not able to reproduce MASH paper Figure S4 is a bit worrisome to me. Currently I format data for each gene to an RDS file with multivariate Y and genotypes in matrices. For each column of Y I have removed covariates. This is an "analysis ready" format being used by at least two projects. Up next, I want to check if my data processing is correct and the univariate analysis I do myself agree with the GTEx V8 summary statistics release from GTEx analysis group, before moving on to larger scale analysis.
sos run analysis/two_snps.ipynb \
--data-dir ~/Documents/GTExV8/Multi_Tissue_Toys \
--gene-id-file ~/Documents/GTExV8/Multi_Tissue_Toys/pleiotropy_genes.txt \
-j 30
[global]
parameter: output_dir = path('./output')
parameter: data_dir = path('/project2/compbio/GTEx_eQTL/cis_eqtl_analysis_ready')
parameter: gene_id_file = path(f"{data_dir}/multitissue_genes_list.txt")
parameter: missing_rate_cutoff = 0.05
parameter: maf_cutoff = 0.01
genes = [x.strip() for x in open(gene_id_file).readlines() if not x.startswith('#')]
fail_if(len(genes) == 0, msg = 'No gene to analyze!')
[0]
report: expand = "${ }", output = '.sos/utils.R'
###Functions to compute MAF and missing genotype rate
compute_maf <- function(geno){
f <- mean(geno,na.rm = TRUE)/2
return(min(f, 1-f))
}
compute_missing <- function(geno){
miss <- sum(is.na(geno))/length(geno)
return(miss)
}
mean_impute <- function(geno){
f <- apply(geno, 2, function(x) mean(x,na.rm = TRUE))
for (i in 1:length(f)) geno[,i][which(is.na(geno[,i]))] <- f[i]
return(geno)
}
is_zero_variance <- function(x) {
if (length(unique(x))==1) return(T)
else return(F)
}
### Filter X matrix
filter_X <- function(X, missing_rate_thresh, maf_thresh) {
rm_col <- which(apply(X, 2, compute_missing) > missing_rate_thresh)
if (length(rm_col)) X <- X[, -rm_col]
rm_col <- which(apply(X, 2, compute_maf) < maf_thresh)
if (length(rm_col)) X <- X[, -rm_col]
rm_col <- which(apply(X, 2, is_zero_variance))
if (length(rm_col)) X <- X[, -rm_col]
return(mean_impute(X))
}
matxMax <- function(mtx) {
colmn <- which.max(mtx) %/% nrow(mtx) + 1
row <- which.max(mtx) %% nrow(mtx)
return( matrix(c(row, colmn), 1))
}
tissues_brain = c('Brain_Amygdala',
'Brain_Anterior_cingulate_cortex_BA24',
'Brain_Caudate_basal_ganglia',
'Brain_Cerebellar_Hemisphere',
'Brain_Cerebellum',
'Brain_Cortex',
'Brain_Frontal_Cortex_BA9',
'Brain_Hippocampus',
'Brain_Hypothalamus',
'Brain_Nucleus_accumbens_basal_ganglia',
'Brain_Putamen_basal_ganglia',
'Brain_Spinal_cord_cervical_c-1',
'Brain_Substantia_nigra')
tissues_other = c('Adipose_Subcutaneous',
'Adipose_Visceral_Omentum',
'Adrenal_Gland',
'Artery_Aorta',
'Artery_Coronary',
'Artery_Tibial',
'Breast_Mammary_Tissue',
'Cells_Cultured_fibroblasts',
'Cells_EBV-transformed_lymphocytes',
'Colon_Sigmoid',
'Colon_Transverse',
'Esophagus_Gastroesophageal_Junction',
'Esophagus_Mucosa',
'Esophagus_Muscularis',
'Heart_Atrial_Appendage',
'Heart_Left_Ventricle',
'Kidney_Cortex',
'Liver',
'Lung',
'Minor_Salivary_Gland',
'Muscle_Skeletal',
'Nerve_Tibial',
'Ovary',
'Pancreas',
'Pituitary',
'Prostate',
'Skin_Not_Sun_Exposed_Suprapubic',
'Skin_Sun_Exposed_Lower_leg',
'Small_Intestine_Terminal_Ileum',
'Spleen',
'Stomach',
'Testis',
'Thyroid',
'Uterus',
'Vagina',
'Whole_Blood')
colors_brain = rep('#eeee00', 10)
colors_other = c("#ff6600",
"#ffaa00",
"#33dd33",
"#ff5555",
"#ffaa99",
"#ff0000",
"#33cccc",
"#cc66ff",
"#aaeeff",
"#eebb77",
"#cc9955",
"#8b7355",
"#552200",
"#bb9988",
"#9900ff",
"#660099",
"#22ffdd",
"#aabb66",
"#99ff00",
"#800000",
"#aaaaff",
"#ffd700",
"#ffaaff",
"#995522",
"#aaff99",
"#dddddd",
"#0000ff",
"#7777ff",
"#555522",
"#778855",
"#ffdd99",
"#aaaaaa",
"#006600",
"#ff66ff",
"#ff5599",
"#ff00bb")
tissues = c(tissues_brain, tissues_other)
gtex_colors = c(colors_brain, colors_other)
# Perform marginal association + two SNP association analysis
[1]
depends: R_library('susieR')
input: [f'{data_dir}/{gene}.Multi_Tissues.rds' for gene in genes], group_by = 1
output: f'{output_dir}/{_input:bn}.two_snp.rds'
# each job uses 10 nodes, each node 4 cores in parallel each core using 2G memory; and jobs are created in batches of 40.
task: trunk_workers = [4] * 10, trunk_size = 40, walltime = '30m', mem = '8G', cores = 1, tags = f'{step_name}_{_output:bn}'
R: expand = '${ }', input = '.sos/utils.R'
attach(readRDS(${_input:r}))
# setup data
X <- filter_X(X, ${missing_rate_cutoff}, ${maf_cutoff})
# get univariate results
univariate_res = lapply(1:ncol(y_res), function(i) susieR:::univariate_regression(X,y_res[,i]))
bhat = do.call(cbind, lapply(1:ncol(y_res), function(i) univariate_res[[i]]$betahat))
shat = do.call(cbind, lapply(1:ncol(y_res), function(i) univariate_res[[i]]$sebetahat))
dat = abs(bhat / shat)
colnames(bhat) = colnames(y_res)
rownames(bhat) = colnames(X)
colnames(shat) = colnames(y_res)
rownames(shat) = colnames(X)
colnames(dat) = colnames(y_res)
rownames(dat) = colnames(X)
brain.col = which(colnames(dat) %in% tissues_brain)
snp.brain.idx = matxMax(dat[, brain.col])[1]
non.brain.col = which(colnames(dat) %in% tissues_other)
snp.non.brain.idx = matxMax(dat[, non.brain.col])[1]
# get two-snp results
res = lapply(1:ncol(y_res), function(i) summary(lm(y_res[,i] ~ X[,c(snp.brain.idx, snp.non.brain.idx)]))$coef)
saveRDS(list(bhat=bhat[c(snp.brain.idx, snp.non.brain.idx),],
sbhat=shat[c(snp.brain.idx, snp.non.brain.idx),],
two.snps=res), ${_output:r})
# metaplot
[2]
depends: R_library('rmeta')
output: f'{_input:n}.pdf'
R: expand = '${ }', input = '.sos/utils.R'
library(rmeta)
dat = readRDS(${_input:r})
gtex.colors = as.character(gtex_colors)[match(colnames(dat$bhat), tissues)]
get_coef = function(x) {
if (nrow(x) == 3) return(x[c(2,3),])
else return(x[c(2,2),])
}
pdf(${_output:r})
# beta
joint_1b = sapply(1:length(dat$two.snps), function(i) get_coef(dat$two.snps[[i]])[1,1])
joint_2b = sapply(1:length(dat$two.snps), function(i) get_coef(dat$two.snps[[i]])[2,1])
# SE
joint_1e = sapply(1:length(dat$two.snps), function(i) get_coef(dat$two.snps[[i]])[1,2])
joint_2e = sapply(1:length(dat$two.snps), function(i) get_coef(dat$two.snps[[i]])[2,2])
par(mfrow=c(2,2))
metaplot(dat$bhat[1,],dat$sbhat[1,],xlab = "",ylab="",colors=meta.colors(box=gtex.colors), xlim=c(-1.5,1.5), main = rownames(dat$bhat)[1])
metaplot(dat$bhat[2,],dat$sbhat[2,],xlab = "",ylab="",colors=meta.colors(box=gtex.colors), xlim=c(-1.5,1.5), main = rownames(dat$bhat)[2])
metaplot(joint_1b,joint_1e,xlab = "",ylab="",colors=meta.colors(box=gtex.colors), xlim=c(-1.5,1.5), main = paste0(rownames(dat$bhat)[1], "\nin joint analysis"))
metaplot(joint_2b,joint_2e,xlab = "",ylab="",colors=meta.colors(box=gtex.colors), xlim=c(-1.5,1.5), main = paste0(rownames(dat$bhat)[2], "\nin joint analysis"))
dev.off()
However, I'm not able to reproduce Urbut et al (2019) Figure S4 with this analysis. In the paper,
%preview /home/gaow/NIHMS1508683-supplement-1.f4.png
In current analysis,
%preview ../output/ENSG00000264247.1.Multi_Tissues.two_snp.pdf -s png --dpi 150
This result suggest that I have to identify from other data sets examples I need to illustrate the point.
Exported from analysis/two_snps.ipynb
committed by Gao Wang on Tue Feb 2 19:11:23 2021 revision 1, c5fe213