## Fecundity Gene Expression Correlations # ## created by jamilla a ## updated: 2020-02-09 library(tidyverse) library(corrplot) library(Matching) ## functions --------- # summarize expression data - take average of 2 replicates when possible sumexp = function(data){ data.sum = matrix(NA, dim(data)[1], dim(data)[2]) cols = str_split_fixed(colnames(data), ':', 2)[,1] for(i in 2:(length(data)-1)){ if(cols[i-1] != cols[i] && cols[i+1] == cols[i]){ #average two replicates data.sum[,i] = apply(data[,c(i,i+1)], 1, FUN = 'mean') } else if(cols[i-1] != cols[i] && cols[i+1] != cols[i]){ #or just use the single replicate data.sum[,i] = pull(data, i) #convert to vector } } temp = data.sum[,!colSums(is.na(data.sum))] #put in col and row names colnames(temp) = unique(str_split_fixed(colnames(data[,-1]), ':', 2)[,1]) rownames(temp) = pull(data, 1) return(temp) } #select validation genes from expression data orderExprData = function(avg.exp, gene.names){ fecundity.exp = subset(avg.exp, rownames(avg.exp) %in% gene.names) fecundity.exp = data.frame(t(fecundity.exp)) rownames(fecundity.exp) = str_split_fixed(rownames(fecundity.exp), '_', 2)[,2] fecundity.exp = fecundity.exp[order(as.numeric(rownames(fecundity.exp))),] return(fecundity.exp) } #run correlation test on gene x gene expression data gene.cor.test = function(data1, data2){ cor.exp = matrix(NA, dim(data1)[2], dim(data2)[2]) pval.exp = matrix(NA, dim(data1)[2], dim(data2)[2]) for(i in 1:dim(data1)[2]){ for(j in 1:dim(data2)[2]){ cor.exp[i,j] = as.numeric(cor.test(data1[,i], data2[,j])$estimate) pval.exp[i,j] = cor.test(data1[,i], data2[,j])$p.value } } colnames(cor.exp) = colnames(data2) colnames(pval.exp) = colnames(data2) rownames(cor.exp) = colnames(data1) rownames(pval.exp) = colnames(data1) return(list('cor'= cor.exp, 'pval'= pval.exp)) } #correlate expression of genes with trait values #pheno: data frame with trait values - rows (lines), cols (traits) #fecundity.exp: data frame with cols (genes) and rows (lines) trait.cor.test = function(fecundity.exp, pheno){ #subset only to lines tested lines = intersect(rownames(fecundity.exp), rownames(pheno)) fecundity.exp = subset(fecundity.exp, rownames(fecundity.exp) %in% lines) pheno = subset(pheno, rownames(pheno) %in% lines) #correlation matrix cor.exp = matrix(NA, dim(fecundity.exp)[2], dim(pheno)[2]) pval.exp = matrix(NA, dim(fecundity.exp)[2], dim(pheno)[2]) for(i in 1:dim(pheno)[2]){ for(j in 1:dim(fecundity.exp)[2]){ cor.exp[j,i] = as.numeric(cor.test(fecundity.exp[,j], pheno[,i])$estimate) pval.exp[j,i] = cor.test(fecundity.exp[,j], pheno[,i])$p.value } } #add in col and row names rownames(cor.exp) = colnames(fecundity.exp) rownames(pval.exp) = colnames(fecundity.exp) colnames(cor.exp) = colnames(pheno) colnames(pval.exp) = colnames(pheno) return(list('cor'= cor.exp, 'pval'= pval.exp)) } #generate a random set (n) of genes for correlating gene vs. gene expr #for use in bootstrapping bootstr.mat = function(avg.exp, n){ tot = dim(avg.exp)[1] rand.genes = sample(tot, n) gene.names = rownames(avg.exp)[rand.genes] #t = str_split_fixed(colnames(avg.exp), '_', 2)[,2] %in% as.character(lines) #create expression data frame with rand genes as columns and lines as rows rand.exp = subset(avg.exp, rownames(avg.exp) %in% gene.names) rand.exp = data.frame(t(rand.exp)) rownames(rand.exp) = str_split_fixed(rownames(rand.exp), '_', 2)[,2] rand.exp = rand.exp[order(as.numeric(rownames(rand.exp))),] return(rand.exp) } #generate reps (1000) random correlation matrices & compare #the num sigf corrs in fecundity top hits (n) to num sigf corrs #of random set (n) of genes bootstr.corr = function(avg.exp, n, reps){ temp = rep(NA, reps) for (i in 1:reps){ d = bootstr.mat(avg.exp, n) d.exp = gene.cor.test(d,d) d.exp$pval.adj = p.adjust(d.exp$pval, method = 'BH') d.exp$pval.adj = matrix(d.exp$pval.adj, n, n) rand = d.exp$cor[d.exp$pval.adj < 0.05] len_rand = length(unique(rand[round(rand)!=1])) temp[i] = len_rand } return(temp) } ##load expression data from Huang et al. 2015 --------- setwd("~/de Bivort 2/Fecundity/new analysis/gwas/DGRP data") #fem.exp = read_delim('dgrp.array.exp.female.txt', delim = ' ') #male.exp = read_delim('dgrp.array.exp.male.txt', delim = ' ') #summarize the expression data #avg.fem.exp = sumexp(fem.exp) #avg.male.exp = sumexp(male.exp) load('Average DGRP Expression Data.RData') ##load candidate gene FlyBase Gene IDs -------------- setwd("~/de Bivort 2/Fecundity/gwas") validation.lines = read_tsv('validation_genes.txt') #genes validated fb_ids = pull(validation.lines, FlyBaseID) candidate.lines = read_tsv('candidate_genes.txt') #all candidate genes fb_ids = pull(candidate.lines, FlyBaseID) ##pull out validation (candidate) genes from expression data -------------- #female fecundity.exp = orderExprData(avg.fem.exp, fb_ids) colnames(fecundity.exp) = pull(validation.lines, Gene)[c(1,3:6)] #missing CG8312 colnames(fecundity.exp) = pull(candidate.lines, Gene)[c(1:9,11:18)] #male fecundity.exp.m = orderExprData(avg.male.exp, fb_ids) colnames(fecundity.exp.m) = pull(validation.lines, Gene)[c(1,3:6)]#missing CG8312 colnames(fecundity.exp.m) = pull(candidate.lines, Gene)[c(1:9,11:18)] ##correlation test for gene x gene interactions -------------- femaleGeneCorr = gene.cor.test(fecundity.exp, fecundity.exp) femaleGeneCorr$pval.adj = p.adjust(femaleGeneCorr$pval, method = 'BH') femaleGeneCorr$pval.adj = matrix(femaleGeneCorr$pval.adj, length(fecundity.exp), length(fecundity.exp)) maleGeneCorr = gene.cor.test(fecundity.exp.m, fecundity.exp.m) maleGeneCorr$pval.adj = p.adjust(maleGeneCorr$pval, method = 'BH') maleGeneCorr$pval.adj = matrix(maleGeneCorr$pval.adj, length(fecundity.exp.m),length(fecundity.exp.m)) ##plot correlation matrices for gene x gene interactions -------------------- corrplot(t(femaleGeneCorr$cor), method= "ellipse", tl.cex =0.7, tl.srt = 45, cl.pos = 'r', p.mat = t(femaleGeneCorr$pval.adj), sig.level = 0.05, insig = 'p-value', cl.align.text = "l", cl.cex = 0.7) corrplot(t(maleGeneCorr$cor), method= "ellipse", tl.cex =0.7, tl.srt = 45, cl.pos = 'r', p.mat = t(maleGeneCorr$pval.adj), sig.level = 0.05, insig = 'p-value', cl.align.text = "l", cl.cex = 0.7) ##resampling correlation matrices for gene x gene interactions ------------------- #find number of significant correlations (post-mult test correction) in obs data obs_f = femaleGeneCorr$cor[femaleGeneCorr[["pval.adj"]] < 0.05] #find only sigf cors obs_m = maleGeneCorr$cor[maleGeneCorr[["pval.adj"]] < 0.05] #find only sigf cors len_obs_f = length(unique(obs_f[round(obs_f)!=1])) #filter out all 1 correlations len_obs_m = length(unique(obs_m[round(obs_m)!=1])) #filter out all 1 correlations #resampling boot_est_f = bootstr.corr(avg.fem.exp, 17, 1000) boot_est_m = bootstr.corr(avg.male.exp, 17, 1000) #plotting q = qplot(boot_est_m, geom='density') q + geom_vline(xintercept = len_obs_m, size = 1.5, color = 'red') + theme_classic() + theme(axis.text.y = element_text(size=16), axis.text.x = element_text(size=16), axis.title.y = element_text(size = 18), axis.title.x = element_text(size = 18), axis.ticks.length = unit(.25, "cm"), axis.ticks.x = element_blank(), axis.line.y = element_line(lineend = 'butt'), axis.line.x = element_line(lineend = 'butt'), plot.margin = margin(t = 20, r = 20, b = 20, l = 20, unit = "pt")) + scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0)) ##correlation test for gene x trait interactions -------------- #load in the phenotype data - line effects from modeling setwd("~/de Bivort 2/Fecundity/new analysis/data") pheno = read_csv('Line Effects Data.csv') lines = pull(pheno, Line) pheno = as.matrix(pheno[,c(2:6)])#take away the Line column and make matrix rownames(pheno) = lines #correlations femaleTraitCorr = trait.cor.test(fecundity.exp, pheno) femaleTraitCorr$pval.adj = p.adjust(femaleTraitCorr$pval, method = 'BH') femaleTraitCorr$pval.adj = matrix(femaleTraitCorr$pval.adj, 5, 5) maleTraitCorr = trait.cor.test(fecundity.exp.m, pheno) maleTraitCorr$pval.adj = p.adjust(maleTraitCorr$pval, method = 'BH') maleTraitCorr$pval.adj = matrix(maleTraitCorr$pval.adj, 5, 5) ##plot correlation matrices for gene x trait interactions -------------------- corrplot(t(femaleTraitCorr$cor), method= "ellipse", tl.cex =0.7, tl.srt = 45, cl.pos = 'r', p.mat = t(femaleTraitCorr$pval.adj), sig.level = 0.05, insig = 'p-value', cl.align.text = "l", cl.cex = 0.7) corrplot(t(maleTraitCorr$cor), method= "ellipse", tl.cex =0.7, tl.srt = 45, cl.pos = 'r', p.mat = t(maleTraitCorr$pval.adj), sig.level = 0.05, insig = 'p-value', cl.align.text = "l", cl.cex = 0.7)