#!/usr/bin/env Rscript #' # Differential abundance analysis #' #' Created by: `r system("whoami", intern=T)` #' #' Created at: `r format(Sys.Date(), format="%B %d %Y")` #----------------------------------------------------------------------------------------------------------------------- #+ include=FALSE suppressMessages(require(docopt)) doc = ' Perform a differential gene expression analysis using limma and edgeR Usage: featcounts_deseq_mf.R [options] Options: --contrasts= Table with sample pairs for which dge analysis should be performed --design Design fomula for DeSeq with contrast attribute at the end [default: condition] --qcutoff Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01] --pcutoff Override q-value filter and filter by p-value instead --out Name to prefix all generated result files [default: ] --lfc Just report genes with abs(lfc) > lfc_cutoff as hits [default: 1.0] --protein_info Protein infos extracted from MaxQuant result file and exported as part of ms_data_prep.R ' # commandArgs = function(foo) c("--contrast", "contrasts.txt", "--protein_info", "../data_prep.feature_information.txt", "intens_imputed.txt", "exp_design.txt") # commandArgs = function(foo) c("--contrast", "contrasts.txt", "intens_imputed.txt", "exp_design.txt") # commandArgs = function(foo) c("--gene_info", "../mmus_ens_aug2017_uniprot_compl_gene_info.txt","--contrasts", "example_contrasts.txt", "../inten_matrix_acc.txt", "../diff_abund/ex_design.txt") # commandArgs = function(x) c("--contrasts", "contrasts.txt", "../../data_prep.intens_imputed.txt", "exp_design_timepoint.txt") opts = docopt(doc, commandArgs(TRUE)) #----------------------------------------------------------------------------------------------------------------------- devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.49/R/core_commons.R") devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.49/R/ggplot_commons.R") devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v10/dge_workflow/diffex_commons.R") install_package("cummeRbund") load_pack(knitr) load_pack(gridExtra) load_pack(limma) load_pack(edgeR) load_pack(GGally) load_pack(corrplot) load_pack(d3heatmap) load_pack(limma) load_pack(edgeR) #----------------------------------------------------------------------------------------------------------------------- results_prefix = "limma_diffex" ## process input arguments count_matrix_file = opts$count_matrix design_matrix_file = opts$design_matrix contrasts_file = opts$contrasts protein_info_file = opts$protein_info assert(is.null(protein_info_file) || file.exists(protein_info_file), "invalid protein_info_file") designFormula = opts$design assert(str_detect(designFormula, "^condition.*")) ## make sure that the condition comes before all batch factors if (!is.null(opts$out)){ results_prefix = opts$out } pcutoff = if (is.null(opts$pcutoff))NULL else as.numeric(opts$pcutoff) qcutoff = if (is.numeric(pcutoff))NULL else as.numeric(opts$qcutoff) if (is.numeric(pcutoff))opts$qcutoff = NULL lfc_cutoff = if (is.null(opts$lfc))0 else as.numeric(opts$lfc) #' Run configuration was vec_as_df(unlist(opts)) %>% filter(! str_detect(name, "^[<-]")) %>% kable() # The working directory of the analysis was: `r getwd()` #' #'

#' #----------------------------------------------------------------- #' ## Data Preparation #' #'
#' #' experimental design: expDesign = read_tsv(design_matrix_file) kable(expDesign) #' #'

#' #' count matrix: countData = read_tsv(count_matrix_file) %T>% glimpse # import protein information if (is.null(protein_info_file)) { protein_info = distinct(countData, protein_ids) } else { protein_info <- read_tsv(protein_info_file) } colnames(protein_info[,1]) <- "protein_ids" #' #'

#' #' contrasts ## set contrast with comparison of each condition against each if not specified otherwise: if (! is.null(contrasts_file)) { contrasts = read_tsv(contrasts_file) }else { contrasts = select(expDesign, condition) %>% distinct %>% merge(., ., suffixes = c("_1", "_2"), by = NULL) %>% filter(ac(condition_1) > ac(condition_2)) # write_tsv(contrasts, paste(resultsBase, "contrasts.txt")) } kable(contrasts) #' #'

#' #----------------------------------------------------------------- #' ## QC, Normalization and Preprocessing ## report sample features, i.e. total measurements > 0 and total LFQ intensities per sample #' ### Sample statistics cdLong = gather(countData, sample, expr, -protein_ids) p1 <- cdLong %>% filter(expr > 0) %>% ggplot(aes(sample)) + geom_bar() + coord_flip() + geom_hline(yintercept = length(unique(cdLong$protein_ids)), color = "coral1") + ggtitle("measurements > 0 per samples") p2 <- cdLong %>% group_by(sample) %>% summarize(total = sum(expr)) %>% ggplot(aes(sample, weight = total)) + geom_bar() + coord_flip() + ylab("total LFQ intensities") + ggtitle("total LFQ intensities per sample") grid.arrange(p1, p2, nrow = 1) # create abundance matrix and remove rows with missing values expMatrix = countData %>% column_to_rownames("protein_ids") %>% as.matrix expMatrix = expMatrix[complete.cases(expMatrix),] #' #'

#' #' ### Principal component analysis group_labels = data_frame(replicate = colnames(expMatrix)) %>% left_join(expDesign) %>% pull(condition) names(group_labels) = colnames(expMatrix) makePcaPlot(t(expMatrix), color_by = group_labels, title = "PCA of quantifiable proteins in all conditions") mydata.pca = prcomp(t(expMatrix), retx = TRUE, center = TRUE, scale. = FALSE) data.frame(var = mydata.pca$sdev^2) %>% mutate(prop_var = 1/sum(mydata.pca$sdev^2)*var, pc_num = 1:n()) %>% ggplot(aes(pc_num, prop_var)) + geom_col() + xlab("principal component number") + ylab("proportion of explained variance") + ggtitle("proportion of variance explained by the individual components") pcs = mydata.pca$x %>% as_df %>% rownames_to_column("sample") my_dens <- function(data, mapping, ...) { ggplot(data = data, mapping=mapping) + geom_density(..., alpha = 0.7, color = NA) } pcs %>% GGally::ggpairs(columns=2:6, mapping=ggplot2::aes(fill=group_labels, color=group_labels), upper="blank", legend=c(3,3), diag = list(continuous = my_dens)) + theme(legend.position = "bottom") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) #' #'

#' #' ### Spearman correlation correlation = cor(expMatrix, method = "spearman") col<- colorRampPalette(c("coral1", "white", "cadetblue"))(20) # col2<- colorRampPalette(c("coral1", "white", "cadetblue"))(20) corrplot(correlation, col = col, title = "Spearman correlation between conditions", mar=c(0,0,2,0), tl.col="black") ## TODO: include Jensen-Shannon distance # res = cummeRbund::JSdist(cummeRbund::makeprobs(expMatrix)) %>% # hclust() %>% # as.dendrogram() # par(mar = c(10, 4, 4, 4)) # plot(res, main = "Sample Clustering of Expressed Genes") #' #'

#' #' ### Euclidean distance #distMatrix %>% d3heatmap(xaxis_height=1, Colv = T, dendrogram="row") distMatrix = as.matrix(dist(t(expMatrix))) distMatrix %>% d3heatmap(xaxis_height = 1, color = col) #' #'

#' #----------------------------------------------------- #' ## Data normalization orderMatcheExpDesign = data_frame(replicate = colnames(expMatrix)) %>% mutate(col_index = row_number()) %>% right_join(expDesign, by = "replicate") %>% arrange(col_index) #' Build design matrix #' > A key strength of limma’s linear modelling approach, is the ability accommodate arbitrary experimental complexity. Simple designs, such as the one in this workflow, with cell type and batch, through to more complicated factorial designs and models with interaction terms can be handled relatively easily #' #' Make sure that non of the batch-factors is confounded with treatment (condition). See https://support.bioconductor.org/p/39385/ for a discussion #' References #' * https://f1000research.com/articles/5-1408/v1 # design <- orderMatcheExpDesign %$% model.matrix(~ 0 + condition + prep_day) design = orderMatcheExpDesign %$% model.matrix(formula(as.formula(paste("~0+", designFormula)))) rownames(design) <- orderMatcheExpDesign$replicate ## create a DGEList object: exp_study = DGEList(counts = expMatrix, group = orderMatcheExpDesign$condition) # par(mfrow=c(1,2)) ## 2panel plot for mean-var relationship before and after boom # Removing heteroscedasticity from count data ## transform count data to log2 CPM, estimate the mean-variance relationship and compute appropiate observational-level weights with voom: voomNorm <- voom(exp_study, design, plot = FALSE, save.plot = TRUE) # str(voomNorm) # exp_study$counts is equilvaent to voomNorm$E see https://www.bioconductor.org/help/workflows/RNAseq123/ # identical (names(voomNorm$voom.xy$x), names(voomNorm$voom.xy$y)) voom_before <- data.frame(protein_ids = names(voomNorm$voom.xy$x), x = unname(voomNorm$voom.xy$x), y = unname(voomNorm$voom.xy$y), line_x = voomNorm$voom.line$x, line_y = voomNorm$voom.line$y) voom_before %>% ggplot(aes(x, y)) + geom_point(size = 1) + geom_line(aes(line_x, line_y), color = "red") + ggtitle("voom: mean-variance trend") + xlab("log2(count + 0.5)") + ylab("sqrt(standard deviation)") if (is.null(protein_info_file)){ voom_before %<>% select(x, y, protein_ids) %>% arrange(x,y) # voom_before %>% DT::datatable() voom_before %>% table_browser() } else { voom_before %<>% left_join(protein_info, by = "protein_ids") %>% select(x, y, gene_name, protein_acc) %>% arrange(x,y) # voom_before %>% DT::datatable() voom_before %>% table_browser() } ## get log2 normalized expression values voomMat = voomNorm$E group_labels = data_frame(replicate = colnames(voomMat)) %>% left_join(expDesign) %>% pull(condition) names(group_labels) = colnames(voomMat) makePcaPlot(t(voomMat), color_by = group_labels, title = "Normalized PCA of quantifiable proteins in all conditions") #' Corrleate normalized data with raw expression inner_join(expr_matrix_to_df(expMatrix) , expr_matrix_to_df(voomNorm$E), suffix = c("_raw", "_voom"), by = c("gene_id", "replicate")) %>% sample_frac(0.1) %>% ggplot(aes(expression_raw, expression_voom)) + geom_point() + scale_x_log10() + ggtitle("voom vs raw") contr.matrix = makeContrasts(contrasts = with(contrasts, paste0("condition", condition_1, "-", "condition", condition_2)), levels = colnames(design)) #' ## Model Fitting & Moderated t-test #' Linear modelling in limma is carried out using the lmFit and contrasts.fit functions originally written for application to microarrays. The functions can be used for both microarray and RNA-seq data and fit a separate model to the expression values for each gene. Next, empirical Bayes moderation is carried out by borrowing information across all the genes to obtain more precise estimates of gene-wise variability (source: RNAseq123) vfit <- lmFit(voomNorm, design) vfit <- contrasts.fit(vfit, contrasts = contr.matrix) efit <- eBayes(vfit) #TODO: check for alternatives, e.g. linear-mixed models (https://bioconductor.org/packages/release/bioc/vignettes/variancePartition/inst/doc/dream.html) voom_after <- data.frame(protein_ids = names(efit$Amean), x = unname(efit$Amean), y = sqrt(efit$sigma)) # plotSA(efit, main = "Final model: Mean−variance trend") voom_after %>% ggplot(aes(x, y)) + geom_point(size = 1) + geom_hline(yintercept = sqrt(sqrt(efit$s2.prior)), color = "red") + ggtitle("final model: mean-variance trend") + xlab("average log-abundance") + ylab("sqrt(sigma)") if (is.null(protein_info_file)){ voom_after %<>% select(x, y, protein_ids) %>% arrange(x,y) # voom_after %>% DT::datatable() voom_after %>% table_browser() } else { voom_after %<>% left_join(protein_info, by = "protein_ids") %>% select(x, y, gene_name, protein_acc) %>% arrange(x,y) # voom_after %>% DT::datatable() voom_after %>% table_browser() } #' #'

#' #' ### Differential abundance results without lfc cutoff summary(decideTests(efit)) %>% as_df %>% kable #' #'

#' #' ### Differential abundance results with lfc cutoff #' Some studies require more than an adjusted p-value cut-off. For a stricter definition on significance, one may require log-fold-changes (log-FCs) to be above a minimum value. The treat method (McCarthy and Smyth 2009) can be used to calculate p-values from empirical Bayes moderated t-statistics with a minimum log-FC requirement. if (! is.null(qcutoff)) { echo("Using q-value cutoff of", qcutoff) }else { echo("Using p-value cutoff of", pcutoff) } tfit <- treat(vfit, lfc = lfc_cutoff) dt <- decideTests(tfit) summary(dt) %>% as_df %>% kable #' plot MA per contrast numContrasts = length(colnames(tfit)) par(mfrow=c(1,1)) # par(mfrow = c(4, numContrasts)) ## 2panel plot for mean-var relationship before and after boom # colnames(tfit) %>% iwalk(~ plotMD(tfit, column=.y, status=dt[,.y], main=.x, xlim=c(-8,13))) colnames(tfit) %>% iwalk(~ plotMD(tfit, column = .y, status = dt[, .y], main = .x, col = c("coral1", "cadetblue"))) # load_pack("Glimma") # glMDPlot(tfit) # -> just a clickable ma plot with table # glMDPlot(tfit, coef=1, status=dt, main=colnames(tfit)[1], side.main="ENTREZID", counts=x$counts, groups=group, launch=FALSE) # diffexData = colnames(contr.matrix) %>% imap(function(contrast, cIndex){ print(cIndex)}) # basal.vs.lp <- topTreat(tfit, coef=2, n=Inf) deResults = colnames(contr.matrix) %>% imap(function(contrast, cIndex){ #DEBUG contrast="conditione12-conditione16"; cIndex=1 topTreat(tfit, coef = cIndex, n = Inf) %>% as.data.frame() %>% rownames_to_column("protein_ids") %>% mutate(contrast = str_replace_all(contrast, "condition", "")) %>% separate(contrast, c("condition_1", "condition_2"), "-") %>% # push_left("contrast") %>% pretty_columns }) %>% bind_rows() ## calculate sample means # https://support.bioconductor.org/p/62541/ sampleMeans = voomNorm$E %>% as_df %>% rownames_to_column("protein_ids") %>% tbl_df %>% gather(replicate, norm_count, -protein_ids) %>% left_join(expDesign) %>% group_by(protein_ids, condition) %>% summarize(mean_expr = mean(norm_count, na.rm = T)) %>% ungroup() %>% # spread(condition, mean_pep) inner_join(., ., by = "protein_ids", suffix = c("_1", "_2")) #%>% # filter(ac(condition_1)% # add diffex status # inner_join(deResults) deResults %<>% left_join(sampleMeans) #logFcs = contrasts %>% rowwise %>% do(with(., { # # condition_1="Q80"; condition_2="Q20" # sampleMeans %>% select_("gene_id", condition_1, condition_2) %>% # set_names("gene_id", "condition_1_mean", "condition_2_mean") %>% group_by(gene_id) %>% # transmute(logfc=log2(condition_1_mean/condition_2_mean), condition_1=condition_1, condition_2=condition_2) #})) # report hit criterion #+ results='asis' if (! is.null(qcutoff)) { deResults %<>% transform(is_hit = adj_p_val <= qcutoff) }else { deResults %<>% transform(is_hit = p_value <= pcutoff) } #+ deResults %<>% mutate(c1_overex = logfc > 0) deResults %>% count(condition_1, condition_2, is_hit) %>% filter(is_hit) ## Annotate results deAnnot = deResults %>% left_join(protein_info, by = "protein_ids") ######################################################################################################################## #' ## Results & Discussion ## DEBUG-START #if(F){ #deResults %>% table_browser # #paNorm %>% filter(gene_id=="ENSMUSG00000015970") %>% ggplot(aes(sample, expr)) + geom_point() # ## debug p adjustment #deResults %>% ggplot(aes(q_value, p_value)) + #geom_point(alpha=0.3) + #facet_grid(condition_1 ~ condition_2) #} ## DEBUG-END #' Are log2 distributions symmetric around 0 (because of globally we'd expect no change in abundance) deResults %>% ggplot(aes(logfc)) + geom_histogram() + facet_grid(condition_1 ~ condition_2) + geom_vline(xintercept = 0, color = "blue") + # xlim(-2,2) + ggtitle("condition_1 over condition_2 logFC ") ## Extract hits from deseq results degs = deAnnot %>% filter(is_hit) degs %>% transmute(protein_ids, contrast = paste(condition_1, "vs", condition_2)) %>% write_tsv(add_prefix("daps_by_contrast.txt")) ggplot(degs, aes(paste(condition_1, "vs", condition_2), fill = (c1_overex))) + geom_bar() + xlab(NULL) + ylab("No. of differentially abundant proteins") + ggtitle("Results by contrast") + coord_flip() + scale_fill_manual(values=c("coral1", "cadetblue")) #with(degs, as.data.frame(table(condition_1, condition_2, c1_overex))) %>% filter(Freq >0) %>% kable() maxX = quantile(deResults$logfc, c(0.005, 0.99), na.rm = TRUE) %>% abs %>% max maxY = quantile(log10(deResults$p_value), c(0.005, 0.99), na.rm = TRUE) %>% abs %>% max hitCounts = filter(deResults, is_hit) %>% count(condition_1, condition_2, c1_overex) %>% rename(hits = n) %>% merge(data.frame(c1_overex = c(F, T), x_pos = c(- maxX * 0.9, maxX * 0.9))) #+ fig.width=16, fig.height=14 deResults %>% ggplot(aes(logfc, - log10(p_value), color = is_hit)) + geom_jitter(alpha = 0.2, position = position_jitter(height = 0.2)) + # theme_bw() + xlim(- 3, 3) + scale_color_manual(values = c("TRUE" = "coral1", "FALSE" = "black")) + # ggtitle("Insm1/Ctrl") + ## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend guides(colour = guide_legend(override.aes = list(alpha = 1))) + ## tweak axis labels xlab(expression(log[2]("condition_1/condition_2"))) + ylab(expression(- log[10]("p value"))) + xlim(- maxX, maxX) + ylim(0, maxY) + # ylim(0,50) + ## add hit couts geom_text(aes(label = hits, x = x_pos), y = maxY * 0.9, color = "coral1", size = 10, data = hitCounts) + facet_grid(condition_1 ~ condition_2) # #' Redo MA plots but now including hit overlays # meansWide = voomNorm$E %>% # as_df %>% # rownames_to_column("gene_id") %>% # tbl_df %>% # gather(replicate, norm_count, - gene_id) %>% # left_join(expDesign) %>% # group_by(gene_id, condition) %>% # summarize(mean_expr = mean(norm_count, na.rm = T)) %>% # spread(condition, mean_expr) # # maPlots = deResults %>% # group_by(condition_1, condition_2) %>% # do(gg = { # #browser() # maData <- . # # maData <- deResults %>% group_by(condition_1, condition_2) %>% first_group() # # calc_mean_product = lazyeval::interp(~ a * b, a = as.name(maData$condition_1[1]), b = as.name(maData$condition_2[1])) # meanProducts = meansWide %>% transmute_(.dots = setNames(list(calc_mean_product), "mean_prod")) # # maData %>% # left_join(meanProducts) %>% # # ggplot(aes(0.5*log2(norm_count_1*norm_count_2), logfc, color=is_hit)) + # ggplot(aes(0.5 * log2(mean_prod), logfc, color = is_hit)) + # geom_point(alpha = 0.3) + # geom_hline(yintercept = 0, color = "red") + # ggtitle(with(maData[1,], paste(condition_1, "vs", condition_2))) # }) %$% gg # #+ fig.height=6*ceiling(length(maPlots)/2) # load_pack(grid) # # multiplot(plotlist = maPlots, cols = min(2, length(maPlots))) ######################################################################################################################## #' ## Export results write_tsv(deAnnot, path = add_prefix("da_results.txt")) degs %>% write_tsv(add_prefix("diff_proteins.txt")) degs %>% transmute(protein_ids, contrast = paste(condition_1, "vs", condition_2)) %>% write_tsv(add_prefix("daps_by_contrast.txt")) degs %>% transmute(protein_ids, contrast = paste(condition_1, if_else(c1_overex, ">", "<"), condition_2)) %>% write_tsv(add_prefix("daps_by_contrast_directed.txt")) #' Export voom normalization scores per replicate voomNorm$E %>% as_df %>% rownames_to_column("protein_ids") %>% left_join(protein_info) %>% # push_left(c("gene_name", "gene_description")) %>% write_tsv(add_prefix("norm_abundance_by_replicate.txt")) # Also average voom normalized expression scores per condition and export voomNorm$E %>% as_df %>% rownames_to_column("protein_ids") %>% gather(replicate, norm_expr, - protein_ids) %>% inner_join(expDesign, by = "replicate") %>% group_by(condition, protein_ids) %>% summarize(mean_norm_expr = mean(norm_expr)) %>% left_join(protein_info) %>% # push_left(c("gene_name", "gene_description")) %>% write_tsv(add_prefix("norm_abundance_by_condition.txt")) left_join(voom_before %>% rename(x_voom = x, y_voom = y), voom_after %>% rename(x_final = x, y_final = y)) %>% push_left(c("x_final", "y_final")) %>% write_tsv("model_prot_positions.txt") #' | File | Description | #' |------|------| #' | [diffex_genes.txt](diffex_genes.txt) | list of all differentially expressed genes from the DESeq analysis - That's the file you are most likely looking for! | #' | [de_results.txt](de_results.txt) | list of all genes from the DESeq analysis | #' | [norm_exp_by_condition.txt](norm_exp_by_condition.txt) | tpm values of the different conditions | #' | [norm_exp_by_replicate.txt](norm_exp_by_replicate.txt) | tpm values of individual replicates | #' | [geneInfo.txt](geneInfo.txt) | general gene information | #' writeLines(capture.output(devtools::session_info()), ".sessionInfo.txt") session::save.session(".ms_limma.R.dat"); ## session::restore.session(".dge_limma.R.dat")