#!/usr/bin/env Rscript #' # Differential Expression Analysis #' #' Contents #' #' - [Data Preparation](#data-preparation) #' - [Quality Control](#quality-control) #' - [Size Factors](#size-factors) #' - [Global Dispersion Model](#global-dispersion-model) #' - [PCA and Clustering](#pca-and-clustering) #' - [Differential Expression Test](#differential-expression-test) #' - [MA and Volcano plots](#ma-and-volcano-plots) #' - [Count Outlier Analysis](#count-outlier-analysis) #' - [Hits Summary](#hits-summary) #' - [Exported Data](#exported-data) #' # to rebuild toc do knitr::spin("featcounts_deseq_mf.R", FALSE) #' Created by: `r system("whoami", intern=T)` #' #' Created at: `r format(Sys.Date(), format="%B %d %Y")` #+ include=FALSE suppressMessages(require(docopt)) doc = ' Perform a differential gene expression analysis using deseq2 Usage: featcounts_deseq_mf.R [options] Options: --contrasts= Table with sample pairs for which dge analysis should be performed --qcutoff Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01] --pcutoff Override q-value filter and filter by p-value instead --min_count Minimal expression in any of the sample to be included in the final result list [default: 10] --out Name to prefix all generated result files [default: ] --design Design fomula for DeSeq with contrast attribute at the end [default: condition] --lfc Just report genes with abs(lfc) > lfc_cutoff as hits [default: 1.0] --ensembl_db Ensebmbl db to be used. If not specified inferred from data. TODO implement NONE here!! --gene_info Gene information file downloaded from the ensembl website if bioMart version is not present --betaprior Apply betaprior when running DESeq2 [default: TRUE] --sub_data Subset de_results by contrasts and store the data in a subfolder [default: FALSE] ' #commandArgs <- function(x) c("--design", "'batch+condition'", "--contrasts", "../contrasts.txt star_counts_matrix_reduced_sampleset.txt", "basic_design_reduced_sampleset.txt") #commandArgs <- function(x) c("--design", "'condition'", "--contrasts", "../contrasts.txt", "--sub_data", "TRUE", "../alignments/star_counts_matrix.txt", "../basic_design.txt") opts = docopt(doc, commandArgs(TRUE)) ## load some packages first because of name-space order library(knitr) #source("https://bioconductor.org/biocLite.R") #biocLite("DESeq2") library(DESeq2) #load_pack(readr) devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.45/R/core_commons.R") devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.45/R/ggplot_commons.R") ## also load common helper for expression data analysis # source(interp_from_env("${NGS_TOOLS}/dge_workflow/diffex_commons.R")) devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v6/dge_workflow/diffex_commons.R") ## process input arguments count_matrix_file = opts$count_matrix design_matrix_file = opts$design_matrix contrasts_file = opts$contrasts use_betaPrior = as.logical(opts$betaprior) sub_data = as.logical(opts$sub_data) gene_info_file = opts$gene_info assert(is.null(gene_info_file) || file.exists(gene_info_file), "invalid gene_info_file") resultsBase = if (str_length(opts$out) > 0) paste0(opts$out, ".") else "" pcutoff = if (is.null(opts$pcutoff))NULL else as.numeric(opts$pcutoff) qcutoff = if (is.numeric(pcutoff))NULL else as.numeric(opts$qcutoff) if (is.numeric(pcutoff))opts$qcutoff = NULL lfc_cutoff = if (is.null(opts$lfc))0 else as.numeric(opts$lfc) ## extract the sample for the design designFormula = opts$design ## consider last element of design formula as sample attribute contrastAttribute = str_split(designFormula, "[+]") %>% unlist() %>% tail(1) # assert(designFormula == "condition" || ! is.null(design_matrix_file), "custom designs are not supported without a design matrix") assert(str_detect(designFormula, ".*condition$")) #' Run configuration was vec_as_df(unlist(opts)) %>% filter(! str_detect(name, "^[<-]")) %>% kable() vec_as_df(unlist(opts)) %>% filter(! str_detect(name, "^[<-]")) %>% write_tsv(".run_configuration.txt") ######################################################################################################################## #' ## Data Preparation #' The detection of differentially expressed genes is performed using the R package DEseq2 (http://bioconductor.org/packages/release/bioc/html/DESeq2.html), which requires a count matrix as input. #' The count data are presented as a table which reports, for each sample, the number of sequence fragments that have been assigned to each gene. #' An important analysis question is the quantification and statistical inference of systematic changes between conditions, as compared to within-condition variability. #' Working Directory: `r getwd()` countData = read_tsv(count_matrix_file) ## save a backup into the current working directory as a reference countData %>% write_tsv(paste0(resultsBase, "input_counts.txt")) #' Structure of input matrix was glimpse(countData, width = 60) countMatrix = countData %>% column2rownames("ensembl_gene_id") %>% as.matrix() # Expression Filtering genesBefore = nrow(countMatrix) countMatrix %<>% filterByExpression(as.integer(opts$min_count)) genesAfter = nrow(countMatrix) #' Counts were filtered to only retain genes which had more that `r opts$min_count` alignments in at least one replicate. This reduced the number of genes from `r genesBefore` to `r genesAfter`. ## todo this is required now , so why all the checking if (! is.null(design_matrix_file)) { replicates2samples = read_tsv(design_matrix_file) #%>% select(replicate, sample) }else { warning("DEPRECATED extracting conditions from replicates is bad practise and a design file should be provided instead") get_sample_from_replicate = function(repName) str_match(repName, "(.*)_([R]|rep)?[0-9]{1,2}$")[, 2] replicates2samples = data_frame(replicate = colnames(countMatrix)) %>% mutate(condition = get_sample_from_replicate(replicate)) # %>% distinct_all() } write_tsv(replicates2samples, "basic_design.txt") # Define or load a contrasts matrix if (! is.null(contrasts_file)) { contrasts = read_tsv(contrasts_file) }else { assert(! is.null(replicates2samples), "Can not auto-create contrasts without design matrix") ## cant work yet because sample column with have random name contrasts = distinct_all(replicates2samples, condition) %>% select(condition) %>% merge(., ., suffixes = c("_1", "_2"), by = NULL) %>% filter(ac(condition_1) > ac(condition_2)) %>% # filter(ac(condition_1)!=ac(condition_2)) %>% fac2char() write_tsv(contrasts, paste(resultsBase, "contrasts.txt")) } #' The used design matrix is datatable(replicates2samples) #' The used design formula is. designFormula #' For more information about designs see section 1.6 "multi-factor designs" in [deseq-manual](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) #' The contrasts of interest are contrasts %>% kable() ## gene specific factors #normalizationFactors(dds) ## sizeFactor R help:sigEnrResults #dds = makeExampleDESeqDataSet() #dds = estimateSizeFactors( dds ) #sizeFactors(dds) ## see "3.11 Sample-/gene-dependent normalization factors" in the DEseq2 manual for details ## old non-mf approach #exDesign = data.frame(condition=colnames(countMatrix) %>% get_sample_from_replicate) ## new mf-approach exDesign = data_frame(replicate = colnames(countMatrix)) %>% mutate(col_index = row_number()) %>% right_join(replicates2samples, by = "replicate") %>% arrange(col_index) #%>% transmute(condition=condition_2ndwt) %>% fac2char assert(! any(is.na(exDesign$col_index)), "assay design file is not compatible to count matrix column names") exDesign %>% ggplot(aes(condition)) + geom_bar() + ylab("# samples") + coord_flip() + ggtitle("samples per condition") ## infer the sample to be used from the formula #designFormula = "condition_2ndwt + batch" #exDesign %<>% rename(sample, condition_2ndwt) #exDesign = replicates2samples %>% arrange(colnames(countMatrix)) # valiadate that contrasts model is compatible with data assert(setequal(names(contrasts), c("condition_1", "condition_2")), "contrasts matrix has incorrect column headers. Expected: condition_1, condition_2") designSamples = as_df(exDesign)[, contrastAttribute] %>% unique %>% sort contrastSamples = contrasts %>% gather %$% value %>% unique %>% sort if (! all(contrastSamples %in% designSamples)) { warning("design : ", paste(designSamples, collapse = ", ")) warning("contrasts: ", paste(contrastSamples, collapse = ", ")) stop("experiment design is not consistent with contrasts") } #stopifnot(all((contrasts %>% gather %$% value %>% unique) %in% exDesign$condition)) #http://www.cookbook-r.com/Formulas/Creating_a_formula_from_a_string/ dds = DESeqDataSetFromMatrix(countMatrix, exDesign, formula(as.formula(paste("~", designFormula)))) #dds = estimateSizeFactors(dds) #dds = estimateDispersions(dds) # dds = DESeq(dds, parallel = T) dds = DESeq(dds, parallel = T, betaPrior = use_betaPrior) ######################################################################################################################## #' ## Quality Control #' ### Size Factors #' For details about size factor normalziation and calculation see https://www.biostars.org/p/79978/ #' #' From the DESeq docs about how size factors are used: The sizeFactors vector assigns to each column of the count matrix a value, the size factor, such that count values in the columns can be brought to a common scale by dividing by the corresponding size factor. Counts are divied by size factors. sizeFactors(dds) %>% # set_names(colnames(countMatrix)) %>% vec_as_df("replicate", "size_factor") %>% ggplot(aes(replicate, size_factor)) + geom_bar(stat = "identity") + ggtitle("Size Factors estimated by DESeq") + coord_flip() # From DESeq manual: The regularized log transformation is preferable to the variance stabilizing transformation if the size factors vary widely. #' ### Global Dispersion Model #' Plot of the per-gene dispersion estimates together with the fitted mean-dispersion relationship. #' The dispersion plot shows how the estimates are shrunk from the gene wise values (black dots) toward the fitted estimates, with the final values used in testing being the blue dots. #' The dispersion can be understood as the square of the coefficient of biological variation. So, if a gene's expression typically differs from replicate to replicate sample by 20%, this gene's dispersion is: .20^2 = .04. ## The function estimateDispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value. #' The estimation of dispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value. plotDispEsts(dds, main = "Dispersion Model") ######################################################################################################################## #' ## PCA and Clustering #' In order to assess overall similarity between samples two common statistical methods are used - Principal component analysis (PCA) and clustering. #' This should provide answers to the questions: Which samples are similar to each other, which are different? Does this fit to the expectation from the experiment's design? #' Using a regularized log transformation of the raw counts provides the advantage that it stabilizes the variance across the mean. #' Principal component analysis (PCA) is a technique used to emphasize variation and bring out strong patterns in a dataset. #' Principal components are the underlying structure in the data. They are the directions where there is the most variance, the directions where the data is most spread out. The data points/samples are projected onto the 2D plane such that they spread out in the two directions that explain most of the differences. The x-axis is the direction that separates the data points the most and the y-axis is a direction that separates the data the second most. # Regularized log transformation for clustering/heatmaps, etc rld = rlogTransformation(dds) # vst = vst(dds) ## vst is supposed to be much 1k x faster # extract rlogTransformation normalized count data norm_counts <- assay(rld) load_pack(limma) load_pack(stringi) load_pack(htmltools) load_pack(plotly) # install.packages("htmlwidgets") #for (i in {designFormula %>% str_split("[+]") %>% unlist}) { # condition = pull(exDesign, i) # names(condition) = exDesign$replicate # # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print)) # makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") %>% ggplotly() %>% print #} effects = designFormula %>% str_split("[+]") %>% unlist htmltools::tagList(lapply(effects, function(batchFactor) { # batchFactor=effects[1] condition = pull(exDesign, batchFactor) names(condition) = exDesign$replicate # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print)) { makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") + guides(color = guide_legend(title = batchFactor)) } %>% ggplotly() })) # extract batch effect variables from the design formula and the corresponding data from the exDesign data.frame adjvar <- designFormula %>% str_split("[+]") %>% unlist %>% stri_subset_regex("condition", negate = T) adjvar_data <- data.frame(exDesign[, which(names(exDesign) %in% head(adjvar))]) treatDesign = exDesign %$% model.matrix(~ condition) if (length(adjvar_data) == 1) { corr_data <- limma::removeBatchEffect(norm_counts, batch = adjvar_data[, 1], design = treatDesign) }else if (length(adjvar_data) == 2) { corr_data <- removeBatchEffect(norm_counts, batch = adjvar_data[, 1], batch2 = adjvar_data[, 2], design = treatDesign) }else if (length(adjvar_data) > 2) { print("Please note, the number of adjustment variables exceeds the maximal number of removable batch effects.") }else { print("The design formula does not contain an adjustment variable for batch effect correction") } # for (i in colnames(adjvar_data)) { #for (i in {designFormula %>% str_split("[+]") %>% unlist}) { # condition = pull(exDesign, i) # names(condition) = exDesign$replicate # # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print)) # makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print #} if (exists("corr_data")) { htmltools::tagList(lapply(effects, function(batchFactor) { # batchFactor=effects[1] # batchFactor="condition" condition = pull(exDesign, batchFactor) names(condition) = exDesign$replicate # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print)) { makePcaPlot(t(corr_data), color_by = condition, scale = F, title = "DESeq normalized count data WITH batch correction") + guides(color = guide_legend(title = batchFactor)) } %>% ggplotly() })) }else { print("No batch correction was done, as the design formula does not contain an adjustment variable for batch effect correction") } #' The Euclidean distance between samples are calculated after performing the regularized log transformation. #' Using the calculated distance matrix, the samples are projected on a two-dimensional graph such that the distance between samples approximately corresponds to the biological coefficient of variation between those samples. #' More information can be found here: https://en.wikipedia.org/wiki/Principal_component_analysis #distsRL = dist(t(assay(rld))) #hc = hclust(distsRL) #distMatrix = as.matrix(distsRL) distMatrix = as.matrix(dist(t(assay(rld)))) #rownames(distMatrix) = colnames(distMatrix) = with(exDesign(dds), paste(condition, sep=" : ")) countData %>% distinct(ensembl_gene_id) labelcntData = countData %>% distinct_all(ensembl_gene_id) %>% column2rownames("ensembl_gene_id") %>% data.matrix() %>% round() %>% data.frame() # load_pack(gplots) # heatmap.2(distMatrix, labRow = colnames(labelcntData), labCol = colnames(labelcntData), # symm = TRUE, trace = "none", #key=F, col=colorpanel(100, "black", "white"), # margin = c(8, 8), main = "Sample Distance Matrix") rownames(distMatrix) = colnames(labelcntData) colnames(distMatrix) = colnames(labelcntData) # load_pack(d3heatmap) load_pack(heatmaply) ## todo hide column dendrogram but keep cluster-order #distMatrix %>% d3heatmap(xaxis_height=1, Colv = T, dendrogram="row") print("DESeq normalized count data WITHOUT batch correction") # distMatrix %>% d3heatmap(xaxis_height = 1) distMatrix %>% heatmaply(colors=c("red","lightblue"),showticklabels=c(F,T)) if (exists("corr_data")) { print("DESeq normalized count data WITH batch correction") corr_distMatrix = as.matrix(dist(t(corr_data))) # corr_distMatrix %>% d3heatmap(xaxis_height = 1) corr_distMatrix %>% heatmaply(colors=c("red","lightblue"),showticklabels=c(F,T)) }else { # print("No batch correction was done, as the design formula does not contain an adjustment variable for batch effect correction") } ## see http://www.bioconductor.org/help/workflows/rnaseqGene/ ## DEBUG-START # if (F) { # assay(rld) %>% head load_pack("genefilter") load_pack("pheatmap") mostSIg <- head(order(rowVars(assay(rld)), decreasing = TRUE), 20) mat <- assay(rld)[mostSIg,] mat <- mat - rowMeans(mat) # df <- as.data.frame(colData(rld)[,c("cell","dex")]) #pheatmap(mat, annotation_col=df) # pheatmap(mat) pheatmap(mat, annotation_col = column2rownames(exDesign, "replicate") %>% select(- col_index)) # } ## DEBUG-END ######################################################################################################################## #' ## Differential Expression Test #' #' DESeq is an R package to analyse count data from high-throughput sequencing assays such as RNA-Seq and test for differential expression. The latest version, DESeq2, is used. #' DESeq uses a model based on the negative binomial distribution and offers the following features: #' Count data is discrete and skewed and is hence not well approximated by a normal distribution. Thus, a test based on the negative binomial distribution, which can reflect these properties, has much higher power to detect differential expression. #' Tests for differential expression between experimental conditions should take into account both technical and biological variability. Recently, several authors have claimed that the Poisson distribution can be used for this purpose. However, tests based on the Poisson assumption (this includes the binomial test and the chi-squared test) ignore the biological sampling variance, leading to incorrectly optimistic p values. The negative binomial distribution is a generalisation of the Poisson model that allows to model biological variance correctly. #' DESeq estimate the variance in a local fashion, using different coefficients of variation for different expression strengths. This removes potential selection biases in the hit list of differentially expressed genes, and gives a more balanced and accurate result. #' #' See deseq reference [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details. #' #' To understand fold-change shrinking and estimation check out #' [Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2](http://genomebiology.com/2014/15/12/550/abstract) # Run Deseq Test #dds = DESeq(dds, fitType='local', betaPrior=FALSE) #dds = DESeq(dds, fitType='local') #dds = DESeq(dds) #res = results(dds) #resultsNames(dds) #' Model Overview: summary(results(dds)) ## extract all de-sets according to our contrasts model # install_package("plyr") # load_pack(IHW) --> todo reenable once installation on macos is fixed (see https://support.bioconductor.org/p/92731/) # low count filtering https://support.bioconductor.org/p/65256/ # independent filtering occurs within results() to save you from multiple test correction on genes with no power # (see ?results and the vignette section about independent filtering, or the paper). T #deResults = alply(contrasts, 1, splat(function(condition_1, condition_2){ deResults = plyr::alply(contrasts, 1, plyr::splat(function(condition_1, condition_2){ #DEBUG condition_1=as_df(contrasts)[1,1]; condition_2=as_df(contrasts)[1,2] # results(dds, contrast=c(contrastAttribute, condition_1, condition_2)) %>% # results(dds, contrast = c(contrastAttribute, condition_1, condition_2), lfcThreshold = lfc_cutoff, filterFun = ihw) %>% results(dds, contrast = c(contrastAttribute, condition_1, condition_2), lfcThreshold = lfc_cutoff) %>% as_tibble() %>% tibble::rownames_to_column("ensembl_gene_id") %>% ## see http://rpackages.ianhowson.com/bioc/DESeq2/man/results.html when using contrasts argument rename(c1_over_c2_logfc = log2FoldChange) %>% mutate(condition_1 = condition_1, condition_2 = condition_2) #%>% head %>% print_head })) %>% bind_rows # deResults = adply(contrasts, 1, splat(function(condition_1, condition_2){ # echo('samples', condition_1, condition_2) # return(data.frame()) # })) deResults %>% ggplot(aes(c1_over_c2_logfc)) + geom_histogram(binwidth = 0.1) + facet_grid(condition_1 ~ condition_2) + geom_vline(xintercept = 0, color = "blue") + xlim(- 3, 3) + ggtitle("condition_1 over condition_2 logFC ") #deResults %>% ggplot(aes(pvalue)) + # geom_histogram() + # xlim(0,1) + # facet_grid(condition_1 ~ condition_2) + geom_vline(xintercept=0.01, slope=1, color="blue") + # ggtitle("p-value distributions") #+ scale_x_log10() # #deResults %>% ggplot(aes(padj)) + # geom_histogram() + ## xlim(0,1) + # facet_grid(condition_1 ~ condition_2) + geom_vline(xintercept=0.01, slope=1, color="blue") + # ggtitle("Adjusted p-value distributions") #+ scale_x_log10() # report hit criterion #+ results='asis' if (! is.null(qcutoff)) { echo("Using q-value cutoff of", qcutoff) deResults %<>% transform(is_hit = padj <= qcutoff) }else { echo("Using p-value cutoff of", pcutoff) deResults %<>% transform(is_hit = pvalue <= pcutoff) } #deResults %<>% mutate(is_hit=pvalue<0.05) # filter(deResults, ensembl_gene_id=="ENSMUSG00000034957") deResults %<>% mutate(c1_overex = c1_over_c2_logfc > 0) normCounts = counts(dds, normalized = TRUE) %>% # set_names(exDesign(dds)$condition) %>% rownames2column("ensembl_gene_id") set_names(colnames(countMatrix)) %>% rownames_to_column("ensembl_gene_id") %>% tbl_df normCounts %>% write_tsv(paste0(resultsBase, "sizefac_normalized_counts_by_replicate.txt")) ## .. should be same as input #filter(countData, ensembl_gene_id=="ENSDARG00000000001") ######################################################################################################################## #' ### MA and Volcano plots #' MA-plot: The log2 fold change for a particular comparison is plotted on the y-axis and the average of the counts normalized by log2 is shown on the x-axis ("M" for minus, because a log ratio is equal to log minus log, and "A" for average). Each gene is represented with a dot. Genes with an adjusted p value below a certain threshold are shown in cyan (True). #' This plot demonstrates that only genes with a large average normalized count contain sufficient information to yield a significant call. # deseq approach # plotMA(deResults, main="DESeq2", ylim=c(-2,2)) # Calculation of the base means per condition #https://stat.ethz.ch/R-manual/R-devel/library/base/html/attr.html meanNormCounts = counts(dds, normalized = TRUE) %>% as_df %>% rownames_to_column("ensembl_gene_id") %>% tbl_df %>% gather(replicate, norm_count, - ensembl_gene_id) %>% inner_join(exDesign) %>% group_by(ensembl_gene_id, condition) %>% summarize(mean_norm_count = mean(norm_count)) ## add base means to diff??ex summary #numResultsBeforeMerge = nrow(deResults) deResults = meanNormCounts %>% # gather(condition, norm_count, - ensembl_gene_id) %>% inner_join(., ., by = "ensembl_gene_id", suffix = c("_1", "_2")) %>% # filter(ac(condition_1)% # add diffex status inner_join(deResults) #+ fig.width=16, fig.height=14 deResults %>% ggplot(aes(0.5 * log2(mean_norm_count_1 * mean_norm_count_2), log2(mean_norm_count_2 / mean_norm_count_1), color = pvalue < 0.05)) + geom_point(alpha = 0.1) + geom_hline(yintercept = 0, color = "red") + facet_grid(condition_1 ~ condition_2) #deResults %$% pvalue %>% log10() %>% quantile(0.05, na.rm=T) #' A volcano plot displays unstandardized signal (e.g. log-fold-change) against noise-adjusted/standardized signal (e.g. t-statistic or -log(10)(p-value) from the t-test). #' This scatter plot is used to quickly identify changes in large datasets composed of replicate data. #' Here we have log2-fold-change on the X-axis and -log10 of the p-value on the y-axis. #' This results in datapoints with low p-values (highly significant) appearing toward the top of the plot. For the x-axis the log of the fold-change is used so that changes in both directions (left and right) appear equidistant from the center. #' Therefore interesting points are those, which are found toward the top of the plot that are far to either the left- or the right-hand side. These represent values that display large magnitude fold changes (hence being left- or right- of center) as well as high statistical significance (hence being toward the top). #' see: https://en.wikipedia.org/wiki/Volcano_plot_%28statistics%29) maxX = quantile(deResults$c1_over_c2_logfc, c(0.005, 0.99), na.rm = TRUE) %>% abs %>% max ##Volcano plots hitCounts = filter(deResults, is_hit) %>% count(condition_1, condition_2, c1_overex) %>% rename(hits = n) %>% ## complement hit counts with directed contrasts without any diffex genes right_join(merge(mutate(contrasts), data_frame(c1_overex = c(T, F)), by = NULL)) %>% mutate(hits = if_else(is.na(hits), as.integer(0), hits)) %>% merge(data.frame(c1_overex = c(T, F), x_pos = c(maxX - maxX * 0.1, - maxX + maxX * 0.1))) hitCounts %<>% mutate(y_pos = quantile(- log10(deResults$pvalue), 0.987, na.rm = TRUE)) #+ fig.width=16, fig.height=14 ## todo remove; part of commons v1.41 trim_outliers <- function(values, probs=c(0.05, 0.95)){ values = deResults$pvalue stopifnot(length(probs) == 2) quantiles = quantile(values, probs, na.rm = TRUE) pmax(quantiles[1], pmin(quantiles[2], values)) } deResults %>% mutate(p_trimmed = trim_outliers(pvalue, c(0.01, 1))) %>% filter(! is.na(is_hit)) %>% ggplot(aes(c1_over_c2_logfc, - log10(p_trimmed), color = is_hit)) + # geom_jitter(alpha = 0.1, position = position_jitter(height = 0.2)) + geom_point(alpha = 0.1) + # theme_bw() + xlim(- maxX, maxX) + scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) + # ggtitle("Insm1/Ctrl") + ## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend guides(colour = guide_legend(override.aes = list(alpha = 1))) + ## tweak axis labels xlab(expression(log[2]("x/y"))) + ylab(expression(- log[10]("p value"))) + # ylim(0,50) + ## add hit couts geom_text(aes(label = hits, x = x_pos, y = y_pos), color = "red", size = 8, data = hitCounts) + facet_grid(condition_1 ~ condition_2) ######################################################################################################################## # Count Table Exports ## optionally install bioconducor package install_package("biomaRt") # Load transcriptome annotations needed for results annotation if (is.null(gene_info_file)) { ensembl_dataset = if (! is.null(opts$ensembl_db)) paste0(opts$ensembl_db, "_gene_ensembl") else guess_mart(countData$ensembl_gene_id) geneInfo = quote({ ## mart = biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))?? # mart = biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ensembl")) ## todo fix this https://support.bioconductor.org/p/74322/ # mart = biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org")) # mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl", host = "dec2016.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE) mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = ensembl_dataset, host = "aug2017.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE) c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>% biomaRt::getBM(mart = mart) %>% tbl_df }) %>% cache_it("geneInfo") } else { geneInfo = read_tsv(gene_info_file) colnames(geneInfo)[1] = "ensembl_gene_id" } geneLengths = transmute(geneInfo, ensembl_gene_id, gene_length = end_position - start_position) geneDescs = transmute(geneInfo, ensembl_gene_id, gene_name = external_gene_name, gene_description = description) ## save as reference for downstream analysis write_tsv(geneInfo, path = paste0(resultsBase, "geneInfo.txt")) # Export counts per replicate as double-normalized FPKMs=(read_count * 10^9)/(gene_length * library_size) fpkms = countMatrix %>% as.data.frame %>% rownames_to_column("ensembl_gene_id") %>% gather(replicate, num_tags, - ensembl_gene_id) %>% tbl_df %>% left_join(geneLengths, by = "ensembl_gene_id") %>% ## calculate normalized counts group_by(replicate) %>% mutate(lib_size = sum(num_tags), fpkm = ((1E9 / lib_size) * (num_tags / gene_length)) %>% round(digits = 3)) %>% ungroup() #fpkms %>% ggplot(aes(fpkm)) + geom_histogram() + scale_x_log10(labels=comma) fpkms %>% transmute(ensembl_gene_id, replicate, fpkm) %>% spread(replicate, fpkm) %>% left_join(geneDescs) %>% push_left(c("gene_name", "gene_description")) %>% write_tsv(paste0(resultsBase, "fpkms_by_replicate.txt")) # Aggregate FPKMs by condition and export into tabele fpkms %>% inner_join(exDesign, by = "replicate") %>% # group_by_(.dots = c(contrastAttribute, "ensembl_gene_id")) %>% # group_by(!!contrastAttribute, ensembl_gene_id) %>% group_by(condition, ensembl_gene_id) %>% summarize(mean_fpkm = mean(fpkm)) %>% ungroup() %>% spread(condition, mean_fpkm) %>% left_join(geneDescs) %>% push_left(c("gene_name", "gene_description")) %>% write_tsv(paste0(resultsBase, "fpkms_by_condition.txt")) # [FPKMs as Matrix](`r paste0(resultsBase, "fpkms_by_condition.txt")`) # Export counts per replicate as TPMs (transcript per million) # calculation of tpms and generation of a wide table including gene description and the external gene name tpms = countMatrix %>% as.data.frame %>% rownames_to_column("ensembl_gene_id") %>% gather(replicate, num_tags, - ensembl_gene_id) %>% tbl_df %>% left_join(geneLengths) %>% mutate(length_norm_count = num_tags / gene_length) %>% group_by(replicate) %>% mutate(length_norm_sum = sum(length_norm_count, na.rm = T), tpm = (length_norm_count * (1E6 / length_norm_sum)) %>% round(digits = 3)) %>% ungroup() tpms %>% select(ensembl_gene_id, replicate, tpm) %>% spread(replicate, tpm) %>% left_join(geneDescs) %>% push_left(c("gene_name", "gene_description")) %>% write_tsv(paste0(resultsBase, "tpms_by_replicate.txt")) # Also average TPMs per condition and export tpms %>% inner_join(exDesign, by = "replicate") %>% group_by(condition, ensembl_gene_id) %>% # group_by(condition, ensembl_gene_id) %>% summarize(mean_tpm = mean(tpm)) %>% # select(ensembl_gene_id, replicate, gene_name, gene_description, mean_tpm) %>% spread(condition, mean_tpm) %>% left_join(geneDescs) %>% push_left(c("gene_name", "gene_description")) %>% write_tsv(paste0(resultsBase, "tpms_by_condition.txt")) #----------------------------- # Export the complete dataset for later analysis deAnnot = deResults %>% left_join(geneInfo) #inner_join(normCounts) %>% #merge(., contrasts) %>% #merge(.,., suffixes=c("_1", "_2"), by=NULL) %>% #inner_join(baseMeanPerLvl, id='ensembl_gene_id') %>% write_tsv(deAnnot, path = paste0(resultsBase, "de_results.txt")) #deAnnot = read_tsv(paste0(resultsBase, "de_results.txt")) # [de_results.txt](`r paste0(resultsBase, "de_results.txt")`) ## also export results which we'll always need for pathways overlays deAnnot %>% transmute( ensembl_gene_id, contrast = paste(condition_1, "vs", condition_2), plot_score = - log10(pvalue) * ifelse(c1_overex, 1, - 1) ) %>% spread(contrast, plot_score) %>% write_tsv(paste0(resultsBase, "plot_score_matrix.txt")) #deAnnot %>% filter(ensembl_gene_id=="FBgn0000015") # subset data according to contrasts if (sub_data) { dir.create("de_results") comp <- apply(contrasts, 1, paste0, collapse = "_vs_") lapply(comp, function(x) { deAnnot %>% mutate(contrast = paste(condition_1, condition_2, sep = "_vs_")) %>% filter(grepl(x, contrast)) %>% select(- contrast) %>% write_tsv(paste("de_results/", x, ".txt", sep = "")) }) } ## todo understand purpose and effeect of indpendentFiltering (see https://support.bioconductor.org/p/57128/) #deResults %>% count(is_hit) #deAnnot %>% count(is_hit) #deAnnot %>% count(is.na(padj)) ######################################################################################################################## #' ### Count Outlier Analysis #' For how many genes did Deseq did not assign a p-value (due to outliers; see section 1.5.3 os [DEseq manual](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#count-outlier-detection)) # summary(results(dds, cooksCutoff=FALSE)) #' See "3.6 Approach to count outliers" in the [DESEQ2 manual](https://bioc.ism.ac.jp/packages/3.4/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details about outlier handling #' #' Def: Cook's distance is a measure of how much a single sample is influencing the fitted coefficients for a gene, and a large value of Cook's distance is intended to indicate an outlier count # par(mar=c(8,5,2,2)) # boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2) deAnnot %>% filter(is.na(pvalue)) %>% mutate(contrast = paste(condition_1, "vs", condition_2)) %>% count(contrast) %>% n_as("genes_with_NA_pvalue") %>% kable(caption = "Failed comparsions by sample") if ((deAnnot %>% filter(is.na(pvalue)) %>% nrow) > 0) { na_counts = norm_counts %>% as.data.frame() %>% rownames_to_column("ensembl_gene_id") %>% tbl_df %>% gather(replicate, norm_count, - ensembl_gene_id) %>% right_join(deAnnot %>% filter(is.na(pvalue)) %>% select(ensembl_gene_id, condition_1, condition_2)) %>% # glimpse # filter(ensembl_gene_id=="ENSMUSG00000000120") left_join(select(exDesign, replicate, condition)) %>% filter(condition_1 == condition | condition_2 == condition) na_counts %>% filter(ensembl_gene_id %in% unique(ensembl_gene_id)[1 : 6]) %>% ggplot(aes(condition, norm_count, label = replicate)) + geom_point() + geom_text() + facet_wrap(~ ensembl_gene_id, scales = "free_y") + ggtitle("exmple gene counts by replicate") na_counts %>% group_by(ensembl_gene_id) %>% filter(norm_count == max(norm_count)) %>% ggplot(aes(replicate)) + geom_bar() + coord_flip() + facet_grid(condition ~ ., scales = "free_y") + ylab("Genes without p-value in which sample has largest count") + ggtitle("Which replicates are causing most of the NAs") } ######################################################################################################################## #' ## Hits Summary # Extract hits from deseq results degs = deAnnot %>% filter(is_hit) # ggplot(degs, aes(paste(condition_1, "vs", condition_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary") + coord_flip() # degs %>% # ggplot(aes(paste(condition_1, "vs", condition_2), fill=c1_overex)) + # geom_bar(position="dodge") + # xlab(NULL) + ylab("# DBGs") + # ggtitle("DBG count summary by direction of expression") + # coord_flip() # Export DBA genes ## disabled because we just subset the annotated data now to define degs #degsAnnot = degs %>% # inner_join(normCounts) %>% # left_join(geneInfo) %>% # left_join(bindCats) degs %>% write_tsv(paste0(resultsBase, "diffex_genes.txt")) #degs = read_tsv(paste0(resultsBase, "diffex_genes.txt")) #' [Differentially Expressed Genes](`r paste0(resultsBase, "diffbind_genes.txt")`) #unloadNamespace("dplyr"); require(dplyr) ## export slim hit list for downstream enrichment analysis degs %>% transmute(ensembl_gene_id, contrast = paste(condition_1, "vs", condition_2)) %>% write_tsv(paste0(resultsBase, "degs_by_contrast.txt")) degs %>% transmute(ensembl_gene_id, contrast = paste(condition_1, if_else(c1_overex, ">", "<"), condition_2)) %>% write_tsv(paste0(resultsBase, "degs_by_contrast_directed.txt")) ## also export gsea inputs #deAnnot %>% # mutate(contrast = paste(condition_1, "vs", condition_2)) %>% # filter(! is.na(pvalue)) %>% # transmute(contrast, ensembl_gene_id, rank_score = - log10(pvalue)) %>% #count(is.na(rank_score)) ## arrange(contrast, rank_score) %>% # write_tsv(paste0(resultsBase, "gsea__genes_by_contrast__undirected.txt")) #deAnnot %>% # mutate(contrast = paste(condition_1, "vs", condition_2)) %>% # filter(! is.na(pvalue)) %>% # transmute(contrast, ensembl_gene_id, rank_score = - log10(pvalue) * ifelse(c1_overex, 1, - 1)) %>% ## arrange(contrast, rank_score) %>% # write_tsv(paste0(resultsBase, "gsea__genes_by_contrast__directed.txt")) # ## render interactive hit browser # #+results='asis' # degs %>% # left_join(geneInfo) %>% # mutate( # igv=paste0("IGV") # ) %>% # select(c1_over_c2_logfc, pvalue, ensembl_gene_id, condition_1, condition_2, external_gene_name, description, igv) %>% # # kable("html", table.attr = "class='dtable'", escape=F) # datatable(escape=F) #+ fig.height=nrow(contrasts)/2+2, eval=nrow(degs)>0 #ggplot(degs, aes(paste(condition_1, "vs", condition_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DEGs by contrast") + coord_flip() ggplot(degs, aes(paste(condition_1, "vs", condition_2), fill = (c1_overex))) + geom_bar() + xlab(NULL) + ylab("# DGEs") + ggtitle("DEGs by contrast") + coord_flip() #with(degs, as.data.frame(table(condition_1, condition_2, c1_overex))) %>% filter(Freq >0) %>% kable() degs %>% count(condition_1, condition_2, c1_overex) %>% mutate(c1_overex = ifelse(c1_overex, ("Up in condition_1"), "Down in condition_1")) %>% spread(c1_overex, n) %>% kable() #' DEGs (differentially expressed genes) are contained in the following table #' DEGs can be browsed in Excel using the exported table or via the embedded table browser. To enable the IGV links, you need to set the port in the IGV preferences to 3334. #+ results='asis' # if (nrow(degs) < 3000) { degs %>% #inner_join(geneLoci) %>% mutate( ensembl = paste0("", ensembl_gene_id, ""), igv = paste0("IGV") ) %>% select(condition_1, condition_2, c1_over_c2_logfc, pvalue, padj, ensembl_gene_id, external_gene_name, description) %>% # kable("html", table.attr = "class='dtable'", escape=F) datatable(escape = F) # } ## just needed to restore environment easily #save(degs, file=".degs.RData") # degs = local(get(load(".degs.RData"))) # ######################################################################################################################## # #' ## MA Plots # # ## todo round of MA should be enough. # # # Redo MA plots but now including hit overlays # maPlots = deResults %>% # group_by(condition_1, condition_2) %>% # do(gg = { maData <- . # ## todo why not s2_over_c1_log2fc # maData %>% ggplot(aes(0.5 * log2(norm_count_1 * norm_count_2), log2(norm_count_1 / norm_count_2), color = is_hit)) + # geom_point(alpha = 0.3) + # geom_hline(yintercept = 0, color = "red") + # ggtitle(with(maData[1,], paste(condition_1, "vs", condition_2))) # }) %$% gg # # #+ fig.height=6*ceiling(length(maPlots)/2) # multiplot(plotlist = maPlots, cols = min(2, length(maPlots))) # # #' Extract most significantly changed genes and display their gene-centered rlog-normalized expression ## see http://www.bioconductor.org/help/workflows/rnaseqGene/ # assay(rld) %>% head # load_pack("genefilter") # load_pack("pheatmap") mostSig <- deAnnot %>% arrange(padj) %>% head(20) %$% ensembl_gene_id mat <- assay(rld) %>% as.data.frame %>% rownames_to_column("ensembl_gene_id") %>% filter(ensembl_gene_id %in% mostSig) %>% # as.data.frame() %>% column2rownames("ensembl_gene_id") %>% as.matrix() mat <- mat - rowMeans(mat) pheatmap(mat, annotation_col = column2rownames(exDesign, "replicate") %>% select(- col_index)) ######################################################################################################################## #' ## Exported Data #' #' | File | Description | #' |------|------| #' | [diffex_genes.txt](diffex_genes.txt) | list of all differentially expressed genes from the DESeq analysis - That's the file you are most likely looking for! | #' | [de_results.txt](de_results.txt) | list of all genes from the DESeq analysis | #' | [fpkms_by_condition.txt](fpkms_by_condition.txt) | fpkm values of the different conditions | #' | [fpkms_by_replicate.txt](fpkms_by_replicate.txt) | fpkm values of individual replicates | #' | [tpms_by_condition.txt](tpms_by_condition.txt) | tpm values of the different conditions | #' | [tpms_by_replicate.txt](tpms_by_replicate.txt) | tpm values of individual replicates | #' | [geneInfo.txt](geneInfo.txt) | general gene information | #' #' Further informtion on [FPKM and TPM](http://www.rna-seqblog.com/rpkm-fpkm-and-tpm-clearly-explained/) ################################## # get R version and package infos # writeLines(capture.output(sessionInfo()), ".sessionInfo.txt") writeLines(capture.output(devtools::session_info()), ".sessionInfo.txt") ################################# session::save.session(".featcounts_deseq_mf.dat"); # session::restore.session(".featcounts_deseq_mf.dat");