#!/usr/bin/env Rscript #+ include=FALSE ## TODO Melanie: make sure reports can be spinned ## Example: spin.R -c $NGS_TOOLS/dge_workflow/featcounts_deseq.R.R "\"count_matrix.txt"" suppressMessages(require(docopt)) doc <- ' Convert a featureCounts results matrix into a dge-report using deseq2 Usage: region_dba.R [options] <count_matrix> Options: --contrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed --qcutoff <qcutoff> Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01] --pcutoff <pcutoff> Override q-value filter and filter by p-value instead --min_count <min_count> Minimal expression in any of the sample to be included in the final result list [default: 1] ' #opts <- docopt(doc, commandArgs(TRUE)) opts <- docopt(doc, "--pcutoff 0.05 --contrasts ../time_contrasts.txt ../peak_clusters_tss5kb.count_matrix.txt") ## opts <- docopt(doc, "countMatrix.txt") require(knitr) require(DESeq2) devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/R/core_commons.R") devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/R/ggplot_commons.R") devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/R/bio/diffex_commons.R") #require.auto(gplots) #+ results='asis', echo=FALSE cat(' <link rel="stylesheet" type="text/css" href="http://cdn.datatables.net/1.10.5/css/jquery.dataTables.min.css"> <script type="text/javascript" charset="utf8" src="http://code.jquery.com/jquery-2.1.2.min.js"></script> <script type="text/javascript" charset="utf8" src="http://cdn.datatables.net/1.10.5/js/jquery.dataTables.min.js"></script> <script type="text/javascript"> $(document).ready(function() { // alert("test") //$("table").DataTable(); //$("table").DataTable(); //$("#tab_id").DataTable(); $(".dtable").DataTable(); } ); </script> ') count_matrix_file <- opts$count_matrix contrasts_file <- opts$contrasts pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff) qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff) if(is.numeric(pcutoff)) opts$qcutoff <- NULL ######################################################################################################################## #' # Load annotation data ## load transcriptome annotations needed for results annotation geneInfo <- quote({ mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl")) c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>% biomaRt::getBM(mart=mart) }) %>% cache_it("geneInfo") ######################################################################################################################## #' # Differential Expresssion Analysis ## TODO Melanie ## * similar to example-report (see email) created with "dge_workflow/cuffdiff_report.R" ## * sections to migrate are "Replicate Correlation and Clustering" and "Quality Control" ######################################################################################################################## #' # Differential Expresssion Analysis #' Working Directory: `r getwd()` resultsBase <- count_matrix_file %>% basename() %>% trim_ext(".txt") %>% trim_ext(".count_matrix") countData <- read.delim(count_matrix_file) names(countData) <- names(countData) %>% str_replace("[.]1", "") countMatrix <- countData %>% column2rownames("ensembl_gene_id") %>% as.matrix() #' Apply expression filter nrow(countMatrix) countMatrix %<>% filterByExpression() nrow(countMatrix) #filterByExpression <- function(fpkmMat, min_count=1, logMode=F){ # if(logMode) fpkmMat<-log10(fpkmMat+1) ## add a pseudocount # # geneMax <- apply(fpkmMat, 1, max) # # fpkmMat[geneMax>min_count,] #} #countMatrixFilt <- filterByExpression(countMatrix, min_count=50) #' See deseq reference [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details #' To understand fold-change shrinking and estimation check out #' [Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2](http://genomebiology.com/2014/15/12/550/abstract) ## todo expose as argument get_sample_from_replicate <- function(repName) str_match(repName, "(.*)_[0-9]{1}$")[,2] #' Define or load a contrasts matrix if(!is.null(contrasts_file)){ contrasts <- read.delim(contrasts_file, header=T) %>% fac2char() }else{ contrasts <- data.frame(sample=get_sample_from_replicate(colnames(countMatrix))) %>% distinct() %>% merge(.,., suffixes=c("_1", "_2"), by=NULL) %>% filter(ac(sample_1)>ac(sample_2)) %>% # filter(ac(sample_1)!=ac(sample_2)) %>% fac2char() write.delim(contrasts, "dba_contrasts.txt") } contrasts %>% kable() ######################################################################################################################## #' ## Perform Differential Expression Analysis ## gene specific factors #normalizationFactors(dds) ## sizeFactor R help:sigEnrResults #dds <- makeExampleDESeqDataSet() #dds <- estimateSizeFactors( dds ) #sizeFactors(dds) ## try again but now use lambda normalization ## see "3.11 Sample-/gene-dependent normalization factors" in the DEseq2 manual for details colData <- data.frame(condition=colnames(countMatrix) %>% get_sample_from_replicate) dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition)) dds <- estimateSizeFactors(dds) #' # Custom Lambda Normalization #' See https://www.biostars.org/p/79978/ ##' Size Factors estimated by DESeq sizeFactors(dds) %>% set_names(colnames(countMatrix)) %>% melt() %>% rownames2column("sample") %>% ggplot(aes(sample, value)) + geom_bar(stat="identity") + ggtitle("deseq size factors") ##' From the DESeq docs about how size factors are used: The sizeFactors vector assigns to each column of the count matrix a value, the size factor, such that count values in the columns can be brought to a common scale by dividing by the corresponding size factor. ##' This means that counts are divied by size factors. So let's now load the lambda libraies and replace the predefined size factors with our custom ones #' From DESeq manual: The regularized log transformation is preferable to the variance stabilizing transformation if the size factors vary widely. #' Run Deseq Test #dds <- DESeq(dds, fitType='local', betaPrior=FALSE) #dds <- DESeq(dds, fitType='local') dds <- DESeq(dds) #res <- results(dds) #resultsNames(dds) #' Model Overview #+ results='asis' summary(results(dds)) ## extract all de-sets according to our contrasts model deResults <- adply(contrasts, 1, splat(function(sample_1, sample_2){ # browser() results(dds, contrast=c("condition",sample_1,sample_2)) %>% rownames2column("ensembl_gene_id") %>% as.data.frame() %>% ## see http://rpackages.ianhowson.com/bioc/DESeq2/man/results.html when using contrasts argument rename(s1_over_s2_logfc=log2FoldChange) %>% mutate(sample_1=sample_1, sample_2=sample_2) })) deResults %>% ggplot(aes(s1_over_s2_logfc)) + geom_histogram(binwidth=0.1) + facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0, color="blue") + xlim(-2,2) + ggtitle("sample1 over sampl2 logFC ") #' ## Significnce of differential binding deResults %>% ggplot(aes(pvalue)) + geom_histogram() + xlim(0,1) + facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0.01, slope=1, color="blue") + ggtitle("p-value distributions") #+ scale_x_log10() deResults %>% ggplot(aes(padj)) + geom_histogram() + # xlim(0,1) + facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0.01, slope=1, color="blue") + ggtitle("Adjusted p-value distributions") #+ scale_x_log10() #' Set hit criterion #+ results='asis' if(!is.null(qcutoff)){ echo("Using q-value cutoff of", qcutoff) deResults %<>% transform(is_hit=padj<=qcutoff) }else{ echo("Using p-value cutoff of", pcutoff) deResults %<>% transform(is_hit=pvalue<=pcutoff) } #deResults %<>% mutate(is_hit=pvalue<0.05) deResults %<>% mutate(s1_overex=s1_over_s2_logfc>1) normCounts <- counts(dds,normalized=TRUE) %>% # set_names(colData(dds)$condition) %>% rownames2column("ensembl_gene_id") set_names(colnames(countMatrix)) %>% rownames2column("ensembl_gene_id") ## .. should be same as input #filter(countData, ensembl_gene_id=="ENSDARG00000000001") #' # MA and Volcano plots # deseq approach # plotMA(deResults, main="DESeq2", ylim=c(-2,2)) ## facet plot maData <- normCounts %>% gather(sample, norm_count, -ensembl_gene_id) %>% merge(.,., by="ensembl_gene_id", suffixes=c("_1", "_2")) %>% # filter(ac(sample_1)<ac(sample_2)) %>% # add diffex status merge(deResults) #+ fig.width=16, fig.height=14 maData %>% ggplot(aes(0.5*log2(norm_count_1*norm_count_2), log2(norm_count_2/norm_count_1), color=pvalue<0.05)) + geom_point(alpha=0.3) + geom_hline(yintercept=0, color="red") + facet_grid(sample_1 ~ sample_2) ##Volcano plots hitCounts <- filter(deResults, is_hit) %>% count(sample_1, sample_2, sample_1_overex=s1_over_s2_logfc>0) %>% rename(hits=n) %>% merge(data.frame(sample_1_overex=c(T,F), x_pos=c(-2.5,2.5))) #+ fig.width=16, fig.height=14 deResults %>% ggplot(aes(s1_over_s2_logfc, -log10(pvalue), color=is_hit)) + geom_jitter(alpha=0.3, position = position_jitter(height = 0.2)) + # theme_bw() + xlim(-3,3) + scale_color_manual(values = c("TRUE"="red", "FALSE"="black")) + # ggtitle("Insm1/Ctrl") + ## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend guides(colour = guide_legend(override.aes = list(alpha=1))) + ## tweak axis labels xlab(expression(log[2]("x/y"))) + ylab(expression(-log[10]("p value"))) + ## add hit couts geom_text(aes(label=hits, x=x_pos), y=2, color="red", size=10, data=hitCounts) + facet_grid(sample_1 ~ sample_2) # Define absolute binding categories #rawCounts <- counts(dds,normalized=F) %>% # set_names(colData(dds)$condition) %>% rownames2column("ensembl_gene_id") #ggplot(rawCounts, aes(H3HA_Sphere)) + geom_histogram() + scale_x_log10() + ggtitle("raw H3HA_Sphere counts distribution") #require.auto(Hmisc) # #bindCats <- rawCounts %>% transmute(ensembl_gene_id, bind_category=cut2(H3HA_Sphere, cuts=c(10, 100))) #with(bindCats, as.data.frame(table(bind_category))) %>% kable() # Export the complete dataset for later analysis deAnnot <- deResults %>% inner_join(normCounts) %>% left_join(geneInfo) write.delim(deAnnot, file=paste0(resultsBase, ".dba_results.txt")) #' [deAnnot](`r paste0(resultsBase, ".dba_results.txt")`) #' ## Hits Summary ## Extract hits from deseq results degs <- deAnnot %>% filter(is_hit) ggplot(degs, aes(paste(sample_1, "vs", sample_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary") + coord_flip() degs %>% ggplot(aes(paste(sample_1, "vs", sample_2), fill=s1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary by direction of expression") + coord_flip() # Export DBA genes ## disabled because we just subset the annotated data now to define degs #degsAnnot <- degs %>% # inner_join(normCounts) %>% # left_join(geneInfo) %>% # left_join(bindCats) degs %>% write.delim(file=paste0(resultsBase, ".diffbind_genes.txt")) #' [degs](`r paste0(resultsBase, ".diffbind_genes.txt")`) ## render interactive hit browser #+results='asis' degs %>% left_join(geneInfo) %>% mutate( igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>") ) %>% select(s1_over_s2_logfc, pvalue, ensembl_gene_id, sample_1, sample_2, external_gene_name, description, igv) %>% kable("html", table.attr = "class='dtable'", escape=F) ## TODO Melanie compare with section "Differentially expressed genes" and complement missing bits ## TODO later, reenable child-inclusion of enrichment analysis ######################################################################################################################## ##' ## Term enrichment Data Preparation ##+ echo=FALSE # ##' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')` # #geneLists <- degs %>% # transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2)) # #if(nrow(contrasts)<4){ # geneLists <- rbind_list( # geneLists, # degs %>% filter(s1_overex) %>% transmute(ensembl_gene_id, list_id=paste(sample_1, ">", sample_2)), # degs %>% filter(!s1_overex) %>% transmute(ensembl_gene_id, list_id=paste(sample_1, "<", sample_2)) # ) #} # ### additional overlaps before doing the intersection analysis #geneLists %>% count(list_id) %>% kable() # #intersectLists <- function(geneLists, listIdA, listIdB, intersectListId) { # commonGenes <- setdiff(filter(geneLists, list_id==listIdA)$ensembl_gene_id, filter(geneLists, list_id==listIdB)$ensembl_gene_id) # data.frame(list_id=intersectListId, ensembl_gene_id=commonGenes) #} # #geneLists %<>% group_by(list_id) #save(geneLists, file=".enrGeneLists.RData") ## geneLists <- local(get(load("enrGeneLists.RData"))) # # ### redefine opts arguments and tweak knitr options #opts_knit$set(root.dir = getwd()) #commandArgs <- function(x) c(paste("--overlay_expr_data ", count_matrix_file, " enrGeneLists.RData")) ##source("/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/dev/common/david_enrichment.R") ## child='/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/dev/common/david_enrichment.Rmd'