featcounts_deseq.R

#!/usr/bin/env Rscript
#+ include=FALSE


## TODO Melanie: make sure reports can be spinned
## Example: spin.R -c $NGS_TOOLS/dge_workflow/featcounts_deseq.R.R "\"count_matrix.txt""


suppressMessages(require(docopt))

doc <- '
Convert a featureCounts results matrix into a dge-report using deseq2
Usage: region_dba.R [options] <count_matrix>

Options:
--contrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--qcutoff <qcutoff>    Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff>    Override q-value filter and filter by p-value instead
--min_count <min_count>   Minimal expression in any of the sample to be included in the final result list [default: 1]
'

#opts <- docopt(doc, commandArgs(TRUE))

opts <- docopt(doc, "--pcutoff 0.05 --contrasts ../time_contrasts.txt ../peak_clusters_tss5kb.count_matrix.txt")
## opts <- docopt(doc, "countMatrix.txt")

require(knitr)
require(DESeq2)

devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/R/bio/diffex_commons.R")

#require.auto(gplots)

#+ results='asis', echo=FALSE
cat('
<link rel="stylesheet" type="text/css" href="http://cdn.datatables.net/1.10.5/css/jquery.dataTables.min.css">
<script type="text/javascript" charset="utf8" src="http://code.jquery.com/jquery-2.1.2.min.js"></script>
<script type="text/javascript" charset="utf8" src="http://cdn.datatables.net/1.10.5/js/jquery.dataTables.min.js"></script>

<script type="text/javascript">
         $(document).ready(function() {
            // alert("test")
             //$("table").DataTable();
             //$("table").DataTable();
             //$("#tab_id").DataTable();
             $(".dtable").DataTable();
         } );
</script>
')


count_matrix_file <- opts$count_matrix
contrasts_file <- opts$contrasts

pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff)
qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff)
if(is.numeric(pcutoff)) opts$qcutoff <- NULL


########################################################################################################################
#' # Load annotation data

## load transcriptome annotations needed for results annotation
geneInfo <- quote({
        mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))
        c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
            biomaRt::getBM(mart=mart)
    }) %>% cache_it("geneInfo")

########################################################################################################################
#' # Differential Expresssion Analysis

## TODO Melanie
## * similar to example-report (see email) created with "dge_workflow/cuffdiff_report.R"
## * sections to migrate are "Replicate Correlation and Clustering" and "Quality Control"


########################################################################################################################
#' # Differential Expresssion Analysis

#' Working Directory: `r getwd()`

resultsBase <- count_matrix_file %>% basename() %>% trim_ext(".txt") %>% trim_ext(".count_matrix")

countData <- read.delim(count_matrix_file)
names(countData) <- names(countData) %>% str_replace("[.]1", "")

countMatrix <- countData %>% column2rownames("ensembl_gene_id") %>% as.matrix()

#' Apply expression filter
nrow(countMatrix)
countMatrix %<>% filterByExpression()
nrow(countMatrix)


#filterByExpression <- function(fpkmMat, min_count=1, logMode=F){
#    if(logMode) fpkmMat<-log10(fpkmMat+1) ## add a pseudocount
#
#    geneMax <- apply(fpkmMat, 1, max)
#
#    fpkmMat[geneMax>min_count,]
#}
#countMatrixFilt  <- filterByExpression(countMatrix, min_count=50)

#' See deseq reference [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details

#' To understand fold-change shrinking and estimation check out
#' [Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2](http://genomebiology.com/2014/15/12/550/abstract)

## todo expose as argument
get_sample_from_replicate <- function(repName) str_match(repName, "(.*)_[0-9]{1}$")[,2]

#' Define or load a contrasts matrix
if(!is.null(contrasts_file)){
    contrasts <- read.delim(contrasts_file, header=T) %>% fac2char()
}else{
    contrasts <- data.frame(sample=get_sample_from_replicate(colnames(countMatrix))) %>% distinct() %>%
        merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
        filter(ac(sample_1)>ac(sample_2)) %>%
#        filter(ac(sample_1)!=ac(sample_2)) %>%
        fac2char()
    write.delim(contrasts, "dba_contrasts.txt")
}

contrasts %>% kable()

########################################################################################################################
#' ## Perform Differential Expression Analysis

## gene specific factors
#normalizationFactors(dds)

## sizeFactor R help:sigEnrResults
#dds <- makeExampleDESeqDataSet()
#dds <- estimateSizeFactors( dds )
#sizeFactors(dds)

## try again but now use lambda normalization
## see "3.11 Sample-/gene-dependent normalization factors" in the DEseq2 manual for details
colData <- data.frame(condition=colnames(countMatrix) %>% get_sample_from_replicate)
dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition))
dds <- estimateSizeFactors(dds)

#' #  Custom Lambda Normalization

#' See https://www.biostars.org/p/79978/

##' Size Factors estimated by DESeq
sizeFactors(dds) %>% set_names(colnames(countMatrix)) %>% melt() %>% rownames2column("sample") %>%  ggplot(aes(sample, value)) + geom_bar(stat="identity") + ggtitle("deseq size factors")

##' From the DESeq docs about how size factors are used: The sizeFactors vector assigns to each column of the count matrix a value, the size factor, such that count values in the columns can be brought to a common scale by dividing by the corresponding size factor.

##' This means that counts are divied by size factors. So let's now load the lambda libraies and replace the predefined size factors with our custom ones

#' From DESeq manual: The regularized log transformation is preferable to the variance stabilizing transformation if the size factors vary widely.

#' Run Deseq Test
#dds <- DESeq(dds, fitType='local', betaPrior=FALSE)
#dds <- DESeq(dds, fitType='local')
dds <- DESeq(dds)

#res <- results(dds)
#resultsNames(dds)

#' Model Overview
#+ results='asis'
summary(results(dds))

## extract all de-sets according to our contrasts model
deResults <- adply(contrasts, 1,  splat(function(sample_1, sample_2){
#    browser()
   results(dds, contrast=c("condition",sample_1,sample_2)) %>%
        rownames2column("ensembl_gene_id") %>%
        as.data.frame() %>%
        ## see http://rpackages.ianhowson.com/bioc/DESeq2/man/results.html when using contrasts argument
        rename(s1_over_s2_logfc=log2FoldChange) %>%
        mutate(sample_1=sample_1, sample_2=sample_2)
}))


deResults %>% ggplot(aes(s1_over_s2_logfc)) +
    geom_histogram(binwidth=0.1) +
    facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0, color="blue") +
    xlim(-2,2) + 
    ggtitle("sample1 over sampl2 logFC ")


#' ## Significnce of differential binding

deResults %>% ggplot(aes(pvalue)) +
    geom_histogram() +
    xlim(0,1) +
    facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0.01, slope=1, color="blue") +
    ggtitle("p-value distributions") #+ scale_x_log10()

deResults %>% ggplot(aes(padj)) +
    geom_histogram() +
#    xlim(0,1) +
    facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0.01, slope=1, color="blue") +
    ggtitle("Adjusted p-value distributions") #+ scale_x_log10()


#' Set hit criterion
#+ results='asis'
if(!is.null(qcutoff)){
    echo("Using q-value cutoff of", qcutoff)
    deResults %<>% transform(is_hit=padj<=qcutoff)
}else{
    echo("Using p-value cutoff of", pcutoff)
    deResults %<>% transform(is_hit=pvalue<=pcutoff)
}
#deResults %<>% mutate(is_hit=pvalue<0.05)

deResults %<>% mutate(s1_overex=s1_over_s2_logfc>1)


normCounts <- counts(dds,normalized=TRUE) %>%
#    set_names(colData(dds)$condition) %>% rownames2column("ensembl_gene_id")
    set_names(colnames(countMatrix)) %>% rownames2column("ensembl_gene_id")


## .. should be same as input
#filter(countData, ensembl_gene_id=="ENSDARG00000000001")

#' # MA and Volcano plots

# deseq approach
# plotMA(deResults, main="DESeq2", ylim=c(-2,2))

## facet plot
maData <- normCounts %>%
    gather(sample, norm_count, -ensembl_gene_id) %>%
    merge(.,., by="ensembl_gene_id", suffixes=c("_1", "_2")) %>%
#    filter(ac(sample_1)<ac(sample_2)) %>%
    # add diffex status
    merge(deResults)


#+ fig.width=16, fig.height=14
maData %>% ggplot(aes(0.5*log2(norm_count_1*norm_count_2), log2(norm_count_2/norm_count_1), color=pvalue<0.05)) +
    geom_point(alpha=0.3) +
    geom_hline(yintercept=0, color="red") +
    facet_grid(sample_1 ~ sample_2)


##Volcano plots
hitCounts <- filter(deResults, is_hit) %>%
        count(sample_1, sample_2, sample_1_overex=s1_over_s2_logfc>0) %>%
        rename(hits=n) %>%
        merge(data.frame(sample_1_overex=c(T,F), x_pos=c(-2.5,2.5)))

#+ fig.width=16, fig.height=14
deResults %>% ggplot(aes(s1_over_s2_logfc, -log10(pvalue), color=is_hit)) +
    geom_jitter(alpha=0.3, position = position_jitter(height = 0.2)) +
#    theme_bw() +
    xlim(-3,3) +
    scale_color_manual(values = c("TRUE"="red", "FALSE"="black")) +
#    ggtitle("Insm1/Ctrl") +
    ## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend
    guides(colour = guide_legend(override.aes = list(alpha=1))) +

    ## tweak axis labels
    xlab(expression(log[2]("x/y"))) +
    ylab(expression(-log[10]("p value"))) +

    ## add hit couts
    geom_text(aes(label=hits, x=x_pos), y=2, color="red", size=10, data=hitCounts) +
    facet_grid(sample_1 ~ sample_2)


# Define absolute binding categories
#rawCounts <- counts(dds,normalized=F) %>%
#    set_names(colData(dds)$condition) %>% rownames2column("ensembl_gene_id")
#ggplot(rawCounts, aes(H3HA_Sphere)) + geom_histogram() + scale_x_log10() + ggtitle("raw H3HA_Sphere counts distribution")
#require.auto(Hmisc)
#
#bindCats <- rawCounts %>% transmute(ensembl_gene_id, bind_category=cut2(H3HA_Sphere, cuts=c(10, 100)))
#with(bindCats, as.data.frame(table(bind_category))) %>% kable()


# Export the complete dataset for later analysis
deAnnot <- deResults %>%
    inner_join(normCounts) %>%
    left_join(geneInfo)

write.delim(deAnnot, file=paste0(resultsBase, ".dba_results.txt"))
#'  [deAnnot](`r paste0(resultsBase, ".dba_results.txt")`)


#' ## Hits Summary

## Extract hits from deseq results
degs <- deAnnot %>% filter(is_hit)

ggplot(degs, aes(paste(sample_1, "vs",  sample_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary") + coord_flip()

degs %>%
    ggplot(aes(paste(sample_1, "vs",  sample_2), fill=s1_overex)) +
    geom_bar(position="dodge") +
    xlab(NULL) + ylab("# DBGs") +
    ggtitle("DBG count summary by direction of expression") +
    coord_flip()

# Export DBA genes
## disabled because we just subset the annotated data now to define degs
#degsAnnot <- degs %>%
#    inner_join(normCounts) %>%
#    left_join(geneInfo) %>%
#    left_join(bindCats)
degs %>% write.delim(file=paste0(resultsBase, ".diffbind_genes.txt"))
#'  [degs](`r paste0(resultsBase, ".diffbind_genes.txt")`)


## render interactive hit browser
#+results='asis'
degs %>%
    left_join(geneInfo) %>%
    mutate(
        igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
    ) %>%
    select(s1_over_s2_logfc, pvalue, ensembl_gene_id, sample_1, sample_2, external_gene_name, description, igv) %>%
    kable("html", table.attr = "class='dtable'", escape=F)

## TODO Melanie compare with section "Differentially expressed genes" and complement missing bits


## TODO later, reenable child-inclusion of enrichment analysis
########################################################################################################################
##' ## Term enrichment Data Preparation
##+ echo=FALSE
#
##' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
#
#geneLists <- degs %>%
#    transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))
#
#if(nrow(contrasts)<4){
#    geneLists <- rbind_list(
#        geneLists,
#        degs %>% filter(s1_overex) %>% transmute(ensembl_gene_id, list_id=paste(sample_1, ">", sample_2)),
#        degs %>% filter(!s1_overex) %>% transmute(ensembl_gene_id, list_id=paste(sample_1, "<", sample_2))
#    )
#}
#
### additional overlaps before doing the intersection analysis
#geneLists %>% count(list_id) %>% kable()
#
#intersectLists <- function(geneLists, listIdA, listIdB, intersectListId) {
#    commonGenes <- setdiff(filter(geneLists, list_id==listIdA)$ensembl_gene_id, filter(geneLists, list_id==listIdB)$ensembl_gene_id)
#    data.frame(list_id=intersectListId, ensembl_gene_id=commonGenes)
#}
#
#geneLists %<>% group_by(list_id)
#save(geneLists, file=".enrGeneLists.RData")
## geneLists <- local(get(load("enrGeneLists.RData")))
#
#
### redefine opts arguments and tweak knitr options
#opts_knit$set(root.dir = getwd())
#commandArgs <- function(x) c(paste("--overlay_expr_data ", count_matrix_file, " enrGeneLists.RData"))
##source("/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/dev/common/david_enrichment.R")
## child='/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/dev/common/david_enrichment.Rmd'