featcounts_deseq.R

#!/usr/bin/env Rscript
#+ include=FALSE

require(knitr)
require(DESeq2)

devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/master/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.9/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.9/R/bio/diffex_commons.R")

#require.auto(gplots)

#+ results='asis', echo=FALSE
cat('
<link rel="stylesheet" type="text/css" href="http://cdn.datatables.net/1.10.5/css/jquery.dataTables.min.css">
<script type="text/javascript" charset="utf8" src="http://code.jquery.com/jquery-2.1.2.min.js"></script>
<script type="text/javascript" charset="utf8" src="http://cdn.datatables.net/1.10.5/js/jquery.dataTables.min.js"></script>

<script type="text/javascript">
         $(document).ready(function() {
            // alert("test")
             //$("table").DataTable();
             //$("table").DataTable();
             //$("#tab_id").DataTable();
             $(".dtable").DataTable();
         } );
</script>
')


suppressMessages(require(docopt))

doc <- '
Convert a featureCounts results matrix into a dge-report using deseq2
Usage: region_dba.R [options] <count_matrix>

Options:
--contrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--qcutoff <qcutoff>    Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff>    Override q-value filter and filter by p-value instead
--min_count <min_count>   Minimal expression in any of the sample to be included in the final result list [default: 1]
'
opts <- docopt(doc, commandArgs(TRUE))

## opts <- docopt(doc, "--pcutoff 0.05 --contrasts ../time_contrasts.txt ../peak_clusters_tss5kb.count_matrix.txt")

count_matrix_file <- opts$count_matrix
contrasts_file <- opts$contrasts

pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff)
qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff)
if(is.numeric(pcutoff)) opts$qcutoff <- NULL


########################################################################################################################
#' # Load annotation data

## load transcriptome annotations needed for results annotation
geneInfo <- quote({
        mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))
        c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
            biomaRt::getBM(mart=mart)
    }) %>% cache_it("geneInfo")


########################################################################################################################
#' # Differential Binding Analysis

#' Working Directory: `r getwd()`

resultsBase <- count_matrix_file %>% basename() %>% trim_ext(".txt") %>% trim_ext(".count_matrix")

countData <- read.delim(count_matrix_file)
names(countData) <- names(countData) %>% str_replace("[.]1", "")

countMatrix <- countData %>% column2rownames("ensembl_gene_id") %>% as.matrix()


#filterByExpression <- function(fpkmMat, min_count=1, logMode=F){
#    if(logMode) fpkmMat<-log10(fpkmMat+1) ## add a pseudocount
#
#    geneMax <- apply(fpkmMat, 1, max)
#
#    fpkmMat[geneMax>min_count,]
#}
#countMatrixFilt  <- filterByExpression(countMatrix, min_count=50)

#' See deseq reference [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details

#' To understand fold-change shrinking and estimation check out
#' [Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2](http://genomebiology.com/2014/15/12/550/abstract)

#' Define or load a contrasts matrix
if(!is.null(contrasts_file)){
    contrasts <- read.delim(contrasts_file, header=T) %>% fac2char()
}else{
    contrasts <- data.frame(sample=colnames(countMatrix)) %>%
        merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
        filter(ac(sample_1)>ac(sample_2)) %>%
#        filter(ac(sample_1)!=ac(sample_2)) %>%
        fac2char()
    write.delim(contrasts, "dba_contrasts.txt")
}

contrasts %>% kable()


## basic approach using build in normalization
#colData <- data.frame(condition=colnames(countMatrix))
#dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition))
##dds <- DESeq(dds, fitType='local', betaPrior=FALSE)
#dds <- DESeq(dds, fitType='local')


## gene specific
#normalizationFactors(dds)

## sizeFactor R help:sigEnrResults
#dds <- makeExampleDESeqDataSet()
#dds <- estimateSizeFactors( dds )
#sizeFactors(dds)

## try again but now use lambda normalization
## see "3.11 Sample-/gene-dependent normalization factors" in the DEseq2 manual for details
colData <- data.frame(condition=colnames(countMatrix))
dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition))

#' #  Custom Lambda Normalization

#' See https://www.biostars.org/p/79978/

#' Size Factors estimated by DESeq
dds <- estimateSizeFactors( dds )
sizeFactors(dds) %>% set_names(colnames(countMatrix)) %>% melt() %>% rownames2column("sample") %>%  ggplot(aes(sample, value)) + geom_bar(stat="identity") + ggtitle("deseq size factors")

#' From the DESeq docs about how size factors are used: The sizeFactors vector assigns to each column of the count matrix a value, the size factor, such that count values in the columns can be brought to a common scale by dividing by the corresponding size factor.

#' This means that counts are divied by size factors. So let's now load the lambda libraies and replace the predefined size factors with our custom ones
lambdaLibs <- read.delim("../..//lambda_norm/lambda_lib_sizes.txt") ## load the labmda factors

#' normalize them with their median to achieve a similar scale as the original size factors
lambdaLibs %<>% mutate(lambda_size_factor=ngs_lambda_lib_size/median(ngs_lambda_lib_size))
lambdaLibs %>% ggplot(aes(sample, lambda_size_factor)) + geom_bar(stat="identity") + ggtitle("lambda size factors")

#' From DESeq manual: The regularized log transformation is preferable to the variance stabilizing transformation if the size factors vary widely.

#' Replace size factors with our ones
#' NOTE THIS IS DISABLED FOR NOW
sizeFactors(dds) <- lambdaLibs$lambda_size_factor

#rld <- rlog(dds, fitType='local')
#rlogMat <- assay(rld)

#' Run Deseq Test
#dds <- DESeq(dds, fitType='local', betaPrior=FALSE)
dds <- DESeq(dds, fitType='local')

#' Test if deseq preserved our size factors
sizeFactors(dds) %>% set_names(colnames(countMatrix)) %>% melt() %>% rownames2column("sample") %>%  ggplot(aes(sample, value)) + geom_bar(stat="identity") + ggtitle("deseq size factors after running deseq")


#res <- results(dds)
#resultsNames(dds)

#' Model Overview
#+ results='asis'
summary(results(dds))

## extract all de-sets according to our contrasts model
deResults <- adply(contrasts, 1,  splat(function(sample_1, sample_2){
#    browser()
   results(dds, contrast=c("condition",sample_1,sample_2)) %>%
        rownames2column("ensembl_gene_id") %>%
        as.data.frame() %>%
        ## see http://rpackages.ianhowson.com/bioc/DESeq2/man/results.html when using contrasts argument
        rename(s1_over_s2_logfc=log2FoldChange) %>%
        mutate(sample_1=sample_1, sample_2=sample_2)
}))


deResults %>% ggplot(aes(s1_over_s2_logfc)) +
    geom_histogram(binwidth=0.1) +
    facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0, color="blue") +
    xlim(-2,2) + 
    ggtitle("sample1 over sampl2 logFC ")


#' ## Significnce of differential binding

deResults %>% ggplot(aes(pvalue)) +
    geom_histogram() +
    xlim(0,1) +
    facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0.01, slope=1, color="blue") +
    ggtitle("p-value distributions") #+ scale_x_log10()

deResults %>% ggplot(aes(padj)) +
    geom_histogram() +
    xlim(0,1) +
    facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0.01, slope=1, color="blue") +
    ggtitle("Adjusted p-value distributions") + scale_x_log10()


#' Set hit criterion
#+ results='asis'
if(!is.null(qcutoff)){
    echo("Using q-value cutoff of", qcutoff)
    deResults %<>% transform(is_hit=padj<=qcutoff)
}else{
    echo("Using p-value cutoff of", pcutoff)
    deResults %<>% transform(is_hit=pvalue<=pcutoff)
}
#deResults %<>% mutate(is_hit=pvalue<0.05)

deResults %<>% mutate(s1_overex=s1_over_s2_logfc>1)


normCounts <- counts(dds,normalized=TRUE) %>%
    set_names(colData(dds)$condition) %>% rownames2column("ensembl_gene_id")

rawCounts <- counts(dds,normalized=F) %>%
    set_names(colData(dds)$condition) %>% rownames2column("ensembl_gene_id")

## .. should be same as input
#filter(countData, ensembl_gene_id=="ENSDARG00000000001")

#' # MA and Volcano plots

# deseq approach
# plotMA(deResults, main="DESeq2", ylim=c(-2,2))

## facet plot
maData <- normCounts %>%
    gather(sample, norm_count, -ensembl_gene_id) %>%
    merge(.,., by="ensembl_gene_id", suffixes=c("_1", "_2")) %>%
#    filter(ac(sample_1)<ac(sample_2)) %>%
    # add diffex status
    merge(deResults)


#+ fig.width=16, fig.height=14
maData %>% ggplot(aes(0.5*log2(norm_count_1*norm_count_2), log2(norm_count_2/norm_count_1), color=pvalue<0.05)) +
    geom_point(alpha=0.3) +
    geom_hline(yintercept=0, color="red") +
    facet_grid(sample_1 ~ sample_2)


##Volcano plots
hitCounts <- filter(deResults, is_hit) %>%
        count(sample_1, sample_2, sample_1_overex=s1_over_s2_logfc>0) %>%
        rename(hits=n) %>%
        merge(data.frame(sample_1_overex=c(T,F), x_pos=c(-2.5,2.5)))

#+ fig.width=16, fig.height=14
deResults %>% ggplot(aes(s1_over_s2_logfc, -log10(pvalue), color=is_hit)) +
    geom_jitter(alpha=0.3, position = position_jitter(height = 0.2)) +
#    theme_bw() +
    xlim(-3,3) +
    scale_color_manual(values = c("TRUE"="red", "FALSE"="black")) +
#    ggtitle("Insm1/Ctrl") +
    ## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend
    guides(colour = guide_legend(override.aes = list(alpha=1))) +

    ## tweak axis labels
    xlab(expression(log[2]("x/y"))) +
    ylab(expression(-log[10]("p value"))) +

    ## add hit couts
    geom_text(aes(label=hits, x=x_pos), y=2, color="red", size=10, data=hitCounts) +
    facet_grid(sample_1 ~ sample_2)


# Define absolute binding categories
ggplot(rawCounts, aes(H3HA_Sphere)) + geom_histogram() + scale_x_log10() + ggtitle("raw H3HA_Sphere counts distribution")
require.auto(Hmisc)

bindCats <- rawCounts %>% transmute(ensembl_gene_id, bind_category=cut2(H3HA_Sphere, cuts=c(10, 100)))
with(bindCats, as.data.frame(table(bind_category))) %>% kable()


########################################################################################################################
## if working with peak clusters try to fox the gene idby using chipseq anno

if(str_detect(count_matrix_file, "peak_clusters_tss5kb")){

require(ChIPpeakAnno)
data(TSS.zebrafish.Zv9)

## reload dplyr to fix namespace issues
unloadNamespace('dplyr'); require(dplyr)

## https://www.biostars.org/p/115101/
## http://master.bioconductor.org/help/course-materials/2009/SeattleNov09/IRanges/IRangesOverview.R

distinctClusters <- deResults %>%
    transmute(cluster_id=ensembl_gene_id) %>% distinct() %>%
    separate(cluster_id, c("chromosome_name", "start_position", "end_position"), remove=F) %>%
    mutate_each(funs(as.numeric), start_position, end_position)

clusters <- distinctClusters %$% RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name)

annotatedClusters = annotatePeakInBatch (clusters, AnnotationData = TSS.zebrafish.Zv9)
annotatedClusters %>% as.df()%>% ggplot(aes(insideFeature))  + geom_bar() + ggtitle("clusters relative to tss")
annotatedClusters %>% as.df() %>% head() %>% kable()

## add slim version of it to de-results
tssAnnoSlim <- annotatedClusters %>% as.df() %>%  
    inner_join(distinctClusters,c("space"="chromosome_name", "start"="start_position", "end"="end_position")) %>% 
    transmute(cluster_id, ensembl_gene_id=feature, distancetoFeature, insideFeature)

stopifnot(nrow(tssAnnoSlim)==nrow(distinctClusters))

save(deResults, file=".deResults.RData")
# deResults <- local(get(load("deResults.RData")))

deResults %<>% rename(cluster_id=ensembl_gene_id) %>% inner_join(tssAnnoSlim)
normCounts %<>% rename(cluster_id=ensembl_gene_id)
bindCats %<>% rename(cluster_id=ensembl_gene_id)
}


# Export the complete dataset for later analysis
deAnnot <- deResults %>%
    inner_join(normCounts) %>%
    left_join(geneInfo) %>%
    left_join(bindCats)

stopifnot(nrow(deAnnot)==nrow(deResults))

write.delim(deAnnot, file=paste0(resultsBase, ".dba_results.txt"))
#'  [deAnnot](`r paste0(resultsBase, ".dba_results.txt")`)


#' ## Hits Summary

## Extract hits from deseq results
degs <- deAnnot %>% filter(is_hit)

ggplot(degs, aes(paste(sample_1, "vs",  sample_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary") + coord_flip()

degs %>%
    ggplot(aes(paste(sample_1, "vs",  sample_2), fill=s1_overex)) +
    geom_bar(position="dodge") +
    xlab(NULL) + ylab("# DBGs") +
    ggtitle("DBG count summary by direction of expression") +
    coord_flip()

# Export DBA genes
## disabled because we just subset the annotated data now to define degs
#degsAnnot <- degs %>%
#    inner_join(normCounts) %>%
#    left_join(geneInfo) %>%
#    left_join(bindCats)
degs %>% write.delim(file=paste0(resultsBase, ".diffbind_genes.txt"))
#'  [degs](`r paste0(resultsBase, ".diffbind_genes.txt")`)


## render interactive hit browser
#+results='asis'
degs %>%
    left_join(geneInfo) %>%
    mutate(
        igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
    ) %>%
    select(s1_over_s2_logfc, pvalue, ensembl_gene_id, sample_1, sample_2, external_gene_name, description, igv) %>%
    kable("html", table.attr = "class='dtable'", escape=F)


########################################################################################################################
#' ## Term enrichment

#+ echo=FALSE

#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`

geneLists <- degs %>%
    transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))

if(nrow(contrasts)<4){
    geneLists <- rbind_list(
        geneLists,
        degs %>% filter(s1_overex) %>% transmute(ensembl_gene_id, list_id=paste(sample_1, ">", sample_2)),
        degs %>% filter(!s1_overex) %>% transmute(ensembl_gene_id, list_id=paste(sample_1, "<", sample_2))
    )
}

## additional overlaps before doing the intersection analysis
geneLists %>% count(list_id) %>% kable()

intersectLists <- function(geneLists, listIdA, listIdB, intersectListId) {
    commonGenes <- setdiff(filter(geneLists, list_id==listIdA)$ensembl_gene_id, filter(geneLists, list_id==listIdB)$ensembl_gene_id)
    data.frame(list_id=intersectListId, ensembl_gene_id=commonGenes)
}

## make this project specific setting
#with(geneLists, as.data.frame(table(list_id))) %$% ac(unique(list_id))
#if(any(str_detect(geneLists$list_id, "cyst"))){
#    geneLists <- intersectLists(geneLists, "liver != unpolarised", "cyst != unpolarised", "unp_liv & unp_cys") %>% rbind_list(geneLists)
#}

geneLists <- distinct(geneLists) ## just precaution in case of multiple evals of statements above


#+ eval=T
enrResults <- quote(geneLists %>% group_by(list_id) %>% do(davidAnnotationChart(.$ensembl_gene_id))) %>%
    cache_it(paste0("enrdata_", digest(geneLists)))

write.delim(enrResults, file="enrResults.txt")
# enrResults <- read.delim("enrResults.txt")
#'  [Enrichment Results](enrResults.txt)

#ac(annoChart$Genes[1]) %>% str_split(", ") %>% unlist() %>% length()

#
#' Because David is not too strigent by default we extract just those terms for which Bonferroni<0.01
sigEnrResults <- filter(enrResults, Bonferroni <0.01)


#' results='asis'
if(nrow(sigEnrResults)>0){
sigEnrResults %>% select(-Genes) %>% kable("html", table.attr = "class='dtable'", escape=F)
}else{
echo("no highly significant terms found")
sigEnrResults = data.frame(Category=factor(c(), c()))
}

write.delim(sigEnrResults, file="sigEnrResults.txt")
# sigEnrResults <- read.delim("sigEnrResults.txt")
#'  [Very Significant Terms](sigEnrResults.txt)


## plot the enrichment results
#sigEnrResults %>% group_by(Category, add=T) %>% do({
#    logPlot <- . %>% ggplot(aes(Term, PValue)) +
#	    geom_bar(stat="identity")+coord_flip() +
#	    xlab("Enriched Terms") +
#	    ggtitle(.$Category[1]) +
#	    scale_y_log10()
#	    print(logPlot)
#})

#+ include=FALSE, eval=nrow(sigEnrResults)>1
warning("dropping levels")
sigEnrResults %<>% mutate(Category=ac(Category)) ## drop unsused level to get consistent color palette

term_category_colors <- create_palette(unique(ac(sigEnrResults$Category)))

term_barplot_files <- sigEnrResults %>%
## chop and pad category names
mutate(Term=str_sub(Term, 1, 70) %>% str_pad(70)) %>%
do({
#browser()
    enrResultsGrp <- .
    ## DEBUG enrResultsGrp <- sigEnrResults
    label <- enrResultsGrp$list_id[1]

    if(nrow(enrResultsGrp)==0)  return(data.frame())
    browser()

#    warning(paste0("processing terms", paste(ac(unique(enrResultsGrp$Category)), collapse=",")))
    logPlot <- enrResultsGrp %>%
        ## fix factor order
        mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(as.factor(Category)))) %>%
        ggplot(aes(Term, PValue, fill=Category)) +
        geom_bar(stat="identity")+
        scale_fill_manual(values = term_category_colors, drop=F, name="Ontology") +
        coord_flip() +
        xlab("Enriched Terms") +
        ggtitle(label) +
        scale_y_log10()

    fileNameLabel <- label %>%
        str_replace_all("!=", "ne") %>%
        str_replace_all(">", "gt") %>%
        str_replace_all("<", "lt") %>%
        str_replace_all(fixed("&"), "AND") %>%
        str_replace_all(fixed("|"), "OR") %>%
        str_replace_all(" ", "_")

#        ggsave(paste0("enrichmed_terms__", fileNameLabel, ".pdf"))
#        print(logPlot)

     tmpPng <- paste0("enrterms__", fileNameLabel, ".png")
     ggsave(tmpPng, logPlot, width=10, height = 2+round(nrow(enrResultsGrp)/5), limitsize=FALSE)
     data.frame(file=tmpPng)
})

#+ results="asis"
l_ply(term_barplot_files$file, function(pngFile){ cat(paste0("<img src='", pngFile, "'><br>"))})