Skip to content
Snippets Groups Projects
featcounts_deseq.R 16.2 KiB
Newer Older
#!/usr/bin/env Rscript
#+ include=FALSE

Holger Brandl's avatar
Holger Brandl committed
## TODO Melanie: make sure reports can be spinned
## Example: spin.R -c $NGS_TOOLS/dge_workflow/featcounts_deseq.R.R "\"count_matrix.txt""


Holger Brandl's avatar
Holger Brandl committed
suppressMessages(require(docopt))

doc <- '
Convert a featureCounts results matrix into a dge-report using deseq2
Usage: region_dba.R [options] <count_matrix>

Options:
--contrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--qcutoff <qcutoff>    Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff>    Override q-value filter and filter by p-value instead
--min_count <min_count>   Minimal expression in any of the sample to be included in the final result list [default: 1]
'

Holger Brandl's avatar
Holger Brandl committed
#opts <- docopt(doc, commandArgs(TRUE))
opts <- docopt(doc, "--pcutoff 0.05 --contrasts ~/MPI-CBG_work/P5_DESeq/testing/dba_contrasts.txt ~/MPI-CBG_work/P5_DESeq/testing/countMatrix.txt")
Holger Brandl's avatar
Holger Brandl committed
## opts <- docopt(doc, "countMatrix.txt")

require(knitr)
require(DESeq2)

Holger Brandl's avatar
Holger Brandl committed
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/R/bio/diffex_commons.R")

#require.auto(gplots)

#+ results='asis', echo=FALSE
cat('
<link rel="stylesheet" type="text/css" href="http://cdn.datatables.net/1.10.5/css/jquery.dataTables.min.css">
<script type="text/javascript" charset="utf8" src="http://code.jquery.com/jquery-2.1.2.min.js"></script>
<script type="text/javascript" charset="utf8" src="http://cdn.datatables.net/1.10.5/js/jquery.dataTables.min.js"></script>

<script type="text/javascript">
         $(document).ready(function() {
            // alert("test")
             //$("table").DataTable();
             //$("table").DataTable();
             //$("#tab_id").DataTable();
             $(".dtable").DataTable();
         } );
</script>
')



count_matrix_file <- opts$count_matrix
contrasts_file <- opts$contrasts

pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff)
qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff)
if(is.numeric(pcutoff)) opts$qcutoff <- NULL




Holger Brandl's avatar
Holger Brandl committed
########################################################################################################################
#' # Differential Expresssion Analysis

## TODO Melanie
## * similar to example-report (see email) created with "dge_workflow/cuffdiff_report.R"
## * sections to migrate are "Replicate Correlation and Clustering" and "Quality Control"



########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
#' # Differential Expresssion Analysis
#' ## Data preparation

#' Working Directory: `r getwd()`

resultsBase <- count_matrix_file %>% basename() %>% trim_ext(".txt") %>% trim_ext(".count_matrix")

countData <- read.delim(count_matrix_file)
names(countData) <- names(countData) %>% str_replace("[.]1", "")

countMatrix <- countData %>% column2rownames("ensembl_gene_id") %>% as.matrix()

Holger Brandl's avatar
Holger Brandl committed
#' Apply expression filter
nrow(countMatrix)
countMatrix %<>% filterByExpression()
nrow(countMatrix)

########################################################################################################################
#' # Load annotation data

## load transcriptome annotations needed for results annotation
geneInfo <- quote({
  ## mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))§
  mart <- biomaRt::useDataset(guess_mart(ensembl_gene_id), mart = biomaRt::useMart("ensembl")),
                              c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
                                biomaRt::getBM(mart=mart)
}) %>% cache_it("geneInfo")

#filterByExpression <- function(fpkmMat, min_count=1, logMode=F){
#    if(logMode) fpkmMat<-log10(fpkmMat+1) ## add a pseudocount
#
#    geneMax <- apply(fpkmMat, 1, max)
#
#    fpkmMat[geneMax>min_count,]
#}
#countMatrixFilt  <- filterByExpression(countMatrix, min_count=50)

#' See deseq reference [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details

#' To understand fold-change shrinking and estimation check out
#' [Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2](http://genomebiology.com/2014/15/12/550/abstract)

Holger Brandl's avatar
Holger Brandl committed
## todo expose as argument
Holger Brandl's avatar
Holger Brandl committed
get_sample_from_replicate <- function(repName) str_match(repName, "(.*)_[0-9]{1}$")[,2]
#' Define or load a contrasts matrix
if(!is.null(contrasts_file)){
    contrasts <- read.delim(contrasts_file, header=T) %>% fac2char()
}else{
Holger Brandl's avatar
Holger Brandl committed
    contrasts <- data.frame(sample=get_sample_from_replicate(colnames(countMatrix))) %>% distinct() %>%
        merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
        filter(ac(sample_1)>ac(sample_2)) %>%
#        filter(ac(sample_1)!=ac(sample_2)) %>%
        fac2char()
    write.delim(contrasts, "dba_contrasts.txt")
}

contrasts %>% kable()

Holger Brandl's avatar
Holger Brandl committed
########################################################################################################################
#' ## Perform Differential Expression Analysis
Holger Brandl's avatar
Holger Brandl committed
## gene specific factors
#normalizationFactors(dds)

## sizeFactor R help:sigEnrResults
#dds <- makeExampleDESeqDataSet()
#dds <- estimateSizeFactors( dds )
#sizeFactors(dds)

## try again but now use lambda normalization
## see "3.11 Sample-/gene-dependent normalization factors" in the DEseq2 manual for details
Holger Brandl's avatar
Holger Brandl committed
colData <- data.frame(condition=colnames(countMatrix) %>% get_sample_from_replicate)
dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition))
Holger Brandl's avatar
Holger Brandl committed
dds <- estimateSizeFactors(dds)

#' #  Custom Lambda Normalization

#' See https://www.biostars.org/p/79978/

Holger Brandl's avatar
Holger Brandl committed
##' Size Factors estimated by DESeq
sizeFactors(dds) %>% set_names(colnames(countMatrix)) %>% melt() %>% rownames2column("sample") %>%  ggplot(aes(sample, value)) + geom_bar(stat="identity") + ggtitle("deseq size factors")

Holger Brandl's avatar
Holger Brandl committed
##' From the DESeq docs about how size factors are used: The sizeFactors vector assigns to each column of the count matrix a value, the size factor, such that count values in the columns can be brought to a common scale by dividing by the corresponding size factor.
Holger Brandl's avatar
Holger Brandl committed
##' This means that counts are divied by size factors. So let's now load the lambda libraies and replace the predefined size factors with our custom ones

#' From DESeq manual: The regularized log transformation is preferable to the variance stabilizing transformation if the size factors vary widely.

#' Run Deseq Test
#dds <- DESeq(dds, fitType='local', betaPrior=FALSE)
Holger Brandl's avatar
Holger Brandl committed
#dds <- DESeq(dds, fitType='local')
dds <- DESeq(dds)

#res <- results(dds)
#resultsNames(dds)

#' Model Overview
#+ results='asis'
summary(results(dds))

## extract all de-sets according to our contrasts model
deResults <- adply(contrasts, 1,  splat(function(sample_1, sample_2){
#    browser()
   results(dds, contrast=c("condition",sample_1,sample_2)) %>%
        rownames2column("ensembl_gene_id") %>%
        as.data.frame() %>%
        ## see http://rpackages.ianhowson.com/bioc/DESeq2/man/results.html when using contrasts argument
        rename(s1_over_s2_logfc=log2FoldChange) %>%
        mutate(sample_1=sample_1, sample_2=sample_2)
}))


deResults %>% ggplot(aes(s1_over_s2_logfc)) +
    geom_histogram(binwidth=0.1) +
    facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0, color="blue") +
    xlim(-2,2) + 
    ggtitle("sample1 over sampl2 logFC ")



#' ## Significnce of differential binding

deResults %>% ggplot(aes(pvalue)) +
    geom_histogram() +
    xlim(0,1) +
    facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0.01, slope=1, color="blue") +
    ggtitle("p-value distributions") #+ scale_x_log10()

deResults %>% ggplot(aes(padj)) +
    geom_histogram() +
Holger Brandl's avatar
Holger Brandl committed
#    xlim(0,1) +
    facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0.01, slope=1, color="blue") +
Holger Brandl's avatar
Holger Brandl committed
    ggtitle("Adjusted p-value distributions") #+ scale_x_log10()


#' Set hit criterion
#+ results='asis'
if(!is.null(qcutoff)){
    echo("Using q-value cutoff of", qcutoff)
    deResults %<>% transform(is_hit=padj<=qcutoff)
}else{
    echo("Using p-value cutoff of", pcutoff)
    deResults %<>% transform(is_hit=pvalue<=pcutoff)
}
#deResults %<>% mutate(is_hit=pvalue<0.05)

deResults %<>% mutate(s1_overex=s1_over_s2_logfc>1)


normCounts <- counts(dds,normalized=TRUE) %>%
Holger Brandl's avatar
Holger Brandl committed
#    set_names(colData(dds)$condition) %>% rownames2column("ensembl_gene_id")
    set_names(colnames(countMatrix)) %>% rownames2column("ensembl_gene_id")


## .. should be same as input
#filter(countData, ensembl_gene_id=="ENSDARG00000000001")

#' # MA and Volcano plots

# deseq approach
# plotMA(deResults, main="DESeq2", ylim=c(-2,2))

## facet plot
maData <- normCounts %>%
    gather(sample, norm_count, -ensembl_gene_id) %>%
    merge(.,., by="ensembl_gene_id", suffixes=c("_1", "_2")) %>%
#    filter(ac(sample_1)<ac(sample_2)) %>%
    # add diffex status
    merge(deResults)


#+ fig.width=16, fig.height=14
maData %>% ggplot(aes(0.5*log2(norm_count_1*norm_count_2), log2(norm_count_2/norm_count_1), color=pvalue<0.05)) +
    geom_point(alpha=0.3) +
    geom_hline(yintercept=0, color="red") +
    facet_grid(sample_1 ~ sample_2)


##Volcano plots
hitCounts <- filter(deResults, is_hit) %>%
        count(sample_1, sample_2, s1_overex) %>%
        rename(hits=n) %>%
        merge(data.frame(s1_overex=c(T,F), x_pos=c(-2.5,2.5)))

#+ fig.width=16, fig.height=14
deResults %>% ggplot(aes(s1_over_s2_logfc, -log10(pvalue), color=is_hit)) +
    geom_jitter(alpha=0.3, position = position_jitter(height = 0.2)) +
#    theme_bw() +
    xlim(-3,3) +
    scale_color_manual(values = c("TRUE"="red", "FALSE"="black")) +
#    ggtitle("Insm1/Ctrl") +
    ## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend
    guides(colour = guide_legend(override.aes = list(alpha=1))) +

    ## tweak axis labels
    xlab(expression(log[2]("x/y"))) +
    ylab(expression(-log[10]("p value"))) +

    ## add hit couts
    geom_text(aes(label=hits, x=x_pos), y=2, color="red", size=10, data=hitCounts) +
    facet_grid(sample_1 ~ sample_2)



# Define absolute binding categories
Holger Brandl's avatar
Holger Brandl committed
#rawCounts <- counts(dds,normalized=F) %>%
#    set_names(colData(dds)$condition) %>% rownames2column("ensembl_gene_id")
#ggplot(rawCounts, aes(H3HA_Sphere)) + geom_histogram() + scale_x_log10() + ggtitle("raw H3HA_Sphere counts distribution")
#require.auto(Hmisc)
#
#bindCats <- rawCounts %>% transmute(ensembl_gene_id, bind_category=cut2(H3HA_Sphere, cuts=c(10, 100)))
#with(bindCats, as.data.frame(table(bind_category))) %>% kable()




# Export the complete dataset for later analysis
deAnnot <- deResults %>%
    inner_join(normCounts) %>%
Holger Brandl's avatar
Holger Brandl committed
    left_join(geneInfo)

write.delim(deAnnot, file=paste0(resultsBase, ".dba_results.txt"))
#'  [deAnnot](`r paste0(resultsBase, ".dba_results.txt")`)


#' ## Hits Summary

## Extract hits from deseq results
degs <- deAnnot %>% filter(is_hit)

ggplot(degs, aes(paste(sample_1, "vs",  sample_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary") + coord_flip()

degs %>%
    ggplot(aes(paste(sample_1, "vs",  sample_2), fill=s1_overex)) +
    geom_bar(position="dodge") +
    xlab(NULL) + ylab("# DBGs") +
    ggtitle("DBG count summary by direction of expression") +
    coord_flip()

# Export DBA genes
## disabled because we just subset the annotated data now to define degs
#degsAnnot <- degs %>%
#    inner_join(normCounts) %>%
#    left_join(geneInfo) %>%
#    left_join(bindCats)
degs %>% write.delim(file=paste0(resultsBase, ".diffbind_genes.txt"))
#'  [degs](`r paste0(resultsBase, ".diffbind_genes.txt")`)


## render interactive hit browser
#+results='asis'
degs %>%
    left_join(geneInfo) %>%
    mutate(
        igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
    ) %>%
    select(s1_over_s2_logfc, pvalue, ensembl_gene_id, sample_1, sample_2, external_gene_name, description, igv) %>%
    kable("html", table.attr = "class='dtable'", escape=F)




#'  DEGs (differentially expressed genes) are contained in the following table
write.delim(degs, file="degs.txt")
# degs <- read.delim("degs.txt")
#' [Differentially Expressed Genes](degs.txt)

#' ### DEG Counts
#with(degs, as.data.frame(table(sample_1, sample_2, s1_overex))) %>% filter(Freq >0) %>% kable()
degs %>% count( sample_1, sample_2, s1_overex) %>% kable()

#+ fig.height=nrow(contrasts)+2
#ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DEGs by contrast") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=(s1_overex))) + geom_bar() + xlab(NULL) + ylab("# DGEs") + ggtitle("DEGs by contrast") + coord_flip()


#' DEGs can be browsed in Excel using the exported table or via the embedded table browser. To enable the IGV links, you need to set the port in the IGV preferences to 3334.
kableDegs <- degs
if(nrow(degs>2000)){
  kableDegs <- degs %>% arrange(padj) %>% head(1000)
  #  Error: object 'q_value' not found. So: padj used instead of '-q_value'
  print("just showing first 1000 most significant hits (highest p-value) in interactive hit table!!!! Use Excel to browser to browse the full set")
}

#+ results='asis'
kableDegs %>%
  #inner_join(geneLoci) %>%
  mutate(
    ensembl=paste0("<a href='http://www.ensembl.org/Multi/Search/Results?y=0;site=ensembl_all;x=0;page=1;facet_feature_type=Gene;q=",ensembl_gene_id, "'>",ensembl_gene_id, "</a>"),
    igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
  ) %>%
  select(sample_1, sample_2, s1_over_s2_logfc, pvalue, padj, ensembl_gene_id, external_gene_name, description) %>%
  kable("html", table.attr = "class='dtable'", escape=F)

## just needed to restore environment easily
save(degs, file=".degs.RData")
# degs <- local(get(load(".degs.RData")))

##--------------------------------------------------------

#' ### MA Plot
#' Redo MA plots but now including hit overlays

maPlots <- deResults %>% group_by(sample_1, sample_2) %>% do(gg={ maData <-.
## todo why not s2_over_s1_log2fc
maData %>% ggplot(aes(0.5*log2(value_1*value_2), log2(value_1/value_2), color=is_hit)) +
  geom_point(alpha=0.3) +
  geom_hline(yintercept=0, color="red") +
  ggtitle(with(maData[1,], paste(sample_1, "vs", sample_2)))
}) %$% gg

#+ fig.height=10*ceiling(length(maPlots)/2)
multiplot(plotlist=maPlots, cols=min(2, length(maPlots)))

#  Error in eval(expr, envir, enclos) : object 'value_1' not found 
##--------------------------------------------------------



Holger Brandl's avatar
Holger Brandl committed
## TODO Melanie compare with section "Differentially expressed genes" and complement missing bits
Holger Brandl's avatar
Holger Brandl committed
## TODO later, reenable child-inclusion of enrichment analysis
########################################################################################################################
##' ## Term enrichment Data Preparation
##+ echo=FALSE
#
##' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
#
#geneLists <- degs %>%
#    transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))
#
#if(nrow(contrasts)<4){
#    geneLists <- rbind_list(
#        geneLists,
#        degs %>% filter(s1_overex) %>% transmute(ensembl_gene_id, list_id=paste(sample_1, ">", sample_2)),
#        degs %>% filter(!s1_overex) %>% transmute(ensembl_gene_id, list_id=paste(sample_1, "<", sample_2))
#    )
#}
#
### additional overlaps before doing the intersection analysis
#geneLists %>% count(list_id) %>% kable()
#
#intersectLists <- function(geneLists, listIdA, listIdB, intersectListId) {
#    commonGenes <- setdiff(filter(geneLists, list_id==listIdA)$ensembl_gene_id, filter(geneLists, list_id==listIdB)$ensembl_gene_id)
#    data.frame(list_id=intersectListId, ensembl_gene_id=commonGenes)
#}
#
#geneLists %<>% group_by(list_id)
#save(geneLists, file=".enrGeneLists.RData")
## geneLists <- local(get(load("enrGeneLists.RData")))
#
#
### redefine opts arguments and tweak knitr options
#opts_knit$set(root.dir = getwd())
#commandArgs <- function(x) c(paste("--overlay_expr_data ", count_matrix_file, " enrGeneLists.RData"))
##source("/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/dev/common/david_enrichment.R")
## child='/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/dev/common/david_enrichment.Rmd'