polished version

d801a060 · Melanie Schneider · dac77bc0 · d801a060
Commit d801a060 authored 9 years ago by Melanie Schneider
--- a/dge_workflow/featcounts_deseq.R
+++ b/dge_workflow/featcounts_deseq.R
@@ -6,7 +6,7 @@ suppressMessages(require(docopt))

 doc <- '
 Convert a featureCounts results matrix into a dge-report using deseq2
-Usage: region_dba.R [options] <count_matrix>
+Usage: featcounts_deseq.R [options] <count_matrix>

 Options:
 --contrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
@@ -65,13 +65,18 @@ genesAfter <- nrow(countMatrix)
 ########################################################################################################################
 #' ## Perform Differential Expression Analysis

+
+#' DESeq is an R package to analyse count data from high-throughput sequencing assays such as RNA-Seq and test for differential expression. The latest version, DESeq2, is used.
+#' DESeq uses a model based on the negative binomial distribution and offers the following features:
+#' Count data is discrete and skewed and is hence not well approximated by a normal distribution. Thus, a test based on the negative binomial distribution, which can reflect these properties, has much higher power to detect differential expression.
+#' Tests for differential expression between experimental conditions should take into account both technical and biological variability. Recently, several authors have claimed that the Poisson distribution can be used for this purpose. However, tests based on the Poisson assumption (this includes the binomial test and the chi-squared test) ignore the biological sampling variance, leading to incorrectly optimistic p values. The negative binomial distribution is a generalisation of the Poisson model that allows to model biological variance correctly. 
+#' DESeq estimate the variance in a local fashion, using different coefficients of variation for different expression strengths. This removes potential selection biases in the hit list of differentially expressed genes, and gives a more balanced and accurate result.
+
 #' See deseq reference [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details

 #' To understand fold-change shrinking and estimation check out
 #' [Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2](http://genomebiology.com/2014/15/12/550/abstract)

-# TODO add general info about the deseq process/model here
-

 get_sample_from_replicate <- function(repName) str_match(repName, "(.*)_[R]?[0-9]{1}$")[,2]

@@ -140,6 +145,11 @@ summary(results(dds))
 #' ### Data Dispersion

 # TODO explain how this plot is useful
+#' Plot of the per-gene dispersion estimates together with the fitted mean-dispersion relationship.
+#' The dispersion plot shows how the estimates are shrunk from the gene wise values (black dots) toward the fitted estimates, with the final values used in testing being the blue dots.
+#' The dispersion can be understood as the square of the coefficient of biological variation.  So, if a gene's expression typically differs from replicate to replicate sample by 20%, this gene's dispersion is: .20^2 = .04.
+## The function estimateDispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value.
+dds <- estimateDispersions(dds)
 plotDispEsts(dds, main="Dispersion plot")

 ########################################################################################################################
@@ -247,8 +257,9 @@ normCounts <- counts(dds,normalized=TRUE) %>%
 # deseq approach
 # plotMA(deResults, main="DESeq2", ylim=c(-2,2))

+# Calculation of the base means per condition
 baseMeanPerLvl <- sapply( levels(dds$condition), function(lvl) rowMeans( counts(dds,normalized=TRUE)[,dds$condition == lvl] ) ) %>% 
-  rownames2column("ensembl_gene_id") # %>% melt()
+  rownames2column("ensembl_gene_id")

 ## add base means to diffßex summary
 deResults <- baseMeanPerLvl %>%
@@ -258,7 +269,6 @@ deResults <- baseMeanPerLvl %>%
    # add diffex status
    merge(deResults)

-
 #+ fig.width=16, fig.height=14
 deResults %>% ggplot(aes(0.5*log2(norm_count_1*norm_count_2), log2(norm_count_2/norm_count_1), color=pvalue<0.05)) +
    geom_point(alpha=0.3) +
@@ -267,6 +277,8 @@ deResults %>% ggplot(aes(0.5*log2(norm_count_1*norm_count_2), log2(norm_count_2/


 deResults %$% pvalue %>%  log10() %>% quantile(0.05, na.rm=T)
+
+#' For the volcano plot we have fold change on the X-axis and -log10 of the p-value on the y-axis.
 ##Volcano plots
 hitCounts <- filter(deResults, is_hit) %>%
        count(sample_1, sample_2, s1_overex) %>%
@@ -305,7 +317,7 @@ deResults %>% ggplot(aes(s1_over_s2_logfc, -log10(pvalue), color=is_hit)) +



-## load transcriptome annotations needed for results annotation
+# Load transcriptome annotations needed for results annotation
 geneInfo <- quote({
  ## mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))§
  mart <- biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ensembl"))
@@ -332,14 +344,14 @@ write.delim(deAnnot, file=paste0(resultsBase, ".dba_results.txt"))
 ## Extract hits from deseq results
 degs <- deAnnot %>% filter(is_hit)

-ggplot(degs, aes(paste(sample_1, "vs",  sample_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary") + coord_flip()
+# ggplot(degs, aes(paste(sample_1, "vs",  sample_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary") + coord_flip()

-degs %>%
-    ggplot(aes(paste(sample_1, "vs",  sample_2), fill=s1_overex)) +
-    geom_bar(position="dodge") +
-    xlab(NULL) + ylab("# DBGs") +
-    ggtitle("DBG count summary by direction of expression") +
-    coord_flip()
+# degs %>%
+#     ggplot(aes(paste(sample_1, "vs",  sample_2), fill=s1_overex)) +
+#     geom_bar(position="dodge") +
+#     xlab(NULL) + ylab("# DBGs") +
+#     ggtitle("DBG count summary by direction of expression") +
+#     coord_flip()

 # Export DBA genes
 ## disabled because we just subset the annotated data now to define degs
@@ -352,18 +364,26 @@ degs %>% write.delim(file=paste0(resultsBase, ".diffbind_genes.txt"))

 unloadNamespace("dplyr"); require(dplyr)

-## render interactive hit browser
-#+results='asis'
-degs %>%
-    left_join(geneInfo) %>%
-    mutate(
-        igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
-    ) %>%
-    select(s1_over_s2_logfc, pvalue, ensembl_gene_id, sample_1, sample_2, external_gene_name, description, igv) %>%
-#    kable("html", table.attr = "class='dtable'", escape=F)
-    datatable(escape=F)
+# ## render interactive hit browser
+# #+results='asis'
+# degs %>%
+#     left_join(geneInfo) %>%
+#     mutate(
+#         igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
+#     ) %>%
+#     select(s1_over_s2_logfc, pvalue, ensembl_gene_id, sample_1, sample_2, external_gene_name, description, igv) %>%
+# #    kable("html", table.attr = "class='dtable'", escape=F)
+#     datatable(escape=F)


+#' ### DEG Counts
+
+#+ fig.height=nrow(contrasts)+2
+#ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DEGs by contrast") + coord_flip()
+ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=(s1_overex))) + geom_bar() + xlab(NULL) + ylab("# DGEs") + ggtitle("DEGs by contrast") + coord_flip()
+
+#with(degs, as.data.frame(table(sample_1, sample_2, s1_overex))) %>% filter(Freq >0) %>% kable()
+degs %>% count( sample_1, sample_2, s1_overex) %>% kable()


 #'  DEGs (differentially expressed genes) are contained in the following table
@@ -371,14 +391,6 @@ write.delim(degs, file="degs.txt")
 # degs <- read.delim("degs.txt")
 #' [Differentially Expressed Genes](degs.txt)

-#' ### DEG Counts
-#with(degs, as.data.frame(table(sample_1, sample_2, s1_overex))) %>% filter(Freq >0) %>% kable()
-degs %>% count( sample_1, sample_2, s1_overex) %>% kable()
-
-#+ fig.height=nrow(contrasts)+2
-#ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DEGs by contrast") + coord_flip()
-ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=(s1_overex))) + geom_bar() + xlab(NULL) + ylab("# DGEs") + ggtitle("DEGs by contrast") + coord_flip()
-

 #' DEGs can be browsed in Excel using the exported table or via the embedded table browser. To enable the IGV links, you need to set the port in the IGV preferences to 3334.
 kableDegs <- degs