commented out enrichement section

41344d37 · Holger Brandl · a7cb6336 · 41344d37
Commit 41344d37 authored 9 years ago by Holger Brandl
--- a/dge_workflow/featcounts_deseq.R
+++ b/dge_workflow/featcounts_deseq.R
@@ -12,7 +12,7 @@ Options:
 --contrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
 --qcutoff <qcutoff>    Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
 --pcutoff <pcutoff>    Override q-value filter and filter by p-value instead
--min_count <min_count>   Minimal expression in any of the sample to be included in the final result list [default: 1]
+--min_count <min_count>   Minimal expression in any of the sample to be included in the final result list [default: 10]
 '

 opts <- docopt(doc, commandArgs(TRUE))
@@ -50,14 +50,16 @@ resultsBase <- count_matrix_file %>% basename() %>% trim_ext(".txt") %>% trim_ex

 countData <- read.delim(count_matrix_file)

-head(countData) %>% kable()
+#' strucutre of input matrix was
+glimpse(countData)
+
 countMatrix <- countData %>% column2rownames("ensembl_gene_id") %>% as.matrix()

 # Expression Filtering
 genesBefore <- nrow(countMatrix)
 countMatrix %<>% filterByExpression(opts$min_count)
 genesAfter <- nrow(countMatrix)
-#' Counts were filtered to only retain genes which had more that `opts$min_count` alignments in at least one replicates. This reduced the number of genes from `r genesBefore' to `r genesAfter`.
+#' Counts were filtered to only retain genes which had more that `r opts$min_count` alignments in at least one replicates. This reduced the number of genes from `r genesBefore` to `r genesAfter`.


 ########################################################################################################################
@@ -110,7 +112,10 @@ dds <- estimateSizeFactors(dds)
 #' For details about size factor normalziation and calculation see https://www.biostars.org/p/79978/

 ##' Size Factors estimated by DESeq
-sizeFactors(dds) %>% set_names(colnames(countMatrix)) %>% melt() %>% rownames2column("sample") %>%  ggplot(aes(sample, value)) + geom_bar(stat="identity") + ggtitle("deseq size factors")
+sizeFactors(dds) %>%
+    set_names(colnames(countMatrix)) %>% melt() %>%
+    rownames2column("sample") %>%
+    ggplot(aes(sample, value)) + geom_bar(stat="identity") + ggtitle("deseq size factors") + coord_flip()

 ##' From the DESeq docs about how size factors are used: The sizeFactors vector assigns to each column of the count matrix a value, the size factor, such that count values in the columns can be brought to a common scale by dividing by the corresponding size factor.

@@ -126,31 +131,33 @@ dds <- DESeq(dds)
 #res <- results(dds)
 #resultsNames(dds)

-#' Model Overview
+#' Model Overview:
 summary(results(dds))

 ########################################################################################################################
-#' Plot dispersions
+#' ## Quality Control
+
+#' ### Data Dispersion
+
+# TODO explain how this plot is useful
 plotDispEsts(dds, main="Dispersion plot")

 ########################################################################################################################
-#' ## PCA and Clustering
-
-# Regularized log transformation for clustering/heatmaps, etc
-rld <- rlogTransformation(dds)
+#' ### PCA and Clustering

 #' In order to assess overall similarity between samples two common statistical methods are used - Principal component analysis (PCA) and clustering.
 #' This should provide answers to the questions: Which samples are similar to each other, which are different? Does this fit to the expectation from the experiment’s design?
 #' Using a regularized log transformation of the raw counts provides the advantage that it stabilizes the variance across the mean.


-#' ### Principal component analysis
 #' Principal component analysis (PCA) is a technique used to emphasize variation and bring out strong patterns in a dataset.
 #' Principal components are the underlying structure in the data. They are the directions where there is the most variance, the directions where the data is most spread out. The data points/samples are projected onto the 2D plane such that they spread out in the two directions that explain most of the differences. The x-axis is the direction that separates the data points the most and the y-axis is a direction that separates the data the second most.
-plotPCA(rld, intgroup = "condition")

+# Regularized log transformation for clustering/heatmaps, etc
+rld <- rlogTransformation(dds)
+
+plotPCA(rld, intgroup = "condition")

-#' ### Sample distance heatmap

 #' The Euclidean distance between samples are calculated after performing the regularized log transformation.
 #' Using the calculated distance matrix, the samples are projected on a two-dimensional graph such that the distance between samples approximately corresponds to the biological coefficient of variation between those samples.
@@ -212,7 +219,7 @@ deResults %>% ggplot(aes(padj)) +
    ggtitle("Adjusted p-value distributions") #+ scale_x_log10()


-#' Set hit criterion
+# report hit criterion
 #+ results='asis'
 if(!is.null(qcutoff)){
    echo("Using q-value cutoff of", qcutoff)
@@ -410,7 +417,7 @@ maData %>% ggplot(aes(0.5*log2(norm_count_1*norm_count_2), log2(norm_count_1/nor
  ggtitle(with(maData[1,], paste(sample_1, "vs", sample_2)))
 }) %$% gg

-#+ fig.height=7*ceiling(length(maPlots)/2)
+#+ fig.height=6*ceiling(length(maPlots)/2)
 multiplot(plotlist=maPlots, cols=min(2, length(maPlots)))


@@ -420,10 +427,10 @@ multiplot(plotlist=maPlots, cols=min(2, length(maPlots)))

 ## TODO later, reenable child-inclusion of enrichment analysis
 ########################################################################################################################
-##' ## Term enrichment Data Preparation
+## ## Term enrichment Data Preparation
 ##+ echo=FALSE
 #
-##' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
+## This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
 #
 #geneLists <- degs %>%
 #    transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))