smoothed deseq workflow

a7cb6336 · Holger Brandl · 9c606c16 · a7cb6336
Commit a7cb6336 authored 9 years ago by Holger Brandl
--- a/dge_workflow/featcounts_deseq.R
+++ b/dge_workflow/featcounts_deseq.R
@@ -33,25 +33,6 @@ require.auto(DT)

 require.auto(gplots)

-##+ results='asis', echo=FALSE
-#cat('
-#<link rel="stylesheet" type="text/css" href="http://cdn.datatables.net/1.10.5/css/jquery.dataTables.min.css">
-#<script type="text/javascript" charset="utf8" src="http://code.jquery.com/jquery-2.1.2.min.js"></script>
-#<script type="text/javascript" charset="utf8" src="http://cdn.datatables.net/1.10.5/js/jquery.dataTables.min.js"></script>
-#
-#<script type="text/javascript">
-#         $(document).ready(function() {
-#            // alert("test")
-#             //$("table").DataTable();
-#             //$("table").DataTable();
-#             //$("#tab_id").DataTable();
-#             $(".dtable").DataTable();
-#         } );
-#</script>
-#')
-
-
-
 count_matrix_file <- opts$count_matrix
 contrasts_file <- opts$contrasts

@@ -61,8 +42,6 @@ if(is.numeric(pcutoff)) opts$qcutoff <- NULL


 ########################################################################################################################
-#' # Differential Expresssion Analysis
-
 #' ## Data preparation

 #' Working Directory: `r getwd()`
@@ -74,41 +53,27 @@ countData <- read.delim(count_matrix_file)
 head(countData) %>% kable()
 countMatrix <- countData %>% column2rownames("ensembl_gene_id") %>% as.matrix()

-#' Apply expression filter
-nrow(countMatrix)
-countMatrix %<>% filterByExpression()
-nrow(countMatrix)
+# Expression Filtering
+genesBefore <- nrow(countMatrix)
+countMatrix %<>% filterByExpression(opts$min_count)
+genesAfter <- nrow(countMatrix)
+#' Counts were filtered to only retain genes which had more that `opts$min_count` alignments in at least one replicates. This reduced the number of genes from `r genesBefore' to `r genesAfter`.


 ########################################################################################################################
-#' # Load annotation data
-
-## load transcriptome annotations needed for results annotation
-geneInfo <- quote({
-  ## mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))§
-  mart <- biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ensembl"))
-  c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
-        biomaRt::getBM(mart=mart)
-}) %>% cache_it("geneInfo")
-
-#filterByExpression <- function(fpkmMat, min_count=1, logMode=F){
-#    if(logMode) fpkmMat<-log10(fpkmMat+1) ## add a pseudocount
-#
-#    geneMax <- apply(fpkmMat, 1, max)
-#
-#    fpkmMat[geneMax>min_count,]
-#}
-#countMatrixFilt  <- filterByExpression(countMatrix, min_count=50)
+#' ## Perform Differential Expression Analysis

 #' See deseq reference [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details

 #' To understand fold-change shrinking and estimation check out
 #' [Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2](http://genomebiology.com/2014/15/12/550/abstract)

-## todo expose as argument
+# TODO add general info about the deseq process/model here
+
+
 get_sample_from_replicate <- function(repName) str_match(repName, "(.*)_[0-9]{1}$")[,2]

-#' Define or load a contrasts matrix
+# Define or load a contrasts matrix
 if(!is.null(contrasts_file)){
    contrasts <- read.delim(contrasts_file, header=T) %>% fac2char()
 }else{
@@ -120,12 +85,9 @@ if(!is.null(contrasts_file)){
    write.delim(contrasts, "dba_contrasts.txt")
 }

-
+#' The used contrasts model is
 contrasts %>% kable()

-########################################################################################################################
-#' ## Perform Differential Expression Analysis
-
 ## gene specific factors
 #normalizationFactors(dds)

@@ -145,9 +107,7 @@ dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition))
 dds <- estimateSizeFactors(dds)


-#' #  Custom Lambda Normalization
-
-#' See https://www.biostars.org/p/79978/
+#' For details about size factor normalziation and calculation see https://www.biostars.org/p/79978/

 ##' Size Factors estimated by DESeq
 sizeFactors(dds) %>% set_names(colnames(countMatrix)) %>% melt() %>% rownames2column("sample") %>%  ggplot(aes(sample, value)) + geom_bar(stat="identity") + ggtitle("deseq size factors")
@@ -167,7 +127,6 @@ dds <- DESeq(dds)
 #resultsNames(dds)

 #' Model Overview
-#+ results='asis'
 summary(results(dds))

 ########################################################################################################################
@@ -339,6 +298,15 @@ deResults %>% ggplot(aes(s1_over_s2_logfc, -log10(pvalue), color=is_hit)) +



+## load transcriptome annotations needed for results annotation
+geneInfo <- quote({
+  ## mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))§
+  mart <- biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ensembl"))
+  c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
+        biomaRt::getBM(mart=mart)
+}) %>% cache_it("geneInfo")
+
+
 # Export the complete dataset for later analysis
 deAnnot <- deResults %>%
  #inner_join(normCounts) %>% 
@@ -430,7 +398,8 @@ save(degs, file=".degs.RData")


 ########################################################################################################################
-#' ### MA Plot
+#' ## MA Plots
+
 #' Redo MA plots but now including hit overlays

 maPlots <- deResults %>% group_by(sample_1, sample_2) %>% do(gg={ maData <-.
@@ -441,7 +410,7 @@ maData %>% ggplot(aes(0.5*log2(norm_count_1*norm_count_2), log2(norm_count_1/nor
  ggtitle(with(maData[1,], paste(sample_1, "vs", sample_2)))
 }) %$% gg

-#+ fig.height=10*ceiling(length(maPlots)/2)
+#+ fig.height=7*ceiling(length(maPlots)/2)
 multiplot(plotlist=maPlots, cols=min(2, length(maPlots)))