Skip to content
Snippets Groups Projects
Commit a4056b0f authored by Holger Brandl's avatar Holger Brandl
Browse files

improved deseq reporting

parent b485c9e1
No related branches found
No related tags found
No related merge requests found
......@@ -9,44 +9,54 @@ Perform a differntial gene expression analysis using deseq2
Usage: featcounts_deseq_mf.R [options] <count_matrix> <design_matrix>
Options:
--contrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--qcutoff <qcutoff> Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff> Override q-value filter and filter by p-value instead
--min_count <min_count> Minimal expression in any of the sample to be included in the final result list [default: 10]
--project <project_prefix> Name to prefix all generated result files [default: ]
--design <exp_design> Name to prefix all generated result files [default: sample]
--contrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--qcutoff <qcutoff> Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff> Override q-value filter and filter by p-value instead
--min_count <min_count> Minimal expression in any of the sample to be included in the final result list [default: 10]
--out <name_prefix> Name to prefix all generated result files [default: ]
--design <formula> Design fomula for DeSeq with contrast attribute at the end [default: sample]
'
opts <- docopt(doc, commandArgs(TRUE))
#opts <- docopt(doc, "--contrasts ~/MPI-CBG_work/P5_DESeq/dba_contrasts_human.txt ~/MPI-CBG_work/P5_DESeq/countMatrix_human.txt")
## opts <- docopt(doc, "countMatrix.txt")
## opts <- docopt(doc, "../mapped/star_counts_matrix.txt")
require(knitr)
require(DESeq2)
## load some packages first because of name-space order
library(knitr)
library(DESeq2)
library(readr)
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.23/R/core_commons.R")
#devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.23/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/master/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.23/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.23/R/bio/diffex_commons.R")
require_auto(gplots)
loadpack(gplots)
## process input arguments
count_matrix_file <- opts$count_matrix
design_matrix_file <- opts$design_matrix
contrasts_file <- opts$contrasts
#resultsBase <- count_matrix_file %>% basename() %>% trim_ext(".txt") %>% trim_ext(".count_matrix")
resultsBase <- if(str_length(opts$project)>0) paste0(opts$project, ".") else ""
resultsBase <- if(str_length(opts$out)>0) paste0(opts$outprefix, ".") else ""
pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff)
qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff)
if(is.numeric(pcutoff)) opts$qcutoff <- NULL
## extract the sample for the design
designFormula <- opts$design
## consider last element of design formula as sample attribute
contrastAttribute <- str_split(designFormula, "[+]") %>% unlist() %>% tail(1)
assert(designFormula!="sample" && !is.null(design_matrix_file), "custom designs are not supported without a design matrix")
#' # Differential Expression Analysis
#' Run configuration was
vec2df(unlist(opts)) %>% filter(!str_detect(name, "^[<-]")) %>% kable()
########################################################################################################################
#' ## Data Preparation
......@@ -114,6 +124,11 @@ if(!is.null(contrasts_file)){
#' The used design matrix is
replicates2samples %>% kable()
#' The used design formula is.
designFormula
#' For more information about designs see section 1.6 "multi-factor designs" in [deseq-manual](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf)
#' The contrasts of interest are
contrasts %>% kable()
......@@ -128,40 +143,41 @@ contrasts %>% kable()
## see "3.11 Sample-/gene-dependent normalization factors" in the DEseq2 manual for details
## old non-mf approach
#colData <- data.frame(condition=colnames(countMatrix) %>% get_sample_from_replicate)
#exDesign <- data.frame(condition=colnames(countMatrix) %>% get_sample_from_replicate)
## new mf-approach
colData <- data_frame(replicate=colnames(countMatrix)) %>% mutate(col_index=row_number()) %>%
exDesign <- data_frame(replicate=colnames(countMatrix)) %>% mutate(col_index=row_number()) %>%
full_join(replicates2samples, by="replicate") %>%
arrange(col_index) #%>% transmute(condition=sample_2ndwt) %>% fac2char
## infer the sample to be used from the formula
#designFormula <- "sample_2ndwt + batch"
designFormula <- opts$design
## consider last one as sample attribute
sampleAttribute <- str_split(designFormula, " ")
colData
#colData %<>% rename(sample, sample_2ndwt)
#exDesign %<>% rename(sample, sample_2ndwt)
#colData <- replicates2samples %>% arrange(colnames(countMatrix))
#exDesign <- replicates2samples %>% arrange(colnames(countMatrix))
# valiadate that contrasts model is compatible with data
#if(!all((contrasts %>% gather %$% value %>% unique) %in% colData$condition)){
# echo("column model: ",colData$condition)
# echo("contrasts: ", contrasts %>% gather %$% value %>% unique)
# stop("column model is not consistent with contrasts")
#}
#stopifnot(all((contrasts %>% gather %$% value %>% unique) %in% colData$condition))
designSamples = as.df(exDesign)[, contrastAttribute] %>% unique
contrastSamples = contrasts %>% gather %$% value %>% unique
if(!all(contrastSamples %in% designSamples)){
warning("column model: ", exDesign)
warning("contrasts: ", contrasts %>% gather %$% value %>% unique)
stop("column model is not consistent with contrasts")
}
#stopifnot(all((contrasts %>% gather %$% value %>% unique) %in% exDesign$condition))
#http://www.cookbook-r.com/Formulas/Creating_a_formula_from_a_string/
dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(as.formula("~ batch + sample_2ndwt")))
dds <- DESeqDataSetFromMatrix(countMatrix, exDesign, formula(as.formula(paste("~", designFormula))))
#dds <- estimateSizeFactors(dds)
#dds <- estimateDispersions(dds)
dds <- DESeq(dds)
# session::save.session("fc_debug.dat");
# session::restore.session("fc_debug.dat");
########################################################################################################################
#' ## Quality Control
......@@ -199,7 +215,7 @@ plotPCA(rld, intgroup = "sample")
#hc <- hclust(distsRL)
#distMatrix <- as.matrix(distsRL)
distMatrix <- as.matrix(dist(t(assay(rld))))
#rownames(distMatrix) <- colnames(distMatrix) <- with(colData(dds), paste(condition, sep=" : "))
#rownames(distMatrix) <- colnames(distMatrix) <- with(exDesign(dds), paste(condition, sep=" : "))
labelcntData <- countData %>%
distinct(ensembl_gene_id) %>% column2rownames("ensembl_gene_id") %>%
......@@ -210,6 +226,9 @@ heatmap.2(distMatrix, labRow=colnames(labelcntData), labCol=colnames(labelcntDat
#key=F, col=colorpanel(100, "black", "white"),
margin=c(8, 8), main="Sample Distance Matrix")
loadpack(d3heatmap)
d3heatmap(labelcntData, scale = "column")
########################################################################################################################
#' # Perform Differential Expression Analysis
......@@ -242,7 +261,7 @@ summary(results(dds))
## extract all de-sets according to our contrasts model
deResults <- adply(contrasts, 1, splat(function(sample_1, sample_2){
results(dds, contrast=c("condition", sample_1, sample_2)) %>%
results(dds, contrast=c(contrastAttribute, sample_1, sample_2)) %>%
rownames2column("ensembl_gene_id") %>%
as.data.frame() %>%
## see http://rpackages.ianhowson.com/bioc/DESeq2/man/results.html when using contrasts argument
......@@ -293,7 +312,7 @@ deResults %<>% mutate(s1_overex=s1_over_s2_logfc>0)
normCounts <- counts(dds,normalized=TRUE) %>%
# set_names(colData(dds)$condition) %>% rownames2column("ensembl_gene_id")
# set_names(exDesign(dds)$condition) %>% rownames2column("ensembl_gene_id")
set_names(colnames(countMatrix)) %>% rownames2column("ensembl_gene_id")
......@@ -369,7 +388,7 @@ deResults %>% ggplot(aes(s1_over_s2_logfc, -log10(pvalue), color=is_hit)) +
# Define absolute binding categories
#rawCounts <- counts(dds,normalized=F) %>%
# set_names(colData(dds)$condition) %>% rownames2column("ensembl_gene_id")
# set_names(exDesign(dds)$condition) %>% rownames2column("ensembl_gene_id")
#ggplot(rawCounts, aes(H3HA_Sphere)) + geom_histogram() + scale_x_log10() + ggtitle("raw H3HA_Sphere counts distribution")
#require_auto(Hmisc)
#
......@@ -377,6 +396,11 @@ deResults %>% ggplot(aes(s1_over_s2_logfc, -log10(pvalue), color=is_hit)) +
#with(bindCats, as.data.frame(table(bind_category))) %>% kable()
## optionally install bioconducor package
if(!any(.packages(all.available=TRUE)=="biomaRt")){
source("http://bioconductor.org/biocLite.R")
biocLite("biomaRt", ask=FALSE)
}
# Load transcriptome annotations needed for results annotation
geneInfo <- quote({
......
......@@ -8,6 +8,7 @@
import joblist.JobConfiguration
import joblist.JobList
import joblist.ResubmitStrategy
import kutils.evalBash
import org.docopt.Docopt
import java.io.File
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment