Newer
Older
#!/usr/bin/env Rscript
#+ include=FALSE
suppressMessages(require(docopt))
doc <- '
Convert a featureCounts results matrix into a dge-report using deseq2
Usage: region_dba.R [options] <count_matrix>
Options:
--contrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--qcutoff <qcutoff> Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff> Override q-value filter and filter by p-value instead
--min_count <min_count> Minimal expression in any of the sample to be included in the final result list [default: 1]
'
#opts <- docopt(doc, "--contrasts ~/MPI-CBG_work/P5_DESeq/dba_contrasts_human.txt ~/MPI-CBG_work/P5_DESeq/countMatrix_human.txt")
require(knitr)
require(DESeq2)
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/R/bio/diffex_commons.R")
require.auto(gplots)
##+ results='asis', echo=FALSE
#cat('
#<link rel="stylesheet" type="text/css" href="http://cdn.datatables.net/1.10.5/css/jquery.dataTables.min.css">
#<script type="text/javascript" charset="utf8" src="http://code.jquery.com/jquery-2.1.2.min.js"></script>
#<script type="text/javascript" charset="utf8" src="http://cdn.datatables.net/1.10.5/js/jquery.dataTables.min.js"></script>
#
#<script type="text/javascript">
# $(document).ready(function() {
# // alert("test")
# //$("table").DataTable();
# //$("table").DataTable();
# //$("#tab_id").DataTable();
# $(".dtable").DataTable();
# } );
#</script>
#')
count_matrix_file <- opts$count_matrix
contrasts_file <- opts$contrasts
pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff)
qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff)
if(is.numeric(pcutoff)) opts$qcutoff <- NULL
########################################################################################################################
#' Working Directory: `r getwd()`
resultsBase <- count_matrix_file %>% basename() %>% trim_ext(".txt") %>% trim_ext(".count_matrix")
countData <- read.delim(count_matrix_file)
countMatrix <- countData %>% column2rownames("ensembl_gene_id") %>% as.matrix()
#' Apply expression filter
nrow(countMatrix)
countMatrix %<>% filterByExpression()
nrow(countMatrix)
########################################################################################################################
#' # Load annotation data
## load transcriptome annotations needed for results annotation
geneInfo <- quote({
## mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))§
mart <- biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ensembl"))
c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
biomaRt::getBM(mart=mart)
#filterByExpression <- function(fpkmMat, min_count=1, logMode=F){
# if(logMode) fpkmMat<-log10(fpkmMat+1) ## add a pseudocount
#
# geneMax <- apply(fpkmMat, 1, max)
#
# fpkmMat[geneMax>min_count,]
#}
#countMatrixFilt <- filterByExpression(countMatrix, min_count=50)
#' See deseq reference [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details
#' To understand fold-change shrinking and estimation check out
#' [Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2](http://genomebiology.com/2014/15/12/550/abstract)
get_sample_from_replicate <- function(repName) str_match(repName, "(.*)_[0-9]{1}$")[,2]
#' Define or load a contrasts matrix
if(!is.null(contrasts_file)){
contrasts <- read.delim(contrasts_file, header=T) %>% fac2char()
}else{
contrasts <- data.frame(sample=get_sample_from_replicate(colnames(countMatrix))) %>% distinct() %>%
merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
filter(ac(sample_1)>ac(sample_2)) %>%
# filter(ac(sample_1)!=ac(sample_2)) %>%
fac2char()
write.delim(contrasts, "dba_contrasts.txt")
}
########################################################################################################################
#' ## Perform Differential Expression Analysis
#normalizationFactors(dds)
## sizeFactor R help:sigEnrResults
#dds <- makeExampleDESeqDataSet()
#dds <- estimateSizeFactors( dds )
#sizeFactors(dds)
## try again but now use lambda normalization
## see "3.11 Sample-/gene-dependent normalization factors" in the DEseq2 manual for details
colData <- data.frame(condition=colnames(countMatrix) %>% get_sample_from_replicate)
# valiadate that contrasts model is compatible with data
stopifnot(all((contrasts %>% gather %$% value %>% unique) %in% colData$condition))
dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition))
#' # Custom Lambda Normalization
#' See https://www.biostars.org/p/79978/
sizeFactors(dds) %>% set_names(colnames(countMatrix)) %>% melt() %>% rownames2column("sample") %>% ggplot(aes(sample, value)) + geom_bar(stat="identity") + ggtitle("deseq size factors")
##' From the DESeq docs about how size factors are used: The sizeFactors vector assigns to each column of the count matrix a value, the size factor, such that count values in the columns can be brought to a common scale by dividing by the corresponding size factor.
##' This means that counts are divied by size factors. So let's now load the lambda libraies and replace the predefined size factors with our custom ones
#' From DESeq manual: The regularized log transformation is preferable to the variance stabilizing transformation if the size factors vary widely.
#' Run Deseq Test
#dds <- DESeq(dds, fitType='local', betaPrior=FALSE)
#dds <- DESeq(dds, fitType='local')
dds <- DESeq(dds)
#res <- results(dds)
#resultsNames(dds)
#' Model Overview
#+ results='asis'
summary(results(dds))
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
########################################################################################################################
#' Plot dispersions
plotDispEsts(dds, main="Dispersion plot")
########################################################################################################################
#' ## PCA and Clustering
# Regularized log transformation for clustering/heatmaps, etc
rld <- rlogTransformation(dds)
#' In order to assess overall similarity between samples two common statistical methods are used - Principal component analysis (PCA) and clustering.
#' This should provide answers to the questions: Which samples are similar to each other, which are different? Does this fit to the expectation from the experiment’s design?
#' Using a regularized log transformation of the raw counts provides the advantage that it stabilizes the variance across the mean.
#' ### Principal component analysis
#' Principal component analysis (PCA) is a technique used to emphasize variation and bring out strong patterns in a dataset.
#' Principal components are the underlying structure in the data. They are the directions where there is the most variance, the directions where the data is most spread out. The data points/samples are projected onto the 2D plane such that they spread out in the two directions that explain most of the differences. The x-axis is the direction that separates the data points the most and the y-axis is a direction that separates the data the second most.
plotPCA(rld, intgroup = "condition")
#' ### Sample distance heatmap
#' The Euclidean distance between samples are calculated after performing the regularized log transformation.
#' Using the calculated distance matrix, the samples are projected on a two-dimensional graph such that the distance between samples approximately corresponds to the biological coefficient of variation between those samples.
#distsRL <- dist(t(assay(rld)))
#hc <- hclust(distsRL)
#distMatrix <- as.matrix(distsRL)
distMatrix <- as.matrix(dist(t(assay(rld))))
#rownames(distMatrix) <- colnames(distMatrix) <- with(colData(dds), paste(condition, sep=" : "))
labelcntData <- countData %>%
distinct(ensembl_gene_id) %>% column2rownames("ensembl_gene_id") %>%
fac2char %>% data.matrix() %>% round() %>% data.frame()
heatmap.2(distMatrix, labRow=colnames(labelcntData), labCol=colnames(labelcntData),
symm=TRUE, trace="none",
#key=F, col=colorpanel(100, "black", "white"),
margin=c(8, 8), main="Sample Distance Matrix")
########################################################################################################################
## extract all de-sets according to our contrasts model
deResults <- adply(contrasts, 1, splat(function(sample_1, sample_2){
# browser()
results(dds, contrast=c("condition", sample_1, sample_2)) %>%
rownames2column("ensembl_gene_id") %>%
as.data.frame() %>%
## see http://rpackages.ianhowson.com/bioc/DESeq2/man/results.html when using contrasts argument
rename(s1_over_s2_logfc=log2FoldChange) %>%
mutate(sample_1=sample_1, sample_2=sample_2)
}))
# deResults <- adply(contrasts, 1, splat(function(sample_1, sample_2){
# echo('samples', sample_1, sample_2)
# return(data.frame())
# }))
deResults %>% ggplot(aes(s1_over_s2_logfc)) +
geom_histogram(binwidth=0.1) +
facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0, color="blue") +
xlim(-2,2) +
ggtitle("sample1 over sampl2 logFC ")
########################################################################################################################
#' ## Significnce of differential binding
deResults %>% ggplot(aes(pvalue)) +
geom_histogram() +
xlim(0,1) +
facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0.01, slope=1, color="blue") +
ggtitle("p-value distributions") #+ scale_x_log10()
deResults %>% ggplot(aes(padj)) +
geom_histogram() +
facet_grid(sample_1 ~ sample_2) + geom_vline(yintercept=0.01, slope=1, color="blue") +
ggtitle("Adjusted p-value distributions") #+ scale_x_log10()
#' Set hit criterion
#+ results='asis'
if(!is.null(qcutoff)){
echo("Using q-value cutoff of", qcutoff)
deResults %<>% transform(is_hit=padj<=qcutoff)
}else{
echo("Using p-value cutoff of", pcutoff)
deResults %<>% transform(is_hit=pvalue<=pcutoff)
}
#deResults %<>% mutate(is_hit=pvalue<0.05)
deResults %<>% mutate(s1_overex=s1_over_s2_logfc>1)
normCounts <- counts(dds,normalized=TRUE) %>%
# set_names(colData(dds)$condition) %>% rownames2column("ensembl_gene_id")
set_names(colnames(countMatrix)) %>% rownames2column("ensembl_gene_id")
## .. should be same as input
#filter(countData, ensembl_gene_id=="ENSDARG00000000001")
########################################################################################################################
#' # MA and Volcano plots
# deseq approach
# plotMA(deResults, main="DESeq2", ylim=c(-2,2))
baseMeanPerLvl <- sapply( levels(dds$condition), function(lvl) rowMeans( counts(dds,normalized=TRUE)[,dds$condition == lvl] ) ) %>%
rownames2column("ensembl_gene_id") # %>% melt()
## add base means to diffßex summary
deResults <- baseMeanPerLvl %>%
gather(sample, norm_count, -ensembl_gene_id) %>%
merge(.,., by="ensembl_gene_id", suffixes=c("_1", "_2")) %>%
# filter(ac(sample_1)<ac(sample_2)) %>%
# add diffex status
merge(deResults)
#+ fig.width=16, fig.height=14
deResults %>% ggplot(aes(0.5*log2(norm_count_1*norm_count_2), log2(norm_count_2/norm_count_1), color=pvalue<0.05)) +
geom_point(alpha=0.3) +
geom_hline(yintercept=0, color="red") +
facet_grid(sample_1 ~ sample_2)
deResults %$% pvalue %>% log10() %>% quantile(0.05, na.rm=T)
##Volcano plots
hitCounts <- filter(deResults, is_hit) %>%
merge(data.frame(s1_overex=c(T,F), x_pos=c(-2.5,2.5)))
#+ fig.width=16, fig.height=14
deResults %>% ggplot(aes(s1_over_s2_logfc, -log10(pvalue), color=is_hit)) +
geom_jitter(alpha=0.3, position = position_jitter(height = 0.2)) +
# theme_bw() +
xlim(-3,3) +
scale_color_manual(values = c("TRUE"="red", "FALSE"="black")) +
# ggtitle("Insm1/Ctrl") +
## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend
guides(colour = guide_legend(override.aes = list(alpha=1))) +
## tweak axis labels
xlab(expression(log[2]("x/y"))) +
ylab(expression(-log[10]("p value"))) +
## add hit couts
geom_text(aes(label=hits, x=x_pos), y=2, color="red", size=10, data=hitCounts) +
facet_grid(sample_1 ~ sample_2)
# Define absolute binding categories
#rawCounts <- counts(dds,normalized=F) %>%
# set_names(colData(dds)$condition) %>% rownames2column("ensembl_gene_id")
#ggplot(rawCounts, aes(H3HA_Sphere)) + geom_histogram() + scale_x_log10() + ggtitle("raw H3HA_Sphere counts distribution")
#require.auto(Hmisc)
#
#bindCats <- rawCounts %>% transmute(ensembl_gene_id, bind_category=cut2(H3HA_Sphere, cuts=c(10, 100)))
#with(bindCats, as.data.frame(table(bind_category))) %>% kable()
# Export the complete dataset for later analysis
deAnnot <- deResults %>%
#inner_join(normCounts) %>%
#merge(., contrasts) %>%
#merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
#inner_join(baseMeanPerLvl, id='ensembl_gene_id') %>%
left_join(geneInfo)
write.delim(deAnnot, file=paste0(resultsBase, ".dba_results.txt"))
#' [deAnnot](`r paste0(resultsBase, ".dba_results.txt")`)
########################################################################################################################
#' ## Hits Summary
## Extract hits from deseq results
degs <- deAnnot %>% filter(is_hit)
ggplot(degs, aes(paste(sample_1, "vs", sample_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary") + coord_flip()
degs %>%
ggplot(aes(paste(sample_1, "vs", sample_2), fill=s1_overex)) +
geom_bar(position="dodge") +
xlab(NULL) + ylab("# DBGs") +
ggtitle("DBG count summary by direction of expression") +
coord_flip()
# Export DBA genes
## disabled because we just subset the annotated data now to define degs
#degsAnnot <- degs %>%
# inner_join(normCounts) %>%
# left_join(geneInfo) %>%
# left_join(bindCats)
degs %>% write.delim(file=paste0(resultsBase, ".diffbind_genes.txt"))
#' [degs](`r paste0(resultsBase, ".diffbind_genes.txt")`)
## render interactive hit browser
#+results='asis'
degs %>%
left_join(geneInfo) %>%
mutate(
igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
) %>%
select(s1_over_s2_logfc, pvalue, ensembl_gene_id, sample_1, sample_2, external_gene_name, description, igv) %>%
# kable("html", table.attr = "class='dtable'", escape=F)
datatable(escape=F)
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
#' DEGs (differentially expressed genes) are contained in the following table
write.delim(degs, file="degs.txt")
# degs <- read.delim("degs.txt")
#' [Differentially Expressed Genes](degs.txt)
#' ### DEG Counts
#with(degs, as.data.frame(table(sample_1, sample_2, s1_overex))) %>% filter(Freq >0) %>% kable()
degs %>% count( sample_1, sample_2, s1_overex) %>% kable()
#+ fig.height=nrow(contrasts)+2
#ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DEGs by contrast") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=(s1_overex))) + geom_bar() + xlab(NULL) + ylab("# DGEs") + ggtitle("DEGs by contrast") + coord_flip()
#' DEGs can be browsed in Excel using the exported table or via the embedded table browser. To enable the IGV links, you need to set the port in the IGV preferences to 3334.
kableDegs <- degs
if(nrow(degs>2000)){
kableDegs <- degs %>% arrange(padj) %>% head(1000)
# Error: object 'q_value' not found. So: padj used instead of '-q_value'
print("just showing first 1000 most significant hits (highest p-value) in interactive hit table!!!! Use Excel to browser to browse the full set")
}
#+ results='asis'
kableDegs %>%
#inner_join(geneLoci) %>%
mutate(
ensembl=paste0("<a href='http://www.ensembl.org/Multi/Search/Results?y=0;site=ensembl_all;x=0;page=1;facet_feature_type=Gene;q=",ensembl_gene_id, "'>",ensembl_gene_id, "</a>"),
igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
) %>%
select(sample_1, sample_2, s1_over_s2_logfc, pvalue, padj, ensembl_gene_id, external_gene_name, description) %>%
# kable("html", table.attr = "class='dtable'", escape=F)
datatable(escape=F)
## just needed to restore environment easily
save(degs, file=".degs.RData")
# degs <- local(get(load(".degs.RData")))
########################################################################################################################
#' ### MA Plot
#' Redo MA plots but now including hit overlays
maPlots <- deResults %>% group_by(sample_1, sample_2) %>% do(gg={ maData <-.
## todo why not s2_over_s1_log2fc
maData %>% ggplot(aes(0.5*log2(norm_count_1*norm_count_2), log2(norm_count_1/norm_count_2), color=is_hit)) +
geom_point(alpha=0.3) +
geom_hline(yintercept=0, color="red") +
ggtitle(with(maData[1,], paste(sample_1, "vs", sample_2)))
}) %$% gg
#+ fig.height=10*ceiling(length(maPlots)/2)
multiplot(plotlist=maPlots, cols=min(2, length(maPlots)))
##--------------------------------------------------------
## Useful link: https://gist.github.com/stephenturner/f60c1934405c127f09a6
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
## TODO later, reenable child-inclusion of enrichment analysis
########################################################################################################################
##' ## Term enrichment Data Preparation
##+ echo=FALSE
#
##' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
#
#geneLists <- degs %>%
# transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))
#
#if(nrow(contrasts)<4){
# geneLists <- rbind_list(
# geneLists,
# degs %>% filter(s1_overex) %>% transmute(ensembl_gene_id, list_id=paste(sample_1, ">", sample_2)),
# degs %>% filter(!s1_overex) %>% transmute(ensembl_gene_id, list_id=paste(sample_1, "<", sample_2))
# )
#}
#
### additional overlaps before doing the intersection analysis
#geneLists %>% count(list_id) %>% kable()
#
#intersectLists <- function(geneLists, listIdA, listIdB, intersectListId) {
# commonGenes <- setdiff(filter(geneLists, list_id==listIdA)$ensembl_gene_id, filter(geneLists, list_id==listIdB)$ensembl_gene_id)
# data.frame(list_id=intersectListId, ensembl_gene_id=commonGenes)
#}
#
#geneLists %<>% group_by(list_id)
#save(geneLists, file=".enrGeneLists.RData")
## geneLists <- local(get(load("enrGeneLists.RData")))
#
#
### redefine opts arguments and tweak knitr options
#opts_knit$set(root.dir = getwd())
#commandArgs <- function(x) c(paste("--overlay_expr_data ", count_matrix_file, " enrGeneLists.RData"))
##source("/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/dev/common/david_enrichment.R")
## child='/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/dev/common/david_enrichment.Rmd'