Newer
Older
#!/usr/bin/env Rscript
#+ include=FALSE
suppressMessages(require(docopt))
doc <- '
Perform a differntial gene expression analysis using deseq2
Usage: featcounts_deseq_mf.R [options] <count_matrix> <design_matrix>
Options:
--contrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--qcutoff <qcutoff> Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff> Override q-value filter and filter by p-value instead
--min_count <min_count> Minimal expression in any of the sample to be included in the final result list [default: 10]
--out <name_prefix> Name to prefix all generated result files [default: ]
--design <formula> Design fomula for DeSeq with contrast attribute at the end [default: sample]
'
opts <- docopt(doc, commandArgs(TRUE))
## load some packages first because of name-space order
library(knitr)
library(DESeq2)
#devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.23/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.24/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.24/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.24/R/bio/diffex_commons.R")
count_matrix_file <- opts$count_matrix
design_matrix_file <- opts$design_matrix
contrasts_file <- opts$contrasts
resultsBase <- if(str_length(opts$out)>0) paste0(opts$out, ".") else ""
pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff)
qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff)
if(is.numeric(pcutoff)) opts$qcutoff <- NULL
## extract the sample for the design
designFormula <- opts$design
## consider last element of design formula as sample attribute
contrastAttribute <- str_split(designFormula, "[+]") %>% unlist() %>% tail(1)
assert(designFormula=="sample" || !is.null(design_matrix_file), "custom designs are not supported without a design matrix")
#' Run configuration was
vec2df(unlist(opts)) %>% filter(!str_detect(name, "^[<-]")) %>% kable()
########################################################################################################################
#' ## Data Preparation
#' The detection of differentially expressed genes is performed using the R package DEseq2 (http://bioconductor.org/packages/release/bioc/html/DESeq2.html), which requires a count matrix as input.
#' The count data are presented as a table which reports, for each sample, the number of sequence fragments that have been assigned to each gene.
#' An important analysis question is the quantification and statistical inference of systematic changes between conditions, as compared to within-condition variability.
#' Working Directory: `r getwd()`
## save a backup into the current working directory as a reference
countData %>% write_tsv(paste0(resultsBase, "input_counts.txt"))
#' Structure of input matrix was
glimpse(countData, width=60)
countMatrix <- countData %>% column2rownames("ensembl_gene_id") %>% as.matrix()
# Expression Filtering
genesBefore <- nrow(countMatrix)
countMatrix %<>% filterByExpression(opts$min_count)
genesAfter <- nrow(countMatrix)
#' Counts were filtered to only retain genes which had more that `r opts$min_count` alignments in at least one replicate. This reduced the number of genes from `r genesBefore` to `r genesAfter`.
########################################################################################################################
#' DESeq is an R package to analyse count data from high-throughput sequencing assays such as RNA-Seq and test for differential expression. The latest version, DESeq2, is used.
#' DESeq uses a model based on the negative binomial distribution and offers the following features:
#' Count data is discrete and skewed and is hence not well approximated by a normal distribution. Thus, a test based on the negative binomial distribution, which can reflect these properties, has much higher power to detect differential expression.
#' Tests for differential expression between experimental conditions should take into account both technical and biological variability. Recently, several authors have claimed that the Poisson distribution can be used for this purpose. However, tests based on the Poisson assumption (this includes the binomial test and the chi-squared test) ignore the biological sampling variance, leading to incorrectly optimistic p values. The negative binomial distribution is a generalisation of the Poisson model that allows to model biological variance correctly.
#' DESeq estimate the variance in a local fashion, using different coefficients of variation for different expression strengths. This removes potential selection biases in the hit list of differentially expressed genes, and gives a more balanced and accurate result.
#' See deseq reference [docs](http://master.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) for details
#' To understand fold-change shrinking and estimation check out
#' [Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2](http://genomebiology.com/2014/15/12/550/abstract)
if(!is.null(design_matrix_file)){
replicates2samples <- read_tsv(design_matrix_file) #%>% select(replicate, sample)
}else{
get_sample_from_replicate <- function(repName) str_match(repName, "(.*)_([R]|rep)?[0-9]{1,2}$")[,2]
replicates2samples <- data_frame(replicate=colnames(countMatrix)) %>% mutate(sample=get_sample_from_replicate(replicate)) # %>% distinct()
}
# Define or load a contrasts matrix
if(!is.null(contrasts_file)){
stopifnot(is.null(design_matrix_file)) ## cant work yet because sample column with have random name
contrasts <- distinct(replicates2samples, sample) %>% select(sample) %>%
merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
filter(ac(sample_1)>ac(sample_2)) %>%
# filter(ac(sample_1)!=ac(sample_2)) %>%
fac2char()
write_tsv(contrasts, paste(resultsBase, "contrasts.txt"))
#' The used design formula is.
designFormula
#' For more information about designs see section 1.6 "multi-factor designs" in [deseq-manual](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf)
contrasts %>% kable()
## gene specific factors
#normalizationFactors(dds)
## sizeFactor R help:sigEnrResults
#dds <- makeExampleDESeqDataSet()
#dds <- estimateSizeFactors( dds )
#sizeFactors(dds)
## see "3.11 Sample-/gene-dependent normalization factors" in the DEseq2 manual for details
#exDesign <- data.frame(condition=colnames(countMatrix) %>% get_sample_from_replicate)
exDesign <- data_frame(replicate=colnames(countMatrix)) %>% mutate(col_index=row_number()) %>%
full_join(replicates2samples, by="replicate") %>%
arrange(col_index) #%>% transmute(condition=sample_2ndwt) %>% fac2char
## infer the sample to be used from the formula
#designFormula <- "sample_2ndwt + batch"
#exDesign <- replicates2samples %>% arrange(colnames(countMatrix))
# valiadate that contrasts model is compatible with data
designSamples = as.df(exDesign)[, contrastAttribute] %>% unique %>% sort
contrastSamples = contrasts %>% gather %$% value %>% unique %>% sort
warning("design : ", paste(designSamples, collapse=", "))
warning("contrasts: ", paste(contrastSamples, collapse=", "))
stop("experiment design is not consistent with contrasts")
}
#stopifnot(all((contrasts %>% gather %$% value %>% unique) %in% exDesign$condition))
#http://www.cookbook-r.com/Formulas/Creating_a_formula_from_a_string/
dds <- DESeqDataSetFromMatrix(countMatrix, exDesign, formula(as.formula(paste("~", designFormula))))
#dds <- estimateSizeFactors(dds)
#dds <- estimateDispersions(dds)
dds <- DESeq(dds)
# session::save.session("fc_debug.dat");
# session::restore.session("fc_debug.dat");
########################################################################################################################
#' ## Quality Control
#' ### Size Factors
##' Size Factors estimated by DESeq
sizeFactors(dds) %>%
set_names(colnames(countMatrix)) %>% melt() %>%
rownames2column("sample") %>%
ggplot(aes(sample, value)) + geom_bar(stat="identity") + ggtitle("deseq size factors") + coord_flip()
# From DESeq manual: The regularized log transformation is preferable to the variance stabilizing transformation if the size factors vary widely.
#' For details about size factor normalziation and calculation see https://www.biostars.org/p/79978/
##' From the DESeq docs about how size factors are used: The sizeFactors vector assigns to each column of the count matrix a value, the size factor, such that count values in the columns can be brought to a common scale by dividing by the corresponding size factor. Counts are divied by size factors.
#' ### Data Dispersion
#' Plot of the per-gene dispersion estimates together with the fitted mean-dispersion relationship.
#' The dispersion plot shows how the estimates are shrunk from the gene wise values (black dots) toward the fitted estimates, with the final values used in testing being the blue dots.
#' The dispersion can be understood as the square of the coefficient of biological variation. So, if a gene's expression typically differs from replicate to replicate sample by 20%, this gene's dispersion is: .20^2 = .04.
## The function estimateDispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value.
#' The estimation of dispersions performs three steps. First, it estimates, for each gene and each (replicated) condition, a dispersion value, then, it fits, for each condition, a curve through the estimates. Finally, it assigns to each gene a dispersion value, using either the estimated or the fitted value.
plotDispEsts(dds, main="Dispersion plot")
## todo add comparison to non-corrected model here using LTR as detailed out in bioinfo/rnaseq_diffex.md:16
########################################################################################################################
#' ### PCA and Clustering
#' In order to assess overall similarity between samples two common statistical methods are used - Principal component analysis (PCA) and clustering.
#' This should provide answers to the questions: Which samples are similar to each other, which are different? Does this fit to the expectation from the experiment’s design?
#' Using a regularized log transformation of the raw counts provides the advantage that it stabilizes the variance across the mean.
#' Principal component analysis (PCA) is a technique used to emphasize variation and bring out strong patterns in a dataset.
#' Principal components are the underlying structure in the data. They are the directions where there is the most variance, the directions where the data is most spread out. The data points/samples are projected onto the 2D plane such that they spread out in the two directions that explain most of the differences. The x-axis is the direction that separates the data points the most and the y-axis is a direction that separates the data the second most.
# Regularized log transformation for clustering/heatmaps, etc
rld <- rlogTransformation(dds)
#plotPCA(rld, intgroup = "sample")
plotPCA(rld, intgroup = contrastAttribute)
#' The Euclidean distance between samples are calculated after performing the regularized log transformation.
#' Using the calculated distance matrix, the samples are projected on a two-dimensional graph such that the distance between samples approximately corresponds to the biological coefficient of variation between those samples.
#' More information can be found here: https://en.wikipedia.org/wiki/Principal_component_analysis
#distsRL <- dist(t(assay(rld)))
#hc <- hclust(distsRL)
#distMatrix <- as.matrix(distsRL)
distMatrix <- as.matrix(dist(t(assay(rld))))
#rownames(distMatrix) <- colnames(distMatrix) <- with(exDesign(dds), paste(condition, sep=" : "))
labelcntData <- countData %>%
distinct(ensembl_gene_id) %>% column2rownames("ensembl_gene_id") %>%
fac2char %>% data.matrix() %>% round() %>% data.frame()
heatmap.2(distMatrix, labRow=colnames(labelcntData), labCol=colnames(labelcntData),
symm=TRUE, trace="none",
#key=F, col=colorpanel(100, "black", "white"),
margin=c(8, 8), main="Sample Distance Matrix")
rownames(distMatrix) <- colnames(labelcntData)
colnames(distMatrix) <- colnames(labelcntData)
## todo hide column dendrogram but keep cluster-order
#distMatrix %>% d3heatmap(xaxis_height=1, Colv = T, dendrogram="row")
distMatrix %>% d3heatmap(xaxis_height=1)
########################################################################################################################
#' # Perform Differential Expression Analysis
#' Run Deseq Test
#dds <- DESeq(dds, fitType='local', betaPrior=FALSE)
#dds <- DESeq(dds, fitType='local')
#dds <- DESeq(dds)
#res <- results(dds)
#resultsNames(dds)
#' Model Overview:
summary(results(dds))
## extract all de-sets according to our contrasts model
deResults <- alply(contrasts, 1, splat(function(sample_1, sample_2){
results(dds, contrast=c(contrastAttribute, sample_1, sample_2)) %>%
rownames2column("ensembl_gene_id") %>%
as.data.frame() %>%
## see http://rpackages.ianhowson.com/bioc/DESeq2/man/results.html when using contrasts argument
rename(s1_over_s2_logfc=log2FoldChange) %>%
mutate(sample_1=sample_1, sample_2=sample_2) #%>% head %>% print_head
})) %>% bind_rows %>% tbl_df
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
# deResults <- adply(contrasts, 1, splat(function(sample_1, sample_2){
# echo('samples', sample_1, sample_2)
# return(data.frame())
# }))
deResults %>% ggplot(aes(s1_over_s2_logfc)) +
geom_histogram(binwidth=0.1) +
facet_grid(sample_1 ~ sample_2) + geom_vline(xintercept=0, color="blue") +
xlim(-2,2) +
ggtitle("sample1 over sampl2 logFC ")
########################################################################################################################
#' ## Significnce of differential binding
#deResults %>% ggplot(aes(pvalue)) +
# geom_histogram() +
# xlim(0,1) +
# facet_grid(sample_1 ~ sample_2) + geom_vline(xintercept=0.01, slope=1, color="blue") +
# ggtitle("p-value distributions") #+ scale_x_log10()
#
#deResults %>% ggplot(aes(padj)) +
# geom_histogram() +
## xlim(0,1) +
# facet_grid(sample_1 ~ sample_2) + geom_vline(xintercept=0.01, slope=1, color="blue") +
# ggtitle("Adjusted p-value distributions") #+ scale_x_log10()
# report hit criterion
#+ results='asis'
if(!is.null(qcutoff)){
echo("Using q-value cutoff of", qcutoff)
deResults %<>% transform(is_hit=padj<=qcutoff)
}else{
echo("Using p-value cutoff of", pcutoff)
deResults %<>% transform(is_hit=pvalue<=pcutoff)
}
#deResults %<>% mutate(is_hit=pvalue<0.05)
deResults %<>% mutate(s1_overex=s1_over_s2_logfc>0)
normCounts <- counts(dds,normalized=TRUE) %>%
# set_names(exDesign(dds)$condition) %>% rownames2column("ensembl_gene_id")
set_names(colnames(countMatrix)) %>% rownames2column("ensembl_gene_id")
normCounts %>% write_tsv(paste0(resultsBase, "sizefac_normalized_by_replicate.txt"))
## .. should be same as input
#filter(countData, ensembl_gene_id=="ENSDARG00000000001")
########################################################################################################################
#' # MA and Volcano plots
#' MA-plot: The log2 fold change for a particular comparison is plotted on the y-axis and the average of the counts normalized by log2 is shown on the x-axis (“M” for minus, because a log ratio is equal to log minus log, and “A” for average). Each gene is represented with a dot. Genes with an adjusted p value below a certain threshold are shown in cyan (True).
#' This plot demonstrates that only genes with a large average normalized count contain sufficient information to yield a significant call.
# deseq approach
# plotMA(deResults, main="DESeq2", ylim=c(-2,2))
# Calculation of the base means per condition
#https://stat.ethz.ch/R-manual/R-devel/library/base/html/attr.html
exDesign$replicate
#conSamplesOrdered = levels(dds$sample)
conSamplesOrdered = as.df(exDesign)[, contrastAttribute] %>% unique
baseMeanPerLvl <- sapply(conSamplesOrdered, function(lvl) rowMeans( counts(dds,normalized=TRUE)[,conSamplesOrdered == lvl] ) ) %>%
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
rownames2column("ensembl_gene_id")
## add base means to diffßex summary
#numResultsBeforeMerge <- nrow(deResults)
deResults <- baseMeanPerLvl %>%
gather(sample, norm_count, -ensembl_gene_id) %>%
merge(.,., by="ensembl_gene_id", suffixes=c("_1", "_2")) %>%
# filter(ac(sample_1)<ac(sample_2)) %>%
# add diffex status
merge(deResults)
#+ fig.width=16, fig.height=14
deResults %>% ggplot(aes(0.5*log2(norm_count_1*norm_count_2), log2(norm_count_2/norm_count_1), color=pvalue<0.05)) +
geom_point(alpha=0.3) +
geom_hline(yintercept=0, color="red") +
facet_grid(sample_1 ~ sample_2)
#deResults %$% pvalue %>% log10() %>% quantile(0.05, na.rm=T)
#' A volcano plot displays unstandardized signal (e.g. log-fold-change) against noise-adjusted/standardized signal (e.g. t-statistic or -log(10)(p-value) from the t-test).
#' This scatter plot is used to quickly identify changes in large datasets composed of replicate data.
#' Here we have log2-fold-change on the X-axis and -log10 of the p-value on the y-axis.
#' This results in datapoints with low p-values (highly significant) appearing toward the top of the plot. For the x-axis the log of the fold-change is used so that changes in both directions (left and right) appear equidistant from the center.
#' Therefore interesting points are those, which are found toward the top of the plot that are far to either the left- or the right-hand side. These represent values that display large magnitude fold changes (hence being left- or right- of center) as well as high statistical significance (hence being toward the top).
#' see: https://en.wikipedia.org/wiki/Volcano_plot_%28statistics%29)
##Volcano plots
hitCounts <- filter(deResults, is_hit) %>%
count(sample_1, sample_2, s1_overex) %>%
rename(hits=n) %>%
merge(data.frame(s1_overex=c(T,F), x_pos=c(-2.5,2.5)))
#+ fig.width=16, fig.height=14
deResults %>% ggplot(aes(s1_over_s2_logfc, -log10(pvalue), color=is_hit)) +
geom_jitter(alpha=0.3, position = position_jitter(height = 0.2)) +
# theme_bw() +
xlim(-3,3) +
scale_color_manual(values = c("TRUE"="red", "FALSE"="black")) +
# ggtitle("Insm1/Ctrl") +
## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend
guides(colour = guide_legend(override.aes = list(alpha=1))) +
## tweak axis labels
xlab(expression(log[2]("x/y"))) +
ylab(expression(-log[10]("p value"))) +
# ylim(0,50) +
## add hit couts
geom_text(aes(label=hits, x=x_pos), y=2, color="red", size=10, data=hitCounts) +
facet_grid(sample_1 ~ sample_2)
# Define absolute binding categories
#rawCounts <- counts(dds,normalized=F) %>%
# set_names(exDesign(dds)$condition) %>% rownames2column("ensembl_gene_id")
#ggplot(rawCounts, aes(H3HA_Sphere)) + geom_histogram() + scale_x_log10() + ggtitle("raw H3HA_Sphere counts distribution")
#require_auto(Hmisc)
#
#bindCats <- rawCounts %>% transmute(ensembl_gene_id, bind_category=cut2(H3HA_Sphere, cuts=c(10, 100)))
#with(bindCats, as.data.frame(table(bind_category))) %>% kable()
## optionally install bioconducor package
if(!any(.packages(all.available=TRUE)=="biomaRt")){
source("http://bioconductor.org/biocLite.R")
biocLite("biomaRt", ask=FALSE)
}
# Load transcriptome annotations needed for results annotation
geneInfo <- quote({
## mart <- biomaRt::useDataset("drerio_gene_ensembl", mart = biomaRt::useMart("ensembl"))§
# mart <- biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ensembl"))
## todo fix this https://support.bioconductor.org/p/74322/
mart <- biomaRt::useDataset(guess_mart(countData$ensembl_gene_id), mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org"))
c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
biomaRt::getBM(mart=mart)
}) %>% cache_it("geneInfo")
## save as reference for downstream analysis
write_tsv(geneInfo, path=paste0(resultsBase, "geneInfo.txt"))
#' Export counts per replicate as double-normalized FPKMs=(read_count * 10^9)/(gene_length * library_size)
fpkms <- countMatrix %>%
rownames2column("ensembl_gene_id") %>%
gather(replicate, num_tags, -ensembl_gene_id) %>% tbl_df %>%
left_join(transmute(geneInfo, ensembl_gene_id, gene_length=end_position-start_position), by="ensembl_gene_id") %>%
# left_join(mutate(geneInfo, ensembl_gene_id, gene_length=end_position-start_position), by="ensembl_gene_id") %>%
## calculate normalized counts
group_by(replicate) %>%
mutate(lib_size=sum(num_tags), fpkm=((1E9/lib_size)*(num_tags/gene_length)) %>% round(digits=3)) %>%
ungroup()
#fpkms %>% ggplot(aes(fpkm)) + geom_histogram() + scale_x_log10(labels=comma)
write_tsv(fpkms, paste0(resultsBase, "fpkms_by_replicate.long.txt"))
fpkms %>%
transmute(ensembl_gene_id, replicate, fpkm) %>%
spread(replicate, fpkm) %>%
write_tsv(paste0(resultsBase, "fpkms_by_replicate.wide.txt"))
#' Also export averaged counts per sample as double-normalized FPKMs=(read_count * 10^9)/(gene_length * library_size)
sampleFpkms <- countMatrix %>%
rownames2column("ensembl_gene_id") %>%
gather(replicate, num_tags, -ensembl_gene_id) %>% tbl_df %>%
## grouping if sample column is known to be called sample
# mutate(sample=get_sample_from_replicate(replicate)) %>%
# group_by(sample, ensembl_gene_id) %>%
## grouping with nse
inner_join(exDesign, by="replicate") %>%
group_by_(.dots=c(contrastAttribute, "ensembl_gene_id")) %>%
summarize(median_tags=median(num_tags)) %>%
left_join(transmute(geneInfo, ensembl_gene_id, gene_length=end_position-start_position), by="ensembl_gene_id") %>%
group_by(sample) %>%
mutate(lib_size=sum(median_tags), fpkm=((1E9/lib_size)*(median_tags/gene_length)) %>% round(digits=3)) %>%
ungroup()
#fpkms %>% ggplot(aes(fpkm)) + geom_histogram() + scale_x_log10(labels=comma)
write_tsv(sampleFpkms, paste0(resultsBase, "fpkms_by_sample.long.txt"))
sampleFpkms %>%
transmute(ensembl_gene_id, sample, fpkm) %>%
spread(sample, fpkm) %>% write_tsv(paste0(resultsBase, "fpkms_by_sample.wide.txt"))
#' [FPKMs as table](fpkms.long.txt) [FPKMs as Matrix](fpkms.wide.txt)
# Export the complete dataset for later analysis
deAnnot <- deResults %>%
#inner_join(normCounts) %>%
#merge(., contrasts) %>%
#merge(.,., suffixes=c("_1", "_2"), by=NULL) %>%
#inner_join(baseMeanPerLvl, id='ensembl_gene_id') %>%
left_join(geneInfo)
write_tsv(deAnnot, path=paste0(resultsBase, "de_results.txt"))
#' [deAnnot](`r paste0(resultsBase, "dba_results.txt")`)
## also export results which we'll always need for pathways overlays
deAnnot %>% transmute(
ensembl_gene_id,
contrast=paste(sample_1, "vs", sample_2),
plot_score=-log10(pvalue)*ifelse(s1_overex, 1, -1)
) %>% spread(contrast, plot_score) %>%
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
#deAnnot %>% filter(ensembl_gene_id=="FBgn0000015")
## todo understand purpose and effeect of indpendentFiltering (see https://support.bioconductor.org/p/57128/)
#deResults %>% count(is_hit)
#deAnnot %>% count(is_hit)
#deAnnot %>% count(is.na(padj))
########################################################################################################################
#' ## Hits Summary
## Extract hits from deseq results
degs <- deAnnot %>% filter(is_hit)
# ggplot(degs, aes(paste(sample_1, "vs", sample_2))) + geom_bar() +xlab(NULL) + ylab("# DBGs") + ggtitle("DBG count summary") + coord_flip()
# degs %>%
# ggplot(aes(paste(sample_1, "vs", sample_2), fill=s1_overex)) +
# geom_bar(position="dodge") +
# xlab(NULL) + ylab("# DBGs") +
# ggtitle("DBG count summary by direction of expression") +
# coord_flip()
# Export DBA genes
## disabled because we just subset the annotated data now to define degs
#degsAnnot <- degs %>%
# inner_join(normCounts) %>%
# left_join(geneInfo) %>%
# left_join(bindCats)
degs %>% write_tsv(paste0(resultsBase, "diffex_genes.txt"))
#' [Differentially Expressed Genes](`r paste0(resultsBase, "diffbind_genes.txt")`)
#unloadNamespace("dplyr"); require(dplyr)
## export slim hit list for downstream enrichment analysis
degs %>% transmute(ensembl_gene_id, contrast=paste(sample_1, "vs", sample_2)) %>% write_tsv(paste0(resultsBase, "degs_by_contrast.txt"))
## also export gsea inputs
deResults %>% mutate(contrast=paste(sample_1, "vs", sample_2)) %>%
arrange(contrast, -padj) %>%
select(contrast, ensembl_gene_id) %>%
write_tsv(paste0(resultsBase, "gsea__genes_by_contrast__undirected.txt"))
deResults %>% mutate(contrast=paste(sample_1, "vs", sample_2)) %>%
arrange(contrast, padj*ifelse(s1_overex, 1, -1)) %>%
select(contrast, ensembl_gene_id) %>%
write_tsv(paste0(resultsBase, "gsea__genes_by_contrast__directed.txt"))
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
# ## render interactive hit browser
# #+results='asis'
# degs %>%
# left_join(geneInfo) %>%
# mutate(
# igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
# ) %>%
# select(s1_over_s2_logfc, pvalue, ensembl_gene_id, sample_1, sample_2, external_gene_name, description, igv) %>%
# # kable("html", table.attr = "class='dtable'", escape=F)
# datatable(escape=F)
#' ### DEG Counts
#+ fig.height=nrow(contrasts)/2+2
#ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DEGs by contrast") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=(s1_overex))) + geom_bar() + xlab(NULL) + ylab("# DGEs") + ggtitle("DEGs by contrast") + coord_flip()
#with(degs, as.data.frame(table(sample_1, sample_2, s1_overex))) %>% filter(Freq >0) %>% kable()
degs %>% count( sample_1, sample_2, s1_overex) %>% kable()
#' DEGs (differentially expressed genes) are contained in the following table
#' DEGs can be browsed in Excel using the exported table or via the embedded table browser. To enable the IGV links, you need to set the port in the IGV preferences to 3334.
kableDegs <- degs
if(nrow(degs>2000)){
kableDegs <- degs %>% arrange(padj) %>% head(1000)
# Error: object 'q_value' not found. So: padj used instead of '-q_value'
print("just showing first 1000 most significant hits (highest p-value) in interactive hit table!!!! Use Excel to browser to browse the full set")
}
#+ results='asis'
kableDegs %>%
#inner_join(geneLoci) %>%
mutate(
ensembl=paste0("<a href='http://www.ensembl.org/Multi/Search/Results?y=0;site=ensembl_all;x=0;page=1;facet_feature_type=Gene;q=",ensembl_gene_id, "'>",ensembl_gene_id, "</a>"),
igv=paste0("<a href='http://localhost:60151/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
) %>%
select(sample_1, sample_2, s1_over_s2_logfc, pvalue, padj, ensembl_gene_id, external_gene_name, description) %>%
# kable("html", table.attr = "class='dtable'", escape=F)
datatable(escape=F)
## just needed to restore environment easily
#save(degs, file=".degs.RData")
# degs <- local(get(load(".degs.RData")))
########################################################################################################################
#' ## MA Plots
#' Redo MA plots but now including hit overlays
maPlots <- deResults %>% group_by(sample_1, sample_2) %>% do(gg={ maData <-.
## todo why not s2_over_s1_log2fc
maData %>% ggplot(aes(0.5*log2(norm_count_1*norm_count_2), log2(norm_count_1/norm_count_2), color=is_hit)) +
geom_point(alpha=0.3) +
geom_hline(yintercept=0, color="red") +
ggtitle(with(maData[1,], paste(sample_1, "vs", sample_2)))
}) %$% gg
#+ fig.height=6*ceiling(length(maPlots)/2)
multiplot(plotlist=maPlots, cols=min(2, length(maPlots)))
#'
##--------------------------------------------------------
## Useful link: https://gist.github.com/stephenturner/f60c1934405c127f09a6
## TODO later, reenable child-inclusion of enrichment analysis
########################################################################################################################
## ## Term enrichment Data Preparation
##+ echo=FALSE
#
## This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
#
#geneLists <- degs %>%
# transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))
#
#if(nrow(contrasts)<4){
# geneLists <- rbind_list(
# geneLists,
# degs %>% filter(s1_overex) %>% transmute(ensembl_gene_id, list_id=paste(sample_1, ">", sample_2)),
# degs %>% filter(!s1_overex) %>% transmute(ensembl_gene_id, list_id=paste(sample_1, "<", sample_2))
# )
#}
#
### additional overlaps before doing the intersection analysis
#geneLists %>% count(list_id) %>% kable()
#
#intersectLists <- function(geneLists, listIdA, listIdB, intersectListId) {
# commonGenes <- setdiff(filter(geneLists, list_id==listIdA)$ensembl_gene_id, filter(geneLists, list_id==listIdB)$ensembl_gene_id)
# data.frame(list_id=intersectListId, ensembl_gene_id=commonGenes)
#}
#
#geneLists %<>% group_by(list_id)
#save(geneLists, file=".enrGeneLists.RData")
## geneLists <- local(get(load("enrGeneLists.RData")))
#
#
### redefine opts arguments and tweak knitr options
#opts_knit$set(root.dir = getwd())
#commandArgs <- function(x) c(paste("--overlay_expr_data ", count_matrix_file, " enrGeneLists.RData"))
##source("/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/dev/common/david_enrichment.R")
## child='/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/dev/common/david_enrichment.Rmd'