Skip to content
Snippets Groups Projects
Commit 2bc38c7e authored by Holger Brandl's avatar Holger Brandl
Browse files

cont better dge report

parent f9f0b9cd
No related branches found
No related tags found
No related merge requests found
......@@ -5,22 +5,22 @@ suppressMessages(require(docopt))
doc <- '
Convert a cuffdiff into a dge-report
Usage: dge_analysis.R [options] <cuffdb_directory>
Usage: cuffdiff_report.R [options] <cuffdb_directory>
Options:
--constrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--directed Perform directed dge-steps in a directed manner
--qcutoff <qcutoff> Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
--pcutoff <pcutoff> Override q-value filter and filter by p-value instead
--minfpkm <minfpkm> Minimal expression in any of the sample to be included in the final result list [default: 1]
'
#--directed Perform directed dge-steps in a directed manner
#-S Dont do general statistics and qc analysis
#print("args ars")
#print(commandArgs(TRUE))
opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
# florio_11b: setwd("/lustre/projects/bioinfo/holger/projects/florio_11b/cuffdiff/dge_report_new"); opts <- docopt(doc, "--constrasts contrasts.txt --pcutoff 0.01 /projects/bioinfo/holger/projects/florio_11b/cuffdiff")
## florio_11b: setwd("/lustre/projects/bioinfo/holger/projects/florio_11b/cuffdiff/dge_report_new"); opts <- docopt(doc, "--constrasts contrasts.txt --pcutoff 0.01 /projects/bioinfo/holger/projects/florio_11b/cuffdiff")
# opts <- docopt(doc, "--undirected ..")
# opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), ".."))
## todo use commandArgs here!!
......@@ -28,10 +28,12 @@ opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
#print(opts)
skipQC <<- opts$S
split_hit_list <- !opts$undirected
#split_hit_list <- opts$directed
constrasts_file <- opts$constrasts
pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff)
qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff)
if(is.numeric(pcutoff)) opts$qcutoff <- NULL
minFPKM <- as.numeric(opts$minfpkm)
......@@ -45,9 +47,9 @@ if(is.na(file.info(cuffdb_directory)$isdir)){
}
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.5/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.5/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.5/R/bio/diffex_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.7/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.7/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.7/R/bio/diffex_commons.R")
#+ results='asis', echo=FALSE
cat('
......@@ -73,22 +75,24 @@ geneInfo <- quote({
biomaRt::getBM(c('ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'), mart=mart)
}) %>% cache_it("gene_info")
geneLoci <- quote({
martName <- guess_mart(fpkm(genes(cuff))$gene_id)
mart <- biomaRt::useDataset(martName, mart = biomaRt::useMart("ensembl"))
biomaRt::getBM(c("ensembl_gene_id", "chromosome_name", "start_position", "end_position"), mart=mart)
}) %>% cache_it("gene_loci")
#knitr::opts_knit$set(root.dir = getwd())
#' Used DGE Analysis Paramters
vec2df(unlist(opts)) %>% filter(!str_detect(name, "^[<-]")) %>% kable()
########################################################################################################################
#' # Differential Gene Expression Analysis
print("Configuration")
vec2df(unlist(opts)) %>% filter(!str_detect(name, "^[<-]")) %>% kable()
#' [CuffDiff](http://cufflinks.cbcb.umd.edu/manual.html) and [cummeRbund](http://compbio.mit.edu/cummeRbund/) were used to perform this analysis. For details about how the test for differntial expression works, refer to [Differential analysis of gene regulation at transcript resolution with RNA-seq](http://www.nature.com/nbt/journal/v31/n1/full/nbt.2450.html).
#print(paste("db dir is", cuffdb_directory))
#' Used cuffdiff database in `r cuffdb_directory`
#+ load_db, cache=FALSE, warning=FALSE
isCluster=Sys.getenv("LSF_SERVERDIR")!=""
dbTempCopyRequired=isCluster | str_detect(normalizePath(cuffdb_directory), fixed("/Volumes/projects"))
......@@ -101,37 +105,42 @@ if(dbTempCopyRequired){
cuff <- readCufflinks(dir="..")
}
#' The expression tracking reference contained:
cuff
#' ### Samples
runInfo(cuff) %>% filter(param!="cmd_line") %>% kable()
#' cuffdiff cmd was
#' `r runInfo(cuff)$cmd_line`
#+ results='asis'
cat((runInfo(cuff) %>% filter(param=="cmd_line"))$value %>% gsub('(.{1,80})(,|$|\\s)', '\\1\n', .))
#+
replicateInfo <- replicates(cuff) %>%
mutate(file=basename(file)) %>%
select(-c(total_mass, norm_mass, internal_scale, external_scale))
replicateInfo %>% kable("html", table.attr = "class='dtable'", escape=F)
replicateInfo %>% kable() # kable("html", table.attr = "class='dtable'", escape=F)
write.delim(replicateInfo, file="replicate_info.txt")
# replicateInfo <- read.delim("replicate_info.txt")
#' [replicateInfo](replicate_info.txt)
#+ results='asis'
## define or load the contrasts
dbContrasts <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct()
if(!is.null(constrasts_file)){
echo("using contrasts matrix from: ", basename(constrasts_file))
echo("Using contrasts matrix from: ", basename(constrasts_file))
contrasts <- read.csv(constrasts_file, header=F) %>% set_names(c("sample_1", "sample_2"))
## filter for the direction present in cuffdb
contrasts <- rbind_list(contrasts, contrasts %>% select(sample_1=sample_2, sample_2=sample_1)) %>% inner_join(dbContrasts)
}else{
echo("comparing all samples against each other")
echo("Comparing all samples against each other")
contrasts <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct()
}
#' The contrasts are
#' The used contrasts model is
contrasts %>% kable()
......@@ -140,20 +149,22 @@ contrasts %>% kable()
#' All data has been annotated with ensembl gene information and was exprorted into . Both normalized (FPKM) and raw counts were exported.
#+ results='asis'
fpkmMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
print_head() %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %T>%
head_html() %>%
write.delim(file="gene_fpkms.txt")
# gene_fpkms <- read.delim("gene_fpkms.txt")
#' [Normalized Counts](gene_fpkms.txt)
## export the same but now including replicate information
#+ results='asis'
repFpkmMatrix(genes(cuff)) %>%
rownames2column("ensembl_gene_id") %>%
merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
print_head() %>%
head_html() %>%
write.delim(file="gene_fpkms_by_replicate.txt")
# gene_fpkms_by_replicate <- read.delim("gene_fpkms_by_replicate.txt")
#' [Normalized Counts by Replicate](gene_fpkms_by_replicate.txt)
......@@ -196,38 +207,37 @@ csBoxplot(genes(cuff),replicates=T)
csScatterMatrix(genes(cuff))
#csVolcano(genes(cuff),"a", "b")
#' For the volcano plot matrix we calculate fold-changes as x/y (aka column/row). For details see [here](http://rpackages.ianhowson.com/bioc/cummeRbund/man/csVolcano.html)
csVolcanoMatrix(genes(cuff))
#' We also plot more detailed scatter plots for just the contrasts for interest. Signifcance coloring is based on cuffdiff's internal score an not our hit criterion.
#csVolcano(genes(cuff),"hESC","Fibroblasts")
csVolcPlots <- alply(contrasts, 1, splat(function(sample_1, sample_2) {
csVolcano(genes(cuff),sample_1,sample_2)
}))
#' fig.width=16
#+ fig.width=16
multiplot(plotlist=csVolcPlots, cols=min(3, length(csVolcPlots)))
#' We also plot more detailed scatter plots for contrasts for interest
#csScatter(genes(cuff),"hESC","Fibroblasts",smooth=T)
#csScatter(genes(cuff), "a", "d", smooth=F) + aes(color=abs(log2(a/b))>2) + scale_colour_manual(values=c("darkgreen", "red"))
#MAplot(genes(cuff), "a", "d")+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: a/d")
#MAplot(genes(cuff), "a", "b", useCount=T) #+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: Luc/KD")
#csVolcano(genes(cuff),"aRGm", "N") + ggtitle("odd line-shaped fan-out in volcano plot")
## todo consider to add more pretty volcano plots (see stefania's extra plots
#getGenes(cuff,c("ENSMUSG00000097853")) %>% expressionBarplot()
#csVolcano(genes(cuff),"a", "b")
## todo replace or complement with marta plot
csVolcanoMatrix(genes(cuff))
#csVolcano(genes(cuff),"a", "b")
#######################################################################################################################
#' ## Replicate Correlation and Clustering
#' For most type of experiments we expect replicates to cluster together. Second similar biological conditions should cluster together. As clustering results depend on the method being used, we try a few here to check if we get consistent results.
## Map replicates into 2-dimensional space to highlight replicate clusters (or the lack of them)
#' Map replicates into 2-dimensional space to highlight replicate clusters (or the lack of them)
require.auto(edgeR)
plotMDS(repFpkmMatrix(genes(cuff)), top=500, main="top500 MDS")
......@@ -274,14 +284,16 @@ plot(res, main="Replicate Clustering of Expressed Genes")
#######################################################################################################################
#' ## Differentially expressed genes
#' Here we are using [cuffdiff](http://cole-trapnell-lab.github.io/cufflinks/cuffdiff) to test for differential expression. Estimate variance-mean dependence in count data from high-throughput sequencing assays and test for differential expression based on a model using the negative binomial distribution.
allDiff <- diffData(genes(cuff)) %>%
## just keep pairs of interest
merge(contrasts) %>%
## discard all genes that are not expressed in any of the cell types (>1)
filter(gene_id %in% getExpressedGenes(cuff, minFPKM=minFPKM)) %>%
dplyr::rename(ensembl_gene_id=gene_id) %>%
mutate(sample_1_overex=log2_fold_change<0)
dplyr::rename(ensembl_gene_id=gene_id, s2_over_s1_log2fc=log2_fold_change) %>%
mutate(sample_1_overex=s2_over_s1_log2fc<0)
#+ results='asis'
if(!is.null(qcutoff)){
......@@ -295,9 +307,9 @@ if(!is.null(qcutoff)){
merge(allDiff, geneInfo, by="ensembl_gene_id", all.x=T) %>% write.delim(file="diff_data.txt")
# diff_data <- read.delim("diff_data.txt")
#' [Unfiltered CuffDiff Results](diff_data.txt)
#
#' Filter for expressed genes
# Filter for DEGs (differentially expressed genes)
degs <- allDiff %>%
## Filter for expressed genes
filter(is_hit) %>%
......@@ -306,28 +318,82 @@ degs <- allDiff %>%
left_join(geneInfo, by="ensembl_gene_id")
#' DEGs (differentially expressed genes) are contained in the following table
write.delim(degs, file="degs.txt")
# degs <- read.delim("degs.txt")
#' [Differentially Expressed Genes](degs.txt)
#' ### DEG Counts
#with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0) %>% kable()
degs %>% dplyr::count( sample_1, sample_2, sample_1_overex) %>% kable()
#+ fig.height=nrow(contrasts)+2
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DEGs by contrast") + coord_flip()
ggplot(degs, aes(paste(sample_1, "vs", sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DGEs") + ggtitle("DEGs by direction") + coord_flip()
#' Hit Browser:
#+ results='as.is'
#' DEGs can be browsed in Excel using the exported table or via the embedded table browser. To enable the IGV links, you need to set the port in the IGV preferences to 3334.
#+ results='asis'
degs %>%
mutate(ensembl=paste0("<a href='http://www.ensembl.org/Search/Results?facet_species=Elephant;page=1;facet_feature_type=Gene;q=",ensembl_gene_id, "'>",ensembl_gene_id, "</a>")) %>%
transmute(sample_1, sample_2, value_1, value_2, s2_over_s1_logfc=log2_fold_change, p_value, q_value, ensembl, external_gene_name, description, gene_biotype) %>%
inner_join(geneLoci) %>%
mutate(
ensembl=paste0("<a href='http://www.ensembl.org/Multi/Results?page=1;facet_feature_type=Gene;q=",ensembl_gene_id, "'>",ensembl_gene_id, "</a>"),
igv=paste0("<a href='http://localhost:3334/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
) %>%
select(sample_1, sample_2, value_1, value_2, s2_over_s1_log2fc, p_value, q_value, ensembl, external_gene_name, description, gene_biotype, igv) %>%
kable("html", table.attr = "class='dtable'", escape=F)
with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0) %>% kable()
## just needed to restore environment easily
save(degs, file=".degs.RData")
# degs <- local(get(load(".degs.RData")))
#' Also redo volcano plots with out hit overlays
#+ fig.width=16, fig.height=14
csVolcPlots <- alply(contrasts, 1, splat(function(sample_1, sample_2) {
csVolcano(genes(cuff),sample_1,sample_2)
}))
maPlots <- allDiff %>% group_by(sample_1, sample_2) %>% do(gg={ maData <-.
## todo why not s2_over_s1_log2fc
maData %>% ggplot(aes(0.5*log2(value_1*value_2), log2(value_1/value_2), color=is_hit)) +
geom_point(alpha=0.3) +
geom_hline(yintercept=0, color="red") +
ggtitle(with(maData[1,], paste(sample_1, "vs", sample_2)))
}) %$% gg
#+ fig.width=16
multiplot(plotlist=maPlots, cols=min(2, length(maPlots)))
##Volcano plots
hitCounts <- degs %>%
dplyr::count(sample_1, sample_2, sample_1_overex) %>%
dplyr::rename(hits=n) %>%
merge(data.frame(sample_1_overex=c(T,F), x_pos=c(-3,3)))
#' For the volcano plot matrix we calculate fold-changes as x/y (aka column/row). For details see [here](http://rpackages.ianhowson.com/bioc/cummeRbund/man/csVolcano.html)
#+ fig.width=16, fig.height=14
allDiff %>% ggplot(aes(s2_over_s1_log2fc, -log10(p_value), color=is_hit)) +
# geom_jitter(alpha=0.3, position = position_jitter(height = 0.2)) +
geom_point(alpha=0.3) +
# theme_bw() +
xlim(-3.5,3.5) +
scale_color_manual(values = c("TRUE"="red", "FALSE"="black")) +
# ggtitle("Insm1/Ctrl") +
## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend
guides(colour = guide_legend(override.aes = list(alpha=1))) +
## tweak axis labels
xlab(expression(log[2]("x/y"))) +
ylab(expression(-log[10]("p value"))) +
## add hit couts
geom_text(aes(label=hits, x=x_pos), y=2, color="red", size=10, data=hitCounts) +
facet_grid(sample_1 ~ sample_2)
########################################################################################################################
#' ## Term enrichment
......@@ -346,15 +412,17 @@ if(nrow(contrasts)<3){
)
}
#+ eval=F
enrResults <- geneLists %>% group_by(list_id) %>% do(davidAnnotationChart(.$ensembl_gene_id))
#+ eval=T
enrResults <- quote(geneLists %>% group_by(list_id) %>% do(davidAnnotationChart(.$ensembl_gene_id))) %>%
cache_it(paste0("enrdata_", digest(geneLists)))
write.delim(enrResults, file="enrResults.txt")
# enrResults <- read.delim("enrResults.txt")
#' [Enrichment Results](enrResults.txt)
#' Because David is not too strigent by default we extract just those terms for which Bonferroni<0.01
sigEnrResults <- subset(enrResults, Bonferroni <0.01)
#' results='as.is'
#' results='asis'
sigEnrResults %>% kable("html", table.attr = "class='dtable'", escape=F)
write.delim(sigEnrResults, file="sigEnrResults.txt")
......@@ -372,33 +440,49 @@ write.delim(sigEnrResults, file="sigEnrResults.txt")
# print(logPlot)
#})
sigEnrResults %>% do({
#+ include=FALSE
term_category_colors <- create_palette(unique(ac(sigEnrResults$Category)))
term_barplot_files <- sigEnrResults %>% do({
enrResultsGrp <- .
## DEBUG enrResultsGrp <- sigEnrResults
label <- enrResultsGrp$list_id[1]
# dplyr::select(enrResultsGrp, matches(ifelse(split_hit_list, "sample_1|sample_2|sample_1_overex", "sample_1|sample_2"))) %>%
# head(1) %>% apply(1, paste, collapse="_vs_")
echo("processing", label)
# echo("processing", label)
logPlot <- enrResultsGrp %>%
## fix factor order
mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(Category))) %>%
mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(as.factor(Category)))) %>%
ggplot(aes(Term, PValue, fill=Category)) +
geom_bar(stat="identity")+
coord_flip() +
xlab("Enriched Terms") +
ggtitle(label) +
scale_y_log10()
ggsave(paste0(label, " enrichmed terms.pdf"))
print(logPlot)
geom_bar(stat="identity")+
scale_fill_manual(values = term_category_colors, drop=F, name="Ontology") +
coord_flip() +
xlab("Enriched Terms") +
ggtitle(label) +
scale_y_log10()
fileNameLabel <- label %>%
str_replace_all("!=", "ne") %>%
str_replace_all(">", "gt") %>%
str_replace_all("<", "lt") %>%
str_replace_all(" ", "_")
# ggsave(paste0("enrichmed_terms__", fileNameLabel, ".pdf"))
# print(logPlot)
tmpPng <- paste0("enrterms__", fileNameLabel, ".png")
ggsave(tmpPng, logPlot, width=10, height = round(2+nrow(enrResultsGrp)/4), limitsize=FALSE)
data.frame(file=tmpPng)
})
#+ results="asis"
l_ply(term_barplot_files$file, function(pngFile){ cat(paste0("<img src='", pngFile, "'><br>"))})
########################################################################################################################
#' ## Enriched KEGG Pathways
#' To understand spatio-temporal changes in gene expression better we now overlay
keggPathways <- sigEnrResults %>%
filter(Category=="KEGG_PATHWAY") %>%
separate(Term, c('kegg_pathway_id', 'pathway_description'), sep="\\:", remove=F) %>%
......@@ -420,7 +504,7 @@ sliceData <- allDiff %>%
## define slice labels (needed for casting)
comparison=paste0(sample_1, "__vs__", sample_2),
## create normalized linear score for slice plotting
sample_1_overex=log2_fold_change<0,
sample_1_overex=s2_over_s1_log2fc<0,
plot_score=-log10(q_value)*ifelse(sample_1_overex, 1, -1)
) %>%
dcast(ensembl_gene_id ~ comparison, value.var="plot_score") %>%
......@@ -434,7 +518,7 @@ data.frame(set=names(sliceData)) %>% mutate(slice_index=row_number()) %>% kable(
plot_pathway <- function(pathwayID, overlayData){
# pathwayID="mmu04015"
# browser()
echo("processing pathway", pathwayID)
# echo("processing pathway", pathwayID)
pv.out <- pathview(
gene.data = overlayData,
pathway.id = pathwayID,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment