Something went wrong on our end
-
Holger Brandl authoredHolger Brandl authored
cp_enrichment.R 13.78 KiB
#!/usr/bin/env Rscript
#+ echo=FALSE
suppressMessages(require(docopt))
doc <- '
Perform an enrichment analysis for a set of genes
Usage: cp_enrichment.R [options] <grouped_gene_lists_rdata>
Options:
--overlay_expr_data Tsv with overlay data for the kegg pathways
'
opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
# opts <- docopt(doc, "--overlay_expr_data ctrl_fc_expr_filtered.txt geneClusters.RData" )
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/bio/diffex_commons.R")
require.auto(knitr)
require.auto(clusterProfiler)
require.auto(ReactomePA)
## to fix child support issue with knitr, see also
## http://stackoverflow.com/questions/20030523/knitr-nested-child-documents
## https://github.com/yihui/knitr/issues/38
# todo disabled because root.dir in parent document seems the only working solution
#if(exists('project_dir')) setwd(project_dir)
#print(getwd())
## load the data
geneLists <- local(get(load(opts$grouped_gene_lists_rdata)))
#geneLists %<>% filter(cluster %in% c("cluster_1", "cluster_2"))
if(!is.null(opts$overlay_expr_data)){
overlayData <- read.delim(opts$overlay_expr_data)
}
## TODO expose options to run gesa instead (by assuming that gene lists are sorted)
## see http://www.bioconductor.org/packages/release/bioc/vignettes/clusterProfiler/inst/doc/clusterProfiler.pdf for details
resultsBaseName=basename(opts$grouped_gene_lists_rdata) %>% trim_ext(".RData") %>% paste0(".")
########################################################################################################################
#' ## Enrichment Analysis
#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(DEF_DAVID_ONTOLOGIES, collapse=', ')`
listLabels <- geneLists %>% select(-ensembl_gene_id) %>% distinct
listLabels %<>% transform(list_label=do.call(paste, c(listLabels, sep="__")))
geneLists %>% inner_join(listLabels) %>%
ggplot(aes(list_label)) +
geom_bar() +
coord_flip() +
ggtitle("gene list sizes to be tested for term enrichment") +
ylab("")
guess_cp_species <- function(ensIds){
an_id <-ensIds[1]
if(str_detect(an_id, "ENSG")){
return("human")
}else if(str_detect(an_id, "ENSMUSG")){
return("mouse")
}else if(str_detect(an_id, "ENSDARG")){
return("zebrafish")
}else if(str_detect(an_id, "FBgn")){
return("fly")
}else{
stop(paste("could not clusterProfiler species name from ", an_id))
}
}
guess_anno_db <- function(ensIds){
an_id <-ensIds[1]
if(str_detect(an_id, "ENSG")){
return("org.Hs.eg.db")
}else if(str_detect(an_id, "ENSMUSG")){
return("org.Mm.eg.db")
}else if(str_detect(an_id, "ENSDARG")){
return("org.Dr.eg.db")
}else if(str_detect(an_id, "FBgn")){
return("org.Dm.eg.db")
}else{
stop(paste("could not anno db mart from ", an_id))
}
}
#source("https://bioconductor.org/biocLite.R")
#biocLite("org.Mm.eg.db")
#biocLite("org.Hm.eg.db")
#biocLite("org.Dr.eg.db")
#biocLite("org.Dm.eg.db")
cpSpecies <- guess_cp_species(geneLists$ensembl_gene_id)
annoDb <- guess_anno_db(geneLists$ensembl_gene_id) # e.g. "org.Hs.eg.db"
## supported ids
#idType("org.Hs.eg.db")
## convert to entrez gene ids
geneLists %<>% mutate(entrez_gene_id=bitr(x, fromType="ENSEMBL", toType="ENTREZID", annoDb=annoDb))
## TODO expose as argument
pCutoff <- 0.05
geneIds <- geneLists %>% filter(cluster %in% c("cluster_2")) %$% geneLists
enrichKEGG(gene = geneIds, organism = cpSpecies, pvalueCutoff = pCutoff, readable = TRUE)
enrichPathway(gene = geneIds, organism = cpSpecies, pvalueCutoff = pCutoff, readable = TRUE)
enrichGO(gene = gene, universe = names(geneList), organism = "human", ont = "CC", pAdjustMethod = "BH", pvalueCutoff = 0.01, qvalueCutoff = 0.05, readable = TRUE)
enrResults <- quote(geneLists %>% do(davidAnnotationChart(.$ensembl_gene_id))) %>%
cache_it(paste0("enrdata_", digest(geneLists)))
write.delim(enrResults, file=paste0(resultsBaseName, "enrResults.txt"))
# enrResults <- read.delim(paste0(resultsBaseName, "enrResults.txt"))
#' [Enrichment Results](`r paste0(resultsBaseName, "enrResults.txt")`)
sigEnrResults <- subset(enrResults, Bonferroni <0.01)
nrow(enrResults)
nrow(sigEnrResults)
write.delim(sigEnrResults, file=paste0(resultsBaseName, "sigEnrResults.txt"))
# sigEnrResults <- read.delim(paste0(resultsBaseName, "sigEnrResults.txt"))
#' [Very Significant Terms](`r paste0(resultsBaseName, "sigEnrResults.txt")`)
#+ include=FALSE, eval=FALSE
## plot the enrichment results
#sigEnrResults %>% group_by(Category, add=T) %>% do({
# logPlot <- . %>% ggplot(aes(Term, PValue)) +
# geom_bar(stat="identity")+coord_flip() +
# xlab("Enriched Terms") +
# ggtitle(.$Category[1]) +
# scale_y_log10()
# print(logPlot)
#})
#sigEnrResults %>%
## select(-Genes) %>%
# do({
# enrResultsGrp <- .
# ## DEBUG enrResultsGrp <- sigEnrResults
## geneLists %>% first(1) %>% select(-ensembl_gene_id) %>% paste0(., collapse="_")
##browser()
# label <- geneLists %>% semi_join(enrResultsGrp) %>% first(1) %>% dplyr::select(-ensembl_gene_id) %>% paste0(., collapse="_")
#
# echo("processing", label)
#
# logPlot <- enrResultsGrp %>%
# ## fix factor order
# mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(Category))) %>%
# ggplot(aes(Term, PValue, fill=Category)) +
# geom_bar(stat="identity")+
# coord_flip() +
# xlab("Enriched Terms") +
# ggtitle(label) +
# scale_y_log10()
#
# ggsave(paste0(resultsBaseName, label, ".enrichmed_terms.pdf"))
# print(logPlot)
#})
##ggsave2()
## include=FALSE, error=TRUE
#+ error=TRUE, echo=FALSE
warning("dropping levels")
sigEnrResults %<>% mutate(Category=ac(Category)) ## drop unsused level to get consistent color palette
term_category_colors <- create_palette(unique(ac(sigEnrResults$Category)))
dir.create("figures")
term_barplot_files <- sigEnrResults %>%
## chop and pad category names
mutate(Term=str_sub(Term, 1, 70) %>% str_pad(70)) %>%
do({
enrResultsGrp <- .
## DEBUG enrResultsGrp <- sigEnrResults
label <- geneLists %>% semi_join(enrResultsGrp) %>% first(1) %>% dplyr::select(-ensembl_gene_id) %>% paste0(., collapse="_")
# warning(paste0("processing terms", paste(ac(unique(enrResultsGrp$Category)), collapse=",")))
logPlot <- enrResultsGrp %>%
## fix factor order
mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(as.factor(Category)))) %>%
ggplot(aes(Term, PValue, fill=Category)) +
geom_bar(stat="identity")+
scale_fill_manual(values = term_category_colors, drop=F, name="Ontology") +
coord_flip() +
xlab("Enriched Terms") +
ggtitle(label) +
scale_y_log10()
# print(logPlot)
## todo use builtin method to create filesystem-compatible name
fileNameLabel <- label %>%
str_replace_all("!=", "ne") %>%
str_replace_all(">", "gt") %>%
str_replace_all("<", "lt") %>%
str_replace_all(fixed("&"), "AND") %>%
str_replace_all(fixed("|"), "OR") %>%
str_replace_all(" ", "_")
# ggsave(paste0("enrichmed_terms__", fileNameLabel, ".pdf"))
# print(logPlot)
tmpPng <- paste0("figures/enrterms__", fileNameLabel, ".png")
ggsave(tmpPng, logPlot, width=10, height = 2+round(nrow(enrResultsGrp)/5), limitsize=FALSE)
data.frame(file=tmpPng)
})
#+ results="asis"
l_ply(term_barplot_files$file, function(pngFile){ cat(paste0("<img src='", pngFile, "'><br>"))})
########################################################################################################################
# ' ## Enriched KEGG Pathways
#+ eval=nrow(sigEnrResults %>% filter(Category=="KEGG_PATHWAY")) >0
#' To understand spatio-temporal changes in gene expression better we now overlay enriched kegg pathways with the -log10(q_value) of each contrast. The direction of the expression changes is encoded as color, whereby red indicates that sample_1 is overexpressed. Because we have multiple contrasts of interest, this defines a slice-color barcode for each gene. To relate the barcode to contrasts we define the following slice order:
## todo why is tidyr not processing an empty dataframe
keggPathways <- sigEnrResults %>%
filter(Category=="KEGG_PATHWAY") %>%
separate(Term, c('kegg_pathway_id', 'pathway_description'), sep="\\:", remove=F) %>%
with(kegg_pathway_id) %>% ac() %>% unique()
##+ results='asis'
#if(!exists("keggPathways") | nrow(keggPathways)==0){
# cat("No enriched pathways found")
#}
#+ echo=FALSE
require(pathview)
require(png)
# keggPathways <- keggPathways[1]
#if(nrow(keggPathways)==0){
# echo("no enriched kegg pathways were found in the dataset")
#}else{
#keggOrCode <- "mmu"
keggOrCode <- guess_pathview_species(geneLists$ensembl_gene_id)
## prepare p-value data
sliceData <- overlayData %>%
# dcast(ensembl_gene_id ~ comparison, value.var="plot_score") %>%
column2rownames("ensembl_gene_id")
#sliceData %>% head %>% kable()
data.frame(set=names(sliceData)) %>% mutate(slice_index=row_number()) %>% kable()
plot_pathway <- function(pathwayID, overlayData){
# pathwayID="mmu04015"
# browser()
# echo("processing pathway", pathwayID)
pv.out <- pathview(
gene.data = overlayData,
pathway.id = pathwayID,
species = keggOrCode,
# out.suffix = pathwayID$kegg.description,
# out.suffix = pathwayID,
multi.state = ncol(overlayData)>1,
# kegg.native=F,
# node.sum = "mean", # the method name to calculate node summary given that multiple genes or compounds are mapped to it
limit = list(gene = c(-4,4)),
gene.idtype="ensembl"
)
outfile <- paste0(pathwayID, ".pathview", ifelse(ncol(overlayData)>1, ".multi", ""), ".png")
## move pathway plots into figures sub-directory
figuresPlotFile <- file.path("figures", outfile)
system(paste("mv", outfile, figuresPlotFile))
system(paste("rm", paste0(pathwayID, ".xml"), paste0(pathwayID, ".png")))
## interactive plotting
# ima <- readPNG(outfile)
# plot.new()
# lim <- par()
# rasterImage(ima, lim$usr[1], lim$usr[3], lim$usr[2], lim$usr[4])
pv.out$plotfile=figuresPlotFile
pv.out$pathway_id=pathwayID
return(pv.out)
}
#keggPathways <- c("mmu04976", "mmu04972", "mmu04810", "mmu04520", "mmu04530", "mmu04270", "mmu04015")
#keggPathways <- c("mmu04015")
pathwayPlots <- keggPathways %>% llply(function(pathwayID){
plot_pathway(pathwayID, sliceData)
})
save(pathwayPlots, file=".pathwayPlots.RData")
# pathwayPlots <- local(get(load("pathwayPlots.RData")))
## prepare tooltips with expression scores
ens2entrez <- quote({
mart <- biomaRt::useDataset(guess_mart(geneLists$ensembl_gene_id), mart = biomaRt::useMart("ensembl"))
biomaRt::getBM(attributes=c('ensembl_gene_id', 'entrezgene', 'external_gene_name'), mart=mart) %>% filter(!is.na(entrezgene))
}) %>% cache_it("ens2entrez") %>% distinct(ensembl_gene_id)
#unlen(ens2entrez$ensembl_gene_id)
toolTipData <- overlayData %>% left_join(ens2entrez)
makeTooltip <- function(entrez_id){
toolTipData %>% filter(entrezgene==entrez_id) %>% dplyr::select(-entrezgene) %>% gather() %$% paste(key, value, sep=": ") %>% paste(collapse="\n") #%>% cat
}
#entrez_id=14679
#entrez_id=34234234234234
#makeTooltip("14679")
#+ results="asis", echo=FALSE
## simple non-clickable plots
## http://stackoverflow.com/questions/12588323/r-how-to-extract-values-for-the-same-named-elements-across-multiple-objects-of
#unlist(lapply(pathwayPlots, "[[", "plotfile"))
# unlist(lapply(pathwayPlots, "[[", "plotfile")) %>% l_ply(function(pngFile){ cat(paste0("<img src='", pngFile, "'><br>"))})
#cat("
#<style type='text/css'>
# img {
# max-width: 100%;
# }
#</style>
#")
## todo add tooltips with additinal info
## http://stackoverflow.com/questions/5478940/how-to-create-a-html-tooltip-on-top-of-image-map
## extended version with clickable links
pathwayPlots %>% l_ply(function(plotData){
#pngFile="mmu04015.pathview.png"
#plotData <- pathwayPlots[[1]]
#plotData <- unlist(pathwayPlots)
# pathway_id=(basename(pngFile) %>% str_split_fixed("[.]", 2))[,1]
pngFile <- plotData$plotfile
pathway_id=plotData$pathway_id
## create link for image map
keggNodes <- plotData$plot.data.gene %>%
## remove box offset
mutate(x=x-22, y=y-8) %>% ## todo does not work for non-gene elements
mutate(link=paste0("http://www.kegg.jp/dbget-bin/www_bget?", keggOrCode, ":", kegg.names )) %>%
rowwise() %>% do({ curNode=.; mutate(as.df(curNode), tooltip=makeTooltip(as.integer(curNode$kegg.names)))})
## create tooltip by using mapping to
# "http://www.kegg.jp/dbget-bin/www_bget?mmu:16412
## first the image itself
# cat(paste0("<img usemap='#", pathway_id,"' src='", pngFile, "'><br>"))
cat(paste0("<p><div style='width: 2000px'><img style='float: left' usemap='#", pathway_id,"' src='", pngFile, "'></div><br>"))
cat(paste0("<map name='", pathway_id,"'>"))
#keggNodes %>% rowwise() %>% {curNode=.; cat(curNode$name)}
#see http://www.html-world.de/180/image-map/
keggNodes %>% a_ply(1, function(curNode){
rectDef=with(curNode, paste(x, y, x+width, y+height, sep=","))
# paste0("<area href='", curNode$link, "' alt='Text' coords='", rectDef , "' shape='rect'>") %>% cat()
paste0("<area href='", curNode$link, "' title='", curNode$tooltip, "' alt='Text' coords='",rectDef , "' shape='rect'>") %>% cat()
})
cat("</map></p><br>")
})
## respin it for cild inclusion
# require(knitr); setwd("/Volumes/projects/bioinfo/scripts/ngs_tools/dev/common"); spin("david_enrichment.R", knit=F)