Newer
Older
suppressMessages(require(docopt))
doc <- '
Perform an enrichment analysis for a set of genes
Usage: cp_enrichment.R [options] <gene_lists_tsv> <group_col>
Options:
--overlay_expr_data Tsv with overlay data for the kegg pathways
--list_id_col Column containing the grouping variable to speparate different lists [default: ]
--project <project_prefix> Name to prefix all generated result files [default: ]
'
opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
# opts <- docopt(doc, "--overlay_expr_data ../plot_score_matrix.txt ../degs_by_contrast.txt contrast" )
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.20/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.20/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.20/R/bio/diffex_commons.R")
## to fix child support issue with knitr, see also
## http://stackoverflow.com/questions/20030523/knitr-nested-child-documents
## https://github.com/yihui/knitr/issues/38
# todo disabled because root.dir in parent document seems the only working solution
#if(exists('project_dir')) setwd(project_dir)
#print(getwd())
## load the data
group_col=opts$group_col
geneLists %<>% group_by_(.dots=group_col)
#geneLists %<>% filter(cluster %in% c("cluster_1", "cluster_2"))
if(!is.null(opts$overlay_expr_data)){
overlayData <- read_tsv(opts$overlay_expr_data)
## TODO expose options to run gesa instead (by assuming that gene lists are sorted)
## see http://www.bioconductor.org/packages/release/bioc/vignettes/clusterProfiler/inst/doc/clusterProfiler.pdf for details
resultsBaseName <- if(str_length(opts$project)>0) paste0(opts$project, ".") else basename(opts$gene_lists_tsv) %>% trim_ext("txt") #%>% paste0(".")
#resultsBaseName=basename(opts$gene_lists_tsv) %>% trim_ext("txt") #%>% paste0(".")
########################################################################################################################
#' ## Enrichment Analysis
#' This analysis was performed using [clusterProfiler](http://bioconductor.org/packages/release/bioc/html/clusterProfiler.html). The following ontologies were tested: Kegg, Go, Reactome, Dose,
listLabels <- geneLists %>% select(-ensembl_gene_id) %>% distinct
listLabels %<>% transform(list_label=do.call(paste, c(listLabels, sep="__")))
geneLists %>% inner_join(listLabels) %>%
ggplot(aes(list_label)) +
geom_bar() +
coord_flip() +
ggtitle("gene list sizes to be tested for term enrichment") +
ylab("")
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
guess_cp_species <- function(ensIds){
an_id <-ensIds[1]
if(str_detect(an_id, "ENSG")){
return("human")
}else if(str_detect(an_id, "ENSMUSG")){
return("mouse")
}else if(str_detect(an_id, "ENSDARG")){
return("zebrafish")
}else if(str_detect(an_id, "FBgn")){
return("fly")
}else{
stop(paste("could not clusterProfiler species name from ", an_id))
}
}
guess_anno_db <- function(ensIds){
an_id <-ensIds[1]
if(str_detect(an_id, "ENSG")){
return("org.Hs.eg.db")
}else if(str_detect(an_id, "ENSMUSG")){
return("org.Mm.eg.db")
}else if(str_detect(an_id, "ENSDARG")){
return("org.Dr.eg.db")
}else if(str_detect(an_id, "FBgn")){
return("org.Dm.eg.db")
}else{
stop(paste("could not anno db mart from ", an_id))
}
}
#biocLite("org.Mm.eg.db")
#biocLite("org.Hm.eg.db")
#biocLite("org.Dr.eg.db")
#biocLite("org.Dm.eg.db")
#data(gcSample)
#yy = enrichKEGG(gcSample[[5]], pvalueCutoff=0.01)
#head(summary(yy))
#plot(yy)
## seems broken
cpSpecies <- guess_cp_species(geneLists$ensembl_gene_id)
annoDb <- guess_anno_db(geneLists$ensembl_gene_id) # e.g. "org.Hs.eg.db"
## supported ids
#idType("org.Hs.eg.db")
## convert to entrez gene ids
glMappedRaw <- clusterProfiler::bitr(geneLists$ensembl_gene_id, fromType="ENSEMBL", toType="ENTREZID", annoDb=annoDb) %>%
set_names("ensembl_gene_id", "entrez_gene_id") %>% right_join(geneLists)
#' Check how many failed to map
count(glMappedRaw, is.na(entrez_gene_id))
glMapped <- glMappedRaw %>% filter(!is.na(entrez_gene_id)) %>% select(-ensembl_gene_id) %>%
## regroup the data
# group_by_(cluster)
## retain just 1500 genes at max per group
#glMappedSub <- glMapped %>% do({shuffle(.) %>% head(1500)})
#count(glMappedSub, cluster)
## sync reactome pacakge to node
if(Sys.getenv("LSF_SERVERDIR")!="" && dir.exists("/tmp/local_r_packages")){
system("if [ ! -d '/tmp/local_r_packages/reactome.db/' ]; then scp -r falcon:/tmp/local_r_packages /tmp ; fi")
require_auto(clusterProfiler)
require_auto(ReactomePA)
# DEBUG geneIds <- glMapped %>% filter(cluster %in% c("cluster_3")) %$% entrez_gene_id %>% as.integer
# DEBUG geneIds <- head(glMapped,30)$entrez_gene_id
if(length(geneIds)>1500){
geneIds <- sample(geneIds) %>% head(1500)
}
echo("testing", length(geneIds), " genes for enrichment")
# PANTHER10_ontology <- read.delim("http://data.pantherdb.org/PANTHER10.0/ontology/Protein_Class_7.0")
# pantherResults <- enricher(gene = geneIds, organism = cpSpecies, qvalueCutoff = qCutoff, readable = TRUE, TERM2GENE = PANTHER10_ontology) %>% summary()
keggResults <- enrichKEGG(gene = geneIds, organism = cpSpecies, qvalueCutoff = qCutoff, readable = TRUE, use_internal_data=T) %>% summary()
reactomeResults <- enrichPathway(gene = geneIds, organism = cpSpecies, qvalueCutoff = qCutoff, readable = TRUE) %>% summary()
goResultsCC <- enrichGO(gene = geneIds, organism = cpSpecies, qvalueCutoff = qCutoff, readable = TRUE, ont = "CC") %>% summary()
goResultsMF <- enrichGO(gene = geneIds, organism = cpSpecies, qvalueCutoff = qCutoff, readable = TRUE, ont = "MF") %>% summary()
goResultsBP <- enrichGO(gene = geneIds, organism = cpSpecies, qvalueCutoff = qCutoff, readable = TRUE, ont = "BP") %>% summary()
enrResults <- rbind_list(
mutate(keggResults, ontology="kegg"),
mutate(reactomeResults, ontology="reactome"),
mutate(goResultsBP, ontology="go_bp"),
mutate(goResultsMF, ontology="go_mf"),
mutate(goResultsCC, ontology="go_cc")
)
}
## prety slow
enrResults <- quote(glMapped %>% do(cp_test(.$entrez_gene_id))) %>% cache_it(paste0("enrdata_", digest(glMapped)))
## run the actual enrichment test for all clusters and all ontologies
#library(foreach); library(doMC); registerDoMC(cores=20)
## https://support.bioconductor.org/p/38541/
## library(GOstats) inside your %dopar% loop. And then start with a fresh session, so GOstats is not loaded outside the loop. It worked for me.
#enrResults <- ddply(glMapped, groups(glMapped) %>% ac, function(curGroup){
# cp_test(curGroup$entrez_gene_id)
#}, .progress="text", .parallel=F) %>% cache_it(paste0("enrdata_", digest(glMapped)))
unloadNamespace('dplyr'); require(dplyr) ## tbd seriously???
#enrResults %<>% rename(Category=Description)
#enrResults %<>% rename(ontology=Category)
## remove to clumsy gene_id columns
enrResults %<>% select(-geneID)
write.delim(enrResults, file=paste0(resultsBaseName, "enrResults.txt"))
# enrResults <- read.delim(paste0(resultsBaseName, "enrResults.txt"))
#' [Enrichment Results](`r paste0(resultsBaseName, "enrResults.txt")`)
require_auto(DT)
datatable(enrResults)
#enrResults %>% ggplot(aes(pvalue)) + geom_histogram() + scale_x_log10()
#enrResults %>% ggplot(aes(ontology)) + facet_wrap(~cluster) + geom_bar() + rot_x_lab()
#facetSpecs <- paste("~", groups(geneLists) %>% ac %>% paste(collapse=" + "))
facetSpecs <- paste("~", group_col %>% ac %>% paste(collapse=" + "))
#http://stackoverflow.com/questions/11028353/passing-string-variable-facet-wrap-in-ggplot-using-r
enrResults %>% ggplot(aes(ontology)) + facet_wrap(as.formula(facetSpecs), ncol=3) + geom_bar() + rot_x_lab() + ggtitle("enriched term counts by cluster")
enrResults %<>% mutate(num_term_genes=str_split_fixed(BgRatio, fixed("/"),2)[,1] %>% as.numeric)
#enrResults %<>% group_by_(.dots=groups(geneLists) %>% ac)
#enrResults %<>% group_by_(.dots=group_col)
#enrResults %<>% sample_frac(1) %>% filter((row_number()<100)) %>% group_by_(.dots=group_col)
#tt <- enrResults %>% group_by_(.dots=c(group_col,"ontology")) %>% arrange(-qvalue) %>% dplyr::slice(1:10)
#enrResults %>% filter(cluster=="cluster_1")
#enrResults %>% group_by_(.dots=c(group_col)) %>% ggplot(aes(pvalue, qvalue, color=ontology))+ geom_point()
#enrResults %>% group_by_(.dots=c(group_col)) %>% ggplot(aes(pvalue, p.adjust, color=cluster))+ geom_point()
erPlotData <- enrResults %>% group_by_(.dots=c(group_col)) %>% arrange(qvalue) %>% slice(1:10) %>%
## regroup because otherwise dplyr complains about corrupt df (which looks like a bug)
group_by_(.dots=c(group_col))
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
#+ include=FALSE, eval=FALSE
## plot the enrichment results
#sigEnrResults %>% group_by(Category, add=T) %>% do({
# logPlot <- . %>% ggplot(aes(Term, PValue)) +
# geom_bar(stat="identity")+coord_flip() +
# xlab("Enriched Terms") +
# ggtitle(.$Category[1]) +
# scale_y_log10()
# print(logPlot)
#})
#sigEnrResults %>%
## select(-Genes) %>%
# do({
# enrResultsGrp <- .
# ## DEBUG enrResultsGrp <- sigEnrResults
## geneLists %>% first(1) %>% select(-ensembl_gene_id) %>% paste0(., collapse="_")
##browser()
# label <- geneLists %>% semi_join(enrResultsGrp) %>% first(1) %>% dplyr::select(-ensembl_gene_id) %>% paste0(., collapse="_")
#
# echo("processing", label)
#
# logPlot <- enrResultsGrp %>%
# ## fix factor order
# mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(Category))) %>%
# ggplot(aes(Term, PValue, fill=Category)) +
# geom_bar(stat="identity")+
# coord_flip() +
# xlab("Enriched Terms") +
# ggtitle(label) +
# scale_y_log10()
#
# ggsave(paste0(resultsBaseName, label, ".enrichmed_terms.pdf"))
# print(logPlot)
#})
##ggsave2()
#+ error=TRUE, echo=FALSE
warning("dropping levels")
erPlotData %<>% mutate(ontology=ac(ontology)) ## drop unsused level to get consistent color palette
erPlotData %<>% rename(Term=Description)
term_category_colors <- create_palette(unique(ac(erPlotData$ontology)))
mutate(Term=str_sub(Term, 1, 70) %>% str_pad(70)) %>%
## DEBUG enrResultsGrp <- sigEnrResults
# label <- geneLists %>% semi_join(enrResultsGrp) %>% first(1) %>% dplyr::select(-ensembl_gene_id) %>% paste0(., collapse="_")
label <- subset(enrResultsGrp, select=group_col)[1,1] %>% as.matrix %>% ac
#browser()
warning(paste0("processing terms", paste(ac(unique(enrResultsGrp$Category)), collapse=",")))
logPlot <- enrResultsGrp %>%
## fix factor order
mutate(Term=reorder(Term, -qvalue) %>% reorder(as.integer(as.factor(ontology)))) %>%
ggplot(aes(Term, num_term_genes, fill=qvalue, color=ontology)) +
scale_color_manual(values = term_category_colors, drop=F, name="Ontology") +
coord_flip() +
xlab("Enriched Terms") +
ggtitle(label) +
scale_y_log10()
# print(logPlot)
## todo use builtin method to create filesystem-compatible name
fileNameLabel <- label %>%
str_replace_all("!=", "ne") %>%
str_replace_all(">", "gt") %>%
str_replace_all("<", "lt") %>%
str_replace_all(fixed("&"), "AND") %>%
str_replace_all(fixed("|"), "OR") %>%
str_replace_all(" ", "_")
# ggsave(paste0("enrichmed_terms__", fileNameLabel, ".pdf"))
# print(logPlot)
tmpPng <- paste0(figDir, "/enrterms__", fileNameLabel, ".png")
ggsave(tmpPng, logPlot, width=10, height = 2+round(nrow(enrResultsGrp)/5), limitsize=FALSE)
data.frame(file=tmpPng)
})
#+ results="asis"
l_ply(term_barplot_files$file, function(pngFile){ cat(paste0("<img src='", pngFile, "'><br>"))})
########################################################################################################################
#+ eval=(nrow(enrResults %>% filter(ontology=="kegg")) >0)
#' To understand spatio-temporal changes in gene expression better we now overlay enriched kegg pathways with the -log10(q_value) of each contrast. The direction of the expression changes is encoded as color, whereby red indicates that sample_1 is overexpressed. Because we have multiple contrasts of interest, this defines a slice-color barcode for each gene. To relate the barcode to contrasts we define the following slice order:
## todo why is tidyr not processing an empty dataframe
keggPathways <- enrResults %>% filter(ontology=="kegg") %$% ac(ID) %>% unique() # %>% head(3) ## todo remove debug head
##+ results='asis'
#if(!exists("keggPathways") | nrow(keggPathways)==0){
# cat("No enriched pathways found")
#}
#+ echo=FALSE
# keggPathways <- keggPathways[1]
#if(nrow(keggPathways)==0){
# echo("no enriched kegg pathways were found in the dataset")
#}else{
#keggOrCode <- "mmu"
keggOrCode <- guess_pathview_species(geneLists$ensembl_gene_id)
## prepare p-value data
sliceData <- overlayData %>%
# dcast(ensembl_gene_id ~ comparison, value.var="plot_score") %>%
column2rownames("ensembl_gene_id")
#sliceData %>% head %>% kable()
data.frame(set=names(sliceData)) %>% mutate(slice_index=row_number()) %>% kable()
plot_pathway <- function(pathwayID, sliceData){
# pathwayID="mmu04015"; sliceData <- sliceData
# browser()
# echo("processing pathway", pathwayID)
pv.out <- pathview(
pathway.id = pathwayID,
species = keggOrCode,
# out.suffix = pathwayID$kegg.description,
# out.suffix = pathwayID,
# kegg.native=F,
# node.sum = "mean", # the method name to calculate node summary given that multiple genes or compounds are mapped to it
limit = list(gene = c(-4,4)),
gene.idtype="ensembl"
)
if(is.numeric(pv.out) && pv.out==0) return(pathwayID) ## happens if pathview fails to parse xml e.g. for mmu01100
outfile <- paste0(pathwayID, ".pathview", ifelse(ncol(sliceData)>1, ".multi", ""), ".png")
## move pathway plots into figures sub-directory
system(paste("mv", outfile, figuresPlotFile))
system(paste("rm", paste0(pathwayID, ".xml"), paste0(pathwayID, ".png")))
## interactive plotting
# ima <- readPNG(outfile)
# plot.new()
# lim <- par()
# rasterImage(ima, lim$usr[1], lim$usr[3], lim$usr[2], lim$usr[4])
pv.out$plotfile=figuresPlotFile
pv.out$pathway_id=pathwayID
return(pv.out)
}
#keggPathways <- c("mmu04976", "mmu04972", "mmu04810", "mmu04520", "mmu04530", "mmu04270", "mmu04015")
pathwayPlots <- keggPathways %>% llply(function(pathwayID){
plot_pathway(pathwayID, sliceData)
})
save(pathwayPlots, file=".pathwayPlots.RData")
# pathwayPlots <- local(get(load(".pathwayPlots.RData")))
## remove and report failed ones
failedPPs <- pathwayPlots[lapply(pathwayPlots, is.character) > 0]
echo("the following pathways failed to render:", paste(failedPPs, collapse=", "))
pathwayPlots <- pathwayPlots[lapply(pathwayPlots, is.character) == 0]
## make sure that all files exist
stopifnot(llply(pathwayPlots, function(x) file.exists(x$plotfile)) %>% unlist %>% all)
## prepare tooltips with expression scores
ens2entrez <- quote({
# mart <- biomaRt::useDataset(guess_mart(geneLists$ensembl_gene_id), mart = biomaRt::useMart("ensembl"))
## todo fix this https://support.bioconductor.org/p/74322/
mart <- biomaRt::useDataset(guess_mart(geneLists$ensembl_gene_id), mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org"))
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
biomaRt::getBM(attributes=c('ensembl_gene_id', 'entrezgene', 'external_gene_name'), mart=mart) %>% filter(!is.na(entrezgene))
}) %>% cache_it("ens2entrez") %>% distinct(ensembl_gene_id)
#unlen(ens2entrez$ensembl_gene_id)
toolTipData <- overlayData %>% left_join(ens2entrez)
makeTooltip <- function(entrez_id){
toolTipData %>% filter(entrezgene==entrez_id) %>% dplyr::select(-entrezgene) %>% gather() %$% paste(key, value, sep=": ") %>% paste(collapse="\n") #%>% cat
}
#entrez_id=14679
#entrez_id=34234234234234
#makeTooltip("14679")
#+ results="asis", echo=FALSE
## simple non-clickable plots
## http://stackoverflow.com/questions/12588323/r-how-to-extract-values-for-the-same-named-elements-across-multiple-objects-of
#unlist(lapply(pathwayPlots, "[[", "plotfile"))
# unlist(lapply(pathwayPlots, "[[", "plotfile")) %>% l_ply(function(pngFile){ cat(paste0("<img src='", pngFile, "'><br>"))})
#cat("
#<style type='text/css'>
# img {
# max-width: 100%;
# }
#</style>
#")
## todo add tooltips with additinal info
## http://stackoverflow.com/questions/5478940/how-to-create-a-html-tooltip-on-top-of-image-map
## extended version with clickable links
createImgMap <- function(plotData){
#browser()
warning("compiling overlay toolips...")
#pngFile="mmu04015.pathview.png"
#plotData <- pathwayPlots[[1]]
#plotData <- unlist(pathwayPlots)
# pathway_id=(basename(pngFile) %>% str_split_fixed("[.]", 2))[,1]
pngFile <- plotData$plotfile
pathway_id=plotData$pathway_id
## create link for image map
keggNodes <- plotData$plot.data.gene %>%
## remove box offset
mutate(x=x-22, y=y-8) %>% ## todo does not work for non-gene elements
mutate(link=paste0("http://www.kegg.jp/dbget-bin/www_bget?", keggOrCode, ":", kegg.names )) %>%
rowwise() %>% do({ curNode=.; mutate(as.df(curNode), tooltip=makeTooltip(as.integer(curNode$kegg.names)))})
## create tooltip by using mapping to
# "http://www.kegg.jp/dbget-bin/www_bget?mmu:16412
## first the image itself
# cat(paste0("<img usemap='#", pathway_id,"' src='", pngFile, "'><br>"))
# cat(paste0("<p><div style='width: 2000px'><img style='float: left' usemap='#", pathway_id,"' src='", pngFile, "'></div><br>"))
## it seems that formatting works better without the left alignment
cat(paste0("<p><div style='width: 2000px'><img usemap='#", pathway_id,"' src='", pngFile, "'></div><br>"))
cat(paste0("<map name='", pathway_id,"'>"))
#keggNodes %>% rowwise() %>% {curNode=.; cat(curNode$name)}
#see http://www.html-world.de/180/image-map/
keggNodes %>% a_ply(1, function(curNode){
rectDef=with(curNode, paste(x, y, x+width, y+height, sep=","))
# paste0("<area href='", curNode$link, "' alt='Text' coords='", rectDef , "' shape='rect'>") %>% cat()
paste0("<area href='", curNode$link, "' title='", curNode$tooltip, "' alt='Text' coords='",rectDef , "' shape='rect'>") %>% cat()
})
cat("</map></p><br>")
## respin it for cild inclusion
# require(knitr); setwd("/Volumes/projects/bioinfo/scripts/ngs_tools/dev/common"); spin("cp_enrichment.R", knit=F)
#' ## Notes
#' Other interesting tools to perform enrichment testing
#' * [piano](http://www.bioconductor.org/packages/release/bioc/html/piano.html)