#!/usr/bin/env Rscript #+ echo=FALSE, error=F suppressMessages(require(docopt)) ## todo use textual input here for ease of use doc <- ' Perform an enrichment analysis for a set of genes Usage: cp_enrichment.R [options] <gene_lists_tsv> <group_col> Options: --overlay_expr_data Tsv with overlay data for the kegg pathways --list_id_col Column containing the grouping variable to speparate different lists [default: ] --project <project_prefix> Name to prefix all generated result files [default: ] ' opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining # opts <- docopt(doc, "--overlay_expr_data ../plot_score_matrix.txt ../degs_by_contrast.txt contrast" ) devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.20/R/core_commons.R") devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.20/R/ggplot_commons.R") devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.20/R/bio/diffex_commons.R") require_auto(knitr) require_auto(DT) require_auto(readr) ## to fix child support issue with knitr, see also ## http://stackoverflow.com/questions/20030523/knitr-nested-child-documents ## https://github.com/yihui/knitr/issues/38 # todo disabled because root.dir in parent document seems the only working solution #if(exists('project_dir')) setwd(project_dir) #print(getwd()) ## load the data geneLists <- read_tsv(opts$gene_lists_tsv) group_col=opts$group_col geneLists %<>% group_by_(.dots=group_col) #geneLists %<>% filter(cluster %in% c("cluster_1", "cluster_2")) if(!is.null(opts$overlay_expr_data)){ overlayData <- read_tsv(opts$overlay_expr_data) } ## TODO expose options to run gesa instead (by assuming that gene lists are sorted) ## see http://www.bioconductor.org/packages/release/bioc/vignettes/clusterProfiler/inst/doc/clusterProfiler.pdf for details resultsBaseName <- if(str_length(opts$project)>0) paste0(opts$project, ".") else basename(opts$gene_lists_tsv) %>% trim_ext("txt") #%>% paste0(".") #resultsBaseName=basename(opts$gene_lists_tsv) %>% trim_ext("txt") #%>% paste0(".") unloadNamespace('dplyr'); require(dplyr) ######################################################################################################################## #' ## Enrichment Analysis #' This analysis was performed using [clusterProfiler](http://bioconductor.org/packages/release/bioc/html/clusterProfiler.html). The following ontologies were tested: Kegg, Go, Reactome, Dose, listLabels <- geneLists %>% select(-ensembl_gene_id) %>% distinct listLabels %<>% transform(list_label=do.call(paste, c(listLabels, sep="__"))) geneLists %>% inner_join(listLabels) %>% ggplot(aes(list_label)) + geom_bar() + coord_flip() + ggtitle("gene list sizes to be tested for term enrichment") + ylab("") ## todo move to diffex commons guess_cp_species <- function(ensIds){ an_id <-ensIds[1] if(str_detect(an_id, "ENSG")){ return("human") }else if(str_detect(an_id, "ENSMUSG")){ return("mouse") }else if(str_detect(an_id, "ENSDARG")){ return("zebrafish") }else if(str_detect(an_id, "FBgn")){ return("fly") }else{ stop(paste("could not clusterProfiler species name from ", an_id)) } } guess_anno_db <- function(ensIds){ an_id <-ensIds[1] if(str_detect(an_id, "ENSG")){ return("org.Hs.eg.db") }else if(str_detect(an_id, "ENSMUSG")){ return("org.Mm.eg.db") }else if(str_detect(an_id, "ENSDARG")){ return("org.Dr.eg.db") }else if(str_detect(an_id, "FBgn")){ return("org.Dm.eg.db") }else{ stop(paste("could not anno db mart from ", an_id)) } } #source("http://bioconductor.org/biocLite.R") #biocLite("org.Mm.eg.db") #biocLite("org.Hm.eg.db") #biocLite("org.Dr.eg.db") #biocLite("org.Dm.eg.db") #biocLite("KEGG.db") #data(gcSample) #yy = enrichKEGG(gcSample[[5]], pvalueCutoff=0.01) #head(summary(yy)) #plot(yy) ## seems broken cpSpecies <- guess_cp_species(geneLists$ensembl_gene_id) annoDb <- guess_anno_db(geneLists$ensembl_gene_id) # e.g. "org.Hs.eg.db" ## supported ids #idType("org.Hs.eg.db") ## convert to entrez gene ids glMappedRaw <- clusterProfiler::bitr(geneLists$ensembl_gene_id, fromType="ENSEMBL", toType="ENTREZID", annoDb=annoDb) %>% set_names("ensembl_gene_id", "entrez_gene_id") %>% right_join(geneLists) #' Check how many failed to map count(glMappedRaw, is.na(entrez_gene_id)) glMapped <- glMappedRaw %>% filter(!is.na(entrez_gene_id)) %>% select(-ensembl_gene_id) %>% ## regroup the data # group_by_(cluster) group_by_(.dots=group_col %>% ac) ## TODO expose as argument #pCutoff <- 0.05 qCutoff <- 0.01 ## retain just 1500 genes at max per group #glMappedSub <- glMapped %>% do({shuffle(.) %>% head(1500)}) #count(glMappedSub, cluster) ## sync reactome pacakge to node if(Sys.getenv("LSF_SERVERDIR")!="" && dir.exists("/tmp/local_r_packages")){ system("if [ ! -d '/tmp/local_r_packages/reactome.db/' ]; then scp -r falcon:/tmp/local_r_packages /tmp ; fi") } require_auto(clusterProfiler) require_auto(ReactomePA) cp_test <- function(geneIds){ # DEBUG geneIds <- glMapped %>% filter(cluster %in% c("cluster_3")) %$% entrez_gene_id %>% as.integer # DEBUG geneIds <- head(glMapped,30)$entrez_gene_id # geneIds=.$entrez_gene_id if(length(geneIds)>1500){ geneIds <- sample(geneIds) %>% head(1500) } echo("testing", length(geneIds), " genes for enrichment") # PANTHER10_ontology <- read.delim("http://data.pantherdb.org/PANTHER10.0/ontology/Protein_Class_7.0") # pantherResults <- enricher(gene = geneIds, organism = cpSpecies, qvalueCutoff = qCutoff, readable = TRUE, TERM2GENE = PANTHER10_ontology) %>% summary() keggResults <- enrichKEGG(gene = geneIds, organism = cpSpecies, qvalueCutoff = qCutoff, readable = TRUE, use_internal_data=T) %>% summary() reactomeResults <- enrichPathway(gene = geneIds, organism = cpSpecies, qvalueCutoff = qCutoff, readable = TRUE) %>% summary() goResultsCC <- enrichGO(gene = geneIds, organism = cpSpecies, qvalueCutoff = qCutoff, readable = TRUE, ont = "CC") %>% summary() goResultsMF <- enrichGO(gene = geneIds, organism = cpSpecies, qvalueCutoff = qCutoff, readable = TRUE, ont = "MF") %>% summary() goResultsBP <- enrichGO(gene = geneIds, organism = cpSpecies, qvalueCutoff = qCutoff, readable = TRUE, ont = "BP") %>% summary() enrResults <- rbind_list( mutate(keggResults, ontology="kegg"), mutate(reactomeResults, ontology="reactome"), mutate(goResultsBP, ontology="go_bp"), mutate(goResultsMF, ontology="go_mf"), mutate(goResultsCC, ontology="go_cc") ) } ## prety slow enrResults <- quote(glMapped %>% do(cp_test(.$entrez_gene_id))) %>% cache_it(paste0("enrdata_", digest(glMapped))) ## run the actual enrichment test for all clusters and all ontologies #library(foreach); library(doMC); registerDoMC(cores=20) ## https://support.bioconductor.org/p/38541/ ## library(GOstats) inside your %dopar% loop. And then start with a fresh session, so GOstats is not loaded outside the loop. It worked for me. #enrResults <- ddply(glMapped, groups(glMapped) %>% ac, function(curGroup){ # cp_test(curGroup$entrez_gene_id) #}, .progress="text", .parallel=F) %>% cache_it(paste0("enrdata_", digest(glMapped))) unloadNamespace('dplyr'); require(dplyr) ## tbd seriously??? #enrResults %<>% rename(Category=Description) #enrResults %<>% rename(ontology=Category) ## remove to clumsy gene_id columns enrResults %<>% select(-geneID) write.delim(enrResults, file=paste0(resultsBaseName, "enrResults.txt")) # enrResults <- read.delim(paste0(resultsBaseName, "enrResults.txt")) #' [Enrichment Results](`r paste0(resultsBaseName, "enrResults.txt")`) require_auto(DT) datatable(enrResults) #enrResults %>% ggplot(aes(pvalue)) + geom_histogram() + scale_x_log10() #enrResults %>% ggplot(aes(ontology)) + facet_wrap(~cluster) + geom_bar() + rot_x_lab() #facetSpecs <- paste("~", groups(geneLists) %>% ac %>% paste(collapse=" + ")) facetSpecs <- paste("~", group_col %>% ac %>% paste(collapse=" + ")) #' Visualize term-pvalues per list #http://stackoverflow.com/questions/11028353/passing-string-variable-facet-wrap-in-ggplot-using-r enrResults %>% ggplot(aes(ontology)) + facet_wrap(as.formula(facetSpecs), ncol=3) + geom_bar() + rot_x_lab() + ggtitle("enriched term counts by cluster") enrResults %<>% mutate(num_term_genes=str_split_fixed(BgRatio, fixed("/"),2)[,1] %>% as.numeric) #' Keep at max 100 terms for visualzation per group #if(table(enrResults$cluster)) ## restablish the grouping and limit results per group to 100 #enrResults %<>% group_by_(.dots=groups(geneLists) %>% ac) #enrResults %<>% group_by_(.dots=group_col) #enrResults %<>% sample_frac(1) %>% filter((row_number()<100)) %>% group_by_(.dots=group_col) #tt <- enrResults %>% group_by_(.dots=c(group_col,"ontology")) %>% arrange(-qvalue) %>% dplyr::slice(1:10) #enrResults %>% filter(cluster=="cluster_1") #enrResults %>% group_by_(.dots=c(group_col)) %>% ggplot(aes(pvalue, qvalue, color=ontology))+ geom_point() #enrResults %>% group_by_(.dots=c(group_col)) %>% ggplot(aes(pvalue, p.adjust, color=cluster))+ geom_point() erPlotData <- enrResults %>% group_by_(.dots=c(group_col)) %>% arrange(qvalue) %>% slice(1:10) %>% ## regroup because otherwise dplyr complains about corrupt df (which looks like a bug) group_by_(.dots=c(group_col)) #count(erPlotData, cluster) #+ include=FALSE, eval=FALSE ## plot the enrichment results #sigEnrResults %>% group_by(Category, add=T) %>% do({ # logPlot <- . %>% ggplot(aes(Term, PValue)) + # geom_bar(stat="identity")+coord_flip() + # xlab("Enriched Terms") + # ggtitle(.$Category[1]) + # scale_y_log10() # print(logPlot) #}) #sigEnrResults %>% ## select(-Genes) %>% # do({ # enrResultsGrp <- . # ## DEBUG enrResultsGrp <- sigEnrResults ## geneLists %>% first(1) %>% select(-ensembl_gene_id) %>% paste0(., collapse="_") ##browser() # label <- geneLists %>% semi_join(enrResultsGrp) %>% first(1) %>% dplyr::select(-ensembl_gene_id) %>% paste0(., collapse="_") # # echo("processing", label) # # logPlot <- enrResultsGrp %>% # ## fix factor order # mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(Category))) %>% # ggplot(aes(Term, PValue, fill=Category)) + # geom_bar(stat="identity")+ # coord_flip() + # xlab("Enriched Terms") + # ggtitle(label) + # scale_y_log10() # # ggsave(paste0(resultsBaseName, label, ".enrichmed_terms.pdf")) # print(logPlot) #}) ##ggsave2() #+ error=TRUE, echo=FALSE warning("dropping levels") erPlotData %<>% mutate(ontology=ac(ontology)) ## drop unsused level to get consistent color palette erPlotData %<>% rename(Term=Description) term_category_colors <- create_palette(unique(ac(erPlotData$ontology))) figDir <- "enr_charts" dir.create(figDir) term_barplot_files <- erPlotData %>% ## chop and pad category names mutate(Term=str_sub(Term, 1, 70) %>% str_pad(70)) %>% do({ enrResultsGrp <- . # browser() ## DEBUG enrResultsGrp <- sigEnrResults # label <- geneLists %>% semi_join(enrResultsGrp) %>% first(1) %>% dplyr::select(-ensembl_gene_id) %>% paste0(., collapse="_") label <- subset(enrResultsGrp, select=group_col)[1,1] %>% as.matrix %>% ac #browser() warning(paste0("processing terms", paste(ac(unique(enrResultsGrp$Category)), collapse=","))) logPlot <- enrResultsGrp %>% ## fix factor order mutate(Term=reorder(Term, -qvalue) %>% reorder(as.integer(as.factor(ontology)))) %>% ggplot(aes(Term, num_term_genes, fill=qvalue, color=ontology)) + geom_bar(stat="identity")+ scale_color_manual(values = term_category_colors, drop=F, name="Ontology") + coord_flip() + xlab("Enriched Terms") + ggtitle(label) + scale_y_log10() # print(logPlot) ## todo use builtin method to create filesystem-compatible name fileNameLabel <- label %>% str_replace_all("!=", "ne") %>% str_replace_all(">", "gt") %>% str_replace_all("<", "lt") %>% str_replace_all(fixed("&"), "AND") %>% str_replace_all(fixed("|"), "OR") %>% str_replace_all(" ", "_") # ggsave(paste0("enrichmed_terms__", fileNameLabel, ".pdf")) # print(logPlot) stopifnot(str_length(fileNameLabel)>0) tmpPng <- paste0(figDir, "/enrterms__", fileNameLabel, ".png") ggsave(tmpPng, logPlot, width=10, height = 2+round(nrow(enrResultsGrp)/5), limitsize=FALSE) data.frame(file=tmpPng) }) #+ results="asis" l_ply(term_barplot_files$file, function(pngFile){ cat(paste0("<img src='", pngFile, "'><br>"))}) ######################################################################################################################## #' ## Enriched KEGG Pathways #+ eval=(nrow(enrResults %>% filter(ontology=="kegg")) >0) #' To understand spatio-temporal changes in gene expression better we now overlay enriched kegg pathways with the -log10(q_value) of each contrast. The direction of the expression changes is encoded as color, whereby red indicates that sample_1 is overexpressed. Because we have multiple contrasts of interest, this defines a slice-color barcode for each gene. To relate the barcode to contrasts we define the following slice order: ## todo why is tidyr not processing an empty dataframe keggPathways <- enrResults %>% filter(ontology=="kegg") %$% ac(ID) %>% unique() # %>% head(3) ## todo remove debug head ##+ results='asis' #if(!exists("keggPathways") | nrow(keggPathways)==0){ # cat("No enriched pathways found") #} #+ echo=FALSE require_auto(pathview) require_auto(png) # keggPathways <- keggPathways[1] #if(nrow(keggPathways)==0){ # echo("no enriched kegg pathways were found in the dataset") #}else{ #keggOrCode <- "mmu" keggOrCode <- guess_pathview_species(geneLists$ensembl_gene_id) ## prepare p-value data sliceData <- overlayData %>% # dcast(ensembl_gene_id ~ comparison, value.var="plot_score") %>% column2rownames("ensembl_gene_id") #sliceData <- sliceData[,1:5] #sliceData %>% head %>% kable() data.frame(set=names(sliceData)) %>% mutate(slice_index=row_number()) %>% kable() pwPlotDir <-".pathways" dir.create(pwPlotDir) plot_pathway <- function(pathwayID, sliceData){ # pathwayID="mmu04015"; sliceData <- sliceData # browser() # echo("processing pathway", pathwayID) pv.out <- pathview( gene.data = sliceData, pathway.id = pathwayID, species = keggOrCode, # out.suffix = pathwayID$kegg.description, # out.suffix = pathwayID, multi.state = ncol(sliceData)>1, # kegg.native=F, # node.sum = "mean", # the method name to calculate node summary given that multiple genes or compounds are mapped to it limit = list(gene = c(-4,4)), gene.idtype="ensembl" ) if(is.numeric(pv.out) && pv.out==0) return(pathwayID) ## happens if pathview fails to parse xml e.g. for mmu01100 outfile <- paste0(pathwayID, ".pathview", ifelse(ncol(sliceData)>1, ".multi", ""), ".png") ## move pathway plots into figures sub-directory figuresPlotFile <- file.path(pwPlotDir, outfile) system(paste("mv", outfile, figuresPlotFile)) system(paste("rm", paste0(pathwayID, ".xml"), paste0(pathwayID, ".png"))) ## interactive plotting # ima <- readPNG(outfile) # plot.new() # lim <- par() # rasterImage(ima, lim$usr[1], lim$usr[3], lim$usr[2], lim$usr[4]) pv.out$plotfile=figuresPlotFile pv.out$pathway_id=pathwayID return(pv.out) } #keggPathways <- c("mmu04976", "mmu04972", "mmu04810", "mmu04520", "mmu04530", "mmu04270", "mmu04015") #keggPathways <- c("mmu00830") pathwayPlots <- keggPathways %>% llply(function(pathwayID){ plot_pathway(pathwayID, sliceData) }) save(pathwayPlots, file=".pathwayPlots.RData") # pathwayPlots <- local(get(load(".pathwayPlots.RData"))) ## remove and report failed ones failedPPs <- pathwayPlots[lapply(pathwayPlots, is.character) > 0] echo("the following pathways failed to render:", paste(failedPPs, collapse=", ")) pathwayPlots <- pathwayPlots[lapply(pathwayPlots, is.character) == 0] ## make sure that all files exist stopifnot(llply(pathwayPlots, function(x) file.exists(x$plotfile)) %>% unlist %>% all) ## prepare tooltips with expression scores ens2entrez <- quote({ # mart <- biomaRt::useDataset(guess_mart(geneLists$ensembl_gene_id), mart = biomaRt::useMart("ensembl")) ## todo fix this https://support.bioconductor.org/p/74322/ mart <- biomaRt::useDataset(guess_mart(geneLists$ensembl_gene_id), mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org")) biomaRt::getBM(attributes=c('ensembl_gene_id', 'entrezgene', 'external_gene_name'), mart=mart) %>% filter(!is.na(entrezgene)) }) %>% cache_it("ens2entrez") %>% distinct(ensembl_gene_id) #unlen(ens2entrez$ensembl_gene_id) toolTipData <- overlayData %>% left_join(ens2entrez) makeTooltip <- function(entrez_id){ toolTipData %>% filter(entrezgene==entrez_id) %>% dplyr::select(-entrezgene) %>% gather() %$% paste(key, value, sep=": ") %>% paste(collapse="\n") #%>% cat } #entrez_id=14679 #entrez_id=34234234234234 #makeTooltip("14679") #+ results="asis", echo=FALSE ## simple non-clickable plots ## http://stackoverflow.com/questions/12588323/r-how-to-extract-values-for-the-same-named-elements-across-multiple-objects-of #unlist(lapply(pathwayPlots, "[[", "plotfile")) # unlist(lapply(pathwayPlots, "[[", "plotfile")) %>% l_ply(function(pngFile){ cat(paste0("<img src='", pngFile, "'><br>"))}) #cat(" #<style type='text/css'> # img { # max-width: 100%; # } #</style> #") ## todo add tooltips with additinal info ## http://stackoverflow.com/questions/5478940/how-to-create-a-html-tooltip-on-top-of-image-map ## extended version with clickable links createImgMap <- function(plotData){ #browser() warning("compiling overlay toolips...") #pngFile="mmu04015.pathview.png" #plotData <- pathwayPlots[[1]] #plotData <- unlist(pathwayPlots) # pathway_id=(basename(pngFile) %>% str_split_fixed("[.]", 2))[,1] pngFile <- plotData$plotfile stopifnot(file.exists(pngFile)) pathway_id=plotData$pathway_id ## create link for image map keggNodes <- plotData$plot.data.gene %>% ## remove box offset mutate(x=x-22, y=y-8) %>% ## todo does not work for non-gene elements mutate(link=paste0("http://www.kegg.jp/dbget-bin/www_bget?", keggOrCode, ":", kegg.names )) %>% rowwise() %>% do({ curNode=.; mutate(as.df(curNode), tooltip=makeTooltip(as.integer(curNode$kegg.names)))}) ## create tooltip by using mapping to # "http://www.kegg.jp/dbget-bin/www_bget?mmu:16412 ## first the image itself # cat(paste0("<img usemap='#", pathway_id,"' src='", pngFile, "'><br>")) # cat(paste0("<p><div style='width: 2000px'><img style='float: left' usemap='#", pathway_id,"' src='", pngFile, "'></div><br>")) ## it seems that formatting works better without the left alignment cat(paste0("<p><div style='width: 2000px'><img usemap='#", pathway_id,"' src='", pngFile, "'></div><br>")) cat(paste0("<map name='", pathway_id,"'>")) #keggNodes %>% rowwise() %>% {curNode=.; cat(curNode$name)} #see http://www.html-world.de/180/image-map/ keggNodes %>% a_ply(1, function(curNode){ rectDef=with(curNode, paste(x, y, x+width, y+height, sep=",")) # paste0("<area href='", curNode$link, "' alt='Text' coords='", rectDef , "' shape='rect'>") %>% cat() paste0("<area href='", curNode$link, "' title='", curNode$tooltip, "' alt='Text' coords='",rectDef , "' shape='rect'>") %>% cat() }) cat("</map></p><br>") } pathwayPlots %>% l_ply(createImgMap) ## respin it for cild inclusion # require(knitr); setwd("/Volumes/projects/bioinfo/scripts/ngs_tools/dev/common"); spin("cp_enrichment.R", knit=F) #' ## Notes #' Other interesting tools to perform enrichment testing #' * [piano](http://www.bioconductor.org/packages/release/bioc/html/piano.html)