#!/usr/bin/env Rscript #+ echo=FALSE, error=F suppressMessages(require(docopt)) ## todo use textual input here for ease of use doc = ' Perform a gene feature enrichment analysis for a set of genes Usage: cp_enrichment.R [options] Options: --overlay_expr_data Tsv with overlay data for the kegg pathways --project Name to prefix all generated result files [default: ] --qcutoff Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01] ' opts = docopt(doc, commandArgs(TRUE)) ## does not work when spining # opts = docopt(doc, "--overlay_expr_data ../plot_score_matrix.txt ../degs_by_contrast.txt contrast" ) devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/core_commons.R") devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/ggplot_commons.R") devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/bio/diffex_commons.R") # devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/bio/cp_utils.R") # devtools::source_url("https://www.dropbox.com/s/p2b8luxf7jteb63/cp_utils.R?dl=1") load_pack(knitr) load_pack(DT) resultsBaseName = if (str_length(opts$project) > 0) paste0(opts$project, ".feat_enr") else "feat_enr." q_cutoff = as.numeric(opts$qcutoff) ## load the data geneLists = read_tsv(opts$gene_lists_tsv) group_col = opts$group_col geneLists %<>% group_by_(.dots = group_col) ######################################################################################################################## #' ## Enrichment Analysis #' Run configuration was vec_as_df(unlist(opts)) %>% filter(! str_detect(name, "^[=]")) %>% kable() #' This analysis was performed using [clusterProfiler](http://bioconductor.org/packages/release/bioc/html/clusterProfiler.html). The following ontologies were tested: Kegg, Go, Reactome, Dose, listLabels = geneLists %>% select(- ensembl_gene_id) %>% distinct listLabels %<>% transform(list_label = do.call(paste, c(listLabels, sep = "__"))) geneLists %>% inner_join(listLabels) %>% ggplot(aes(list_label)) + geom_bar() + coord_flip() + ggtitle("gene list sizes to be tested for term enrichment") + ylab("") ######################################################################################################################## #' ## Prepare gene feature data ## todo guess from data ens_build = "mar2017" # geneInfo = quote({ # mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = guess_mart(counts$ensembl_gene_id), host = paste0(ens_build, ".archive.ensembl.org"), path = "/biomart/martservice", archive = FALSE) # c("ensembl_gene_id", "external_gene_name", "start_position", "end_position") %>% # biomaRt::getBM(mart = mart) %>% # mutate(gene_length=start_position - end_position) %>% # tbl_df # }) %>% cache_it(paste0("geneInfo_", ens_build)) exSummary = quote({ mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = guess_mart(geneLists$ensembl_gene_id), host = paste0(ens_build, ".archive.ensembl.org"), path = "/biomart/martservice") # library(biomaRt) exons = biomaRt::getBM(attributes = c('ensembl_gene_id', 'ensembl_transcript_id', 'ensembl_exon_id', 'rank'), mart = mart) txInfo = biomaRt::getBM(attributes = c('ensembl_gene_id', 'ensembl_transcript_id', 'transcript_start', 'transcript_end'), mart = mart) longestTx = txInfo %>% group_by(ensembl_gene_id) %>% mutate(tx_length = transcript_end - transcript_start) %>% filter(max(tx_length) == tx_length) %>% slice(1) exons %>% group_by(ensembl_gene_id, ensembl_transcript_id) %>% summarize(num_exons = max(rank)) %>% ungroup %>% ## filter for longest transcript semi_join(longestTx) %>% dplyr::select(- ensembl_transcript_id) }) %>% cache_it(paste0("exInfo", ens_build)) # exSummary %>% count(num_exons) %>% n_as("genes_count") exSummary %>% filter(num_exons < 30 ) %>% ggplot(aes(num_exons)) + geom_bar() numExTERM2GENE = exSummary %>% filter(num_exons< 15) %>% transmute(term = paste(num_exons, "exons"), gene = ensembl_gene_id) numExTERM2NAME = numExTERM2GENE %>% distinct(term) %>% mutate(name = term) install_package("clusterProfiler") # require(clusterProfiler) ## todo part of core_commons v1.41 first_group = function(groupedDF){ # groupedDF = geneLists # groupedDF = iris %>% group_by(Species) # https://stackoverflow.com/questions/33775239/emulate-split-with-dplyr-group-by-return-a-list-of-data-frames (groupedDF %>% do(data = (.)) %$% map(data, identity))[[1]] } feat_enr_test = function(geneIds){ # groupDf = first_group(geneLists); geneIds = groupDf$ensembl_gene_id exonCountResults = clusterProfiler::enricher(gene = geneIds, qvalueCutoff = q_cutoff, TERM2GENE = numExTERM2GENE, TERM2NAME = numExTERM2NAME) %>% as.data.frame() # reactomeResults = ReactomePA::gsePathway(gene = geneIds, organism = cpSpecies) %>% summary() # goResultsBP = tryCatch({ clusterProfiler::gseGO(geneList = geneIds, OrgDb = annoDb, ont = "BP") %>% summary()}, error = function(err)noResults) enrResults = bind_rows( mutate(exonCountResults, ontology = "exon_counts") # mutate(goResultsCC, ontology = "go_cc") ) enrResults } enrResults = geneLists %>% do(feat_enr_test(.$ensembl_gene_id)) ## purrr but less concise # feat_enr_test(first_group(geneLists)$ensembl_gene_id) # enrResults = geneLists %>% # nest %>% # mutate(enr_results = map(data, ~ feat_enr_test(.x$ensembl_gene_id))) %>% # unnest(enr_results) ## remove to clumsy gene_id columns enrResults %<>% select(- geneID) write_tsv(enrResults, path = paste0(resultsBaseName, "enrResults.txt")) # enrResults = read.delim(paste0(resultsBaseName, "enrResults.txt")) #' [Enrichment Results](`r paste0(resultsBaseName, "enrResults.txt")`) load_pack(DT) datatable(enrResults) #enrResults %>% ggplot(aes(pvalue)) + geom_histogram() + scale_x_log10() #enrResults %>% ggplot(aes(ontology)) + facet_wrap(~cluster) + geom_bar() + rot_x_lab() #facetSpecs = paste("~", groups(geneLists) %>% ac %>% paste(collapse=" + ")) facetSpecs = paste("~", group_col %>% ac %>% paste(collapse = " + ")) #' Visualize term-pvalues per list #http://stackoverflow.com/questions/11028353/passing-string-variable-facet-wrap-in-ggplot-using-r enrResults %>% ggplot(aes(ontology)) + facet_wrap(as.formula(facetSpecs), ncol = 3) + geom_bar() + rot_x_lab() + ggtitle("enriched term counts by cluster") enrResults %<>% mutate(num_term_genes = str_split_fixed(BgRatio, fixed("/"), 2)[, 1] %>% as.numeric) #' Keep at max 20 terms for visualzation per group erPlotData = enrResults %>% group_by_(.dots = c(group_col)) %>% arrange(qvalue) %>% slice(1 : 20) %>% ## regroup because otherwise dplyr complains about corrupt df (which looks like a bug) group_by_(.dots = c(group_col)) #+ error=TRUE, echo=FALSE warning("dropping levels") erPlotData %<>% mutate(ontology = ac(ontology)) ## drop unsused level to get consistent color palette erPlotData %<>% rename(Term = Description) term_category_colors = create_palette(unique(ac(erPlotData$ontology))) figDir = "enr_charts" dir.create(figDir) ## chop and pad category names erPlotData %<>% mutate(fixed_width_term = str_sub(Term, 1, 70) %>% str_pad(70)) ## evaluate gene ratios strings into actual proportions erPlotData %<>% mutate(gene_ratio = map_dbl(GeneRatio, ~ eval(parse(text = .x)))) term_barplot_files = erPlotData %>% do({ # DEBUG enrResultsGrp = erPlotData %>% head(30) enrResultsGrp = . label = subset(enrResultsGrp, select = group_col)[1, 1] %>% as.matrix %>% ac ## new version using dotplot https://bioconductor.org/packages/devel/bioc/vignettes/clusterProfiler/inst/doc/clusterProfiler.html#dotplot enrPlot = enrResultsGrp %>% ungroup %>% ## fix factor order mutate(fixed_width_term = reorder(fixed_width_term, gene_ratio)) %>% ggplot(aes(fixed_width_term, gene_ratio, size = Count, fill = p.adjust, color = ontology)) + # ggplot(aes(fixed_width_term, Count)) + geom_point(pch = 21) + scale_color_manual(values = term_category_colors, drop = F, name = "Ontology") + scale_fill_gradient(low = "red", high = "white", name = "q-value") + coord_flip() + # xlab("Enriched Terms") + ggtitle(label) # scale_y_log10() ## todo use builtin method to create filesystem-compatible name fileNameLabel = label %>% str_replace_all("!=", "ne") %>% str_replace_all(">", "gt") %>% str_replace_all("<", "lt") %>% str_replace_all(fixed("&"), "AND") %>% str_replace_all(fixed("|"), "OR") %>% str_replace_all(" ", "_") # ggsave(paste0("enrichmed_terms__", fileNameLabel, ".pdf")) # print(enrPlot) stopifnot(str_length(fileNameLabel) > 0) tmpPng = paste0(figDir, "/enrterms__", fileNameLabel, ".png") ggsave(tmpPng, enrPlot, width = 10, height = 2 + round(nrow(enrResultsGrp) / 5), limitsize = FALSE) data.frame(file = tmpPng) }) #+ results="asis" walk(term_barplot_files$file, function(pngFile){ cat(paste0("
"))}) # install.packages("session") session::save.session(".cp_enrichment.dat"); # session::restore.session(".cp_enrichment.dat"); #' System info: #+ devtools::session_info()