david_enrichment.R 12.2 KB
Newer Older
Holger Brandl's avatar
Holger Brandl committed
1
#!/usr/bin/env Rscript
2
#+ echo=FALSE
Holger Brandl's avatar
Holger Brandl committed
3 4 5 6


suppressMessages(require(docopt))

7 8

# todo refac to use tsv input along with user provided grouping attribute
Holger Brandl's avatar
Holger Brandl committed
9 10 11 12 13 14 15 16 17
doc <- '
Perform an enrichment analysis for a set of genes
Usage: david_enrichment.R [options] <grouped_gene_lists_rdata>

Options:
--overlay_expr_data Tsv with overlay data for the kegg pathways
'

opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
18
#opts <- docopt(doc, "--overlay_expr_data ../ctrl_fc_expr_filtered.txt geneClusters.RData" )
19
#opts <- docopt(doc, "--overlay_expr_data ../ctrl_fc_expr_filtered.txt grpdGoiClusters.RData" )
Holger Brandl's avatar
Holger Brandl committed
20

21 22 23
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.12/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.12/R/ggplot_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.12/R/bio/diffex_commons.R")
24

25 26 27 28
source(file.path(Sys.getenv("NGS_TOOLS"), "common/david_enrichment_util.R"))
## todo used tagged version instead
# devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v12/common/david_enrichment_util.RR")

29 30 31 32 33
require.auto(knitr)

## to fix child support issue with knitr, see also
## http://stackoverflow.com/questions/20030523/knitr-nested-child-documents
## https://github.com/yihui/knitr/issues/38
34 35
# todo disabled because root.dir in parent document seems the only working solution
#if(exists('project_dir')) setwd(project_dir)
36 37 38 39
#print(getwd())

## load the data
geneLists <- local(get(load(opts$grouped_gene_lists_rdata)))
Holger Brandl's avatar
Holger Brandl committed
40

41
#geneLists %<>% filter(cluster %in% c("cluster_1", "cluster_2"))
Holger Brandl's avatar
Holger Brandl committed
42

43
if(!is.null(opts$overlay_expr_data)){
Holger Brandl's avatar
Holger Brandl committed
44 45 46 47 48 49
    overlayData <- read.delim(opts$overlay_expr_data)
}


resultsBaseName=basename(opts$grouped_gene_lists_rdata) %>% trim_ext(".RData") %>% paste0(".")

50 51
########################################################################################################################
#' ## Enrichment Analysis
Holger Brandl's avatar
Holger Brandl committed
52

53 54 55 56 57 58 59 60 61 62
#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(DEF_DAVID_ONTOLOGIES, collapse=', ')`

listLabels <- geneLists %>% select(-ensembl_gene_id) %>% distinct
listLabels %<>% transform(list_label=do.call(paste, c(listLabels, sep="__")))

geneLists %>% inner_join(listLabels) %>%
    ggplot(aes(list_label)) +
    geom_bar() +
    coord_flip() +
    ggtitle("gene list sizes to be tested for term enrichment") +
63
    ylab("")
Holger Brandl's avatar
Holger Brandl committed
64 65


66 67 68
#enrResults <- geneLists %>% do(davidAnnotationChart(.$ensembl_gene_id))
enrResults <- quote(geneLists %>% do(davidAnnotationChart(.$ensembl_gene_id))) %>%
    cache_it(paste0("enrdata_", digest(geneLists)))
Holger Brandl's avatar
Holger Brandl committed
69 70 71


write.delim(enrResults, file=paste0(resultsBaseName, "enrResults.txt"))
72 73
# enrResults <- read.delim(paste0(resultsBaseName, "enrResults.txt"))
#'  [Enrichment Results](`r paste0(resultsBaseName, "enrResults.txt")`)
Holger Brandl's avatar
Holger Brandl committed
74 75 76

sigEnrResults <- subset(enrResults, Bonferroni <0.01)

77 78 79
nrow(enrResults)
nrow(sigEnrResults)

Holger Brandl's avatar
Holger Brandl committed
80
write.delim(sigEnrResults, file=paste0(resultsBaseName, "sigEnrResults.txt"))
81
# sigEnrResults <- read.delim(paste0(resultsBaseName, "sigEnrResults.txt"))
82
#' [Very Significant Terms](`r paste0(resultsBaseName, "sigEnrResults.txt")`)
Holger Brandl's avatar
Holger Brandl committed
83

84
#+ include=FALSE, eval=FALSE
Holger Brandl's avatar
Holger Brandl committed
85 86 87 88 89 90 91 92 93 94
## plot the enrichment results
#sigEnrResults %>% group_by(Category, add=T) %>% do({
#    logPlot <- . %>% ggplot(aes(Term, PValue)) +
#	    geom_bar(stat="identity")+coord_flip() +
#	    xlab("Enriched Terms") +
#	    ggtitle(.$Category[1]) +
#	    scale_y_log10()
#	    print(logPlot)
#})

95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
#sigEnrResults %>%
##    select(-Genes) %>%
#    do({
#    enrResultsGrp <- .
#    ## DEBUG enrResultsGrp <- sigEnrResults
##    geneLists %>% first(1) %>% select(-ensembl_gene_id) %>% paste0(., collapse="_")
##browser()
#    label <- geneLists %>% semi_join(enrResultsGrp) %>% first(1) %>% dplyr::select(-ensembl_gene_id) %>% paste0(., collapse="_")
#
#    echo("processing", label)
#
#    logPlot <- enrResultsGrp %>%
#        ## fix factor order
#        mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(Category))) %>%
#        ggplot(aes(Term, PValue, fill=Category)) +
#	    geom_bar(stat="identity")+
#	    coord_flip() +
#	    xlab("Enriched Terms") +
#	    ggtitle(label) +
#	    scale_y_log10()
#
#	    ggsave(paste0(resultsBaseName, label, ".enrichmed_terms.pdf"))
#	    print(logPlot)
#})
##ggsave2()

121 122 123
## include=FALSE, error=TRUE

#+ error=TRUE, echo=FALSE
124 125 126 127 128
warning("dropping levels")
sigEnrResults %<>% mutate(Category=ac(Category)) ## drop unsused level to get consistent color palette

term_category_colors <- create_palette(unique(ac(sigEnrResults$Category)))

129 130
dir.create("figures")

131 132 133 134
term_barplot_files <- sigEnrResults %>%
    ## chop and pad category names
    mutate(Term=str_sub(Term, 1, 70) %>% str_pad(70)) %>%
do({
Holger Brandl's avatar
Holger Brandl committed
135 136
    enrResultsGrp <- .
    ## DEBUG enrResultsGrp <- sigEnrResults
137
    label <- geneLists %>% semi_join(enrResultsGrp) %>% first(1) %>% dplyr::select(-ensembl_gene_id) %>% paste0(., collapse="_")
Holger Brandl's avatar
Holger Brandl committed
138

139
#    warning(paste0("processing terms", paste(ac(unique(enrResultsGrp$Category)), collapse=",")))
Holger Brandl's avatar
Holger Brandl committed
140 141
    logPlot <- enrResultsGrp %>%
        ## fix factor order
142
        mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(as.factor(Category)))) %>%
Holger Brandl's avatar
Holger Brandl committed
143
        ggplot(aes(Term, PValue, fill=Category)) +
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
        geom_bar(stat="identity")+
        scale_fill_manual(values = term_category_colors, drop=F, name="Ontology") +
        coord_flip() +
        xlab("Enriched Terms") +
        ggtitle(label) +
        scale_y_log10()

        # print(logPlot)

    ## todo  use builtin method to create filesystem-compatible name
    fileNameLabel <- label %>%
        str_replace_all("!=", "ne") %>%
        str_replace_all(">", "gt") %>%
        str_replace_all("<", "lt") %>%
        str_replace_all(fixed("&"), "AND") %>%
        str_replace_all(fixed("|"), "OR") %>%
        str_replace_all(" ", "_")

#        ggsave(paste0("enrichmed_terms__", fileNameLabel, ".pdf"))
#        print(logPlot)

165
     tmpPng <- paste0("figures/enrterms__", fileNameLabel, ".png")
166 167
     ggsave(tmpPng, logPlot, width=10, height = 2+round(nrow(enrResultsGrp)/5), limitsize=FALSE)
     data.frame(file=tmpPng)
Holger Brandl's avatar
Holger Brandl committed
168
})
169 170 171 172 173

#+ results="asis"
l_ply(term_barplot_files$file, function(pngFile){ cat(paste0("<img src='", pngFile, "'><br>"))})

########################################################################################################################
174 175
# ' ## Enriched KEGG Pathways
#+ eval=nrow(sigEnrResults %>% filter(Category=="KEGG_PATHWAY")) >0
176 177 178

#' To understand spatio-temporal changes in gene expression better we now overlay enriched kegg pathways with the -log10(q_value) of each contrast. The direction of the expression changes is encoded as color, whereby red indicates that sample_1 is overexpressed. Because we have multiple contrasts of interest, this defines a slice-color barcode for each gene. To relate the barcode to contrasts we define the following slice order:

179
## todo why is tidyr not processing an empty dataframe
Holger Brandl's avatar
Holger Brandl committed
180 181 182
keggPathways <- sigEnrResults %>%
    filter(Category=="KEGG_PATHWAY") %>%
    separate(Term, c('kegg_pathway_id', 'pathway_description'), sep="\\:", remove=F) %>%
183
    with(kegg_pathway_id) %>% ac() %>% unique()
Holger Brandl's avatar
Holger Brandl committed
184

185 186 187 188 189 190
##+ results='asis'
#if(!exists("keggPathways") | nrow(keggPathways)==0){
#    cat("No enriched pathways found")
#}

#+ echo=FALSE
Holger Brandl's avatar
Holger Brandl committed
191 192 193
require(pathview)
require(png)

194 195
# keggPathways <- keggPathways[1]

Holger Brandl's avatar
Holger Brandl committed
196 197 198 199
#if(nrow(keggPathways)==0){
#    echo("no enriched kegg pathways were found in the dataset")
#}else{

200 201
#keggOrCode <- "mmu"
keggOrCode <- guess_pathview_species(geneLists$ensembl_gene_id)
Holger Brandl's avatar
Holger Brandl committed
202 203 204 205 206 207 208 209 210 211 212 213 214 215

## prepare p-value data
sliceData <- overlayData %>%
#    dcast(ensembl_gene_id ~ comparison, value.var="plot_score") %>%
    column2rownames("ensembl_gene_id")

#sliceData %>% head %>% kable()

data.frame(set=names(sliceData)) %>% mutate(slice_index=row_number()) %>% kable()


plot_pathway <- function(pathwayID, overlayData){
#    pathwayID="mmu04015"
#    browser()
216
#    echo("processing pathway", pathwayID)
Holger Brandl's avatar
Holger Brandl committed
217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
    pv.out <- pathview(
            gene.data = overlayData,
            pathway.id = pathwayID,
            species = keggOrCode,
#            out.suffix = pathwayID$kegg.description,
#            out.suffix = pathwayID,
            multi.state = ncol(overlayData)>1,
#            kegg.native=F,
#            node.sum = "mean", # the method name to calculate node summary given that multiple genes or compounds are mapped to it
            limit = list(gene = c(-4,4)),
            gene.idtype="ensembl"
    )

    outfile <- paste0(pathwayID, ".pathview", ifelse(ncol(overlayData)>1, ".multi", ""), ".png")

232 233 234 235
    ## move pathway plots into figures sub-directory
    figuresPlotFile <- file.path("figures", outfile)
    system(paste("mv", outfile, figuresPlotFile))

Holger Brandl's avatar
Holger Brandl committed
236
    system(paste("rm", paste0(pathwayID, ".xml"), paste0(pathwayID, ".png")))
237

Holger Brandl's avatar
Holger Brandl committed
238 239 240 241 242 243
    ## interactive plotting
#    ima <- readPNG(outfile)
#    plot.new()
#    lim <- par()
#    rasterImage(ima, lim$usr[1], lim$usr[3], lim$usr[2], lim$usr[4])

244
    pv.out$plotfile=figuresPlotFile
Holger Brandl's avatar
Holger Brandl committed
245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263
    pv.out$pathway_id=pathwayID

    return(pv.out)
}

#keggPathways <- c("mmu04976", "mmu04972", "mmu04810", "mmu04520", "mmu04530", "mmu04270", "mmu04015")
#keggPathways <- c("mmu04015")



pathwayPlots <- keggPathways %>% llply(function(pathwayID){
    plot_pathway(pathwayID, sliceData)
})


save(pathwayPlots, file=".pathwayPlots.RData")
# pathwayPlots <- local(get(load("pathwayPlots.RData")))


264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281
## prepare tooltips with expression scores
ens2entrez <- quote({
    mart <- biomaRt::useDataset(guess_mart(geneLists$ensembl_gene_id), mart = biomaRt::useMart("ensembl"))
    biomaRt::getBM(attributes=c('ensembl_gene_id', 'entrezgene', 'external_gene_name'), mart=mart) %>% filter(!is.na(entrezgene))
}) %>% cache_it("ens2entrez") %>% distinct(ensembl_gene_id)
#unlen(ens2entrez$ensembl_gene_id)

toolTipData <- overlayData %>% left_join(ens2entrez)

makeTooltip <- function(entrez_id){
    toolTipData %>% filter(entrezgene==entrez_id) %>% dplyr::select(-entrezgene) %>% gather() %$% paste(key, value, sep=": ") %>% paste(collapse="\n") #%>% cat
}
#entrez_id=14679
#entrez_id=34234234234234
#makeTooltip("14679")



282
#+ results="asis", echo=FALSE
Holger Brandl's avatar
Holger Brandl committed
283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302

## simple non-clickable plots
## http://stackoverflow.com/questions/12588323/r-how-to-extract-values-for-the-same-named-elements-across-multiple-objects-of
#unlist(lapply(pathwayPlots, "[[", "plotfile"))
# unlist(lapply(pathwayPlots, "[[", "plotfile")) %>% l_ply(function(pngFile){ cat(paste0("<img src='", pngFile, "'><br>"))})

#cat("
#<style type='text/css'>
# img {
#   max-width: 100%;
# }
#</style>
#")

## todo add tooltips with additinal info
## http://stackoverflow.com/questions/5478940/how-to-create-a-html-tooltip-on-top-of-image-map

## extended version with clickable links
pathwayPlots %>% l_ply(function(plotData){
    #pngFile="mmu04015.pathview.png"
303 304
    #plotData <- pathwayPlots[[1]]
    #plotData <- unlist(pathwayPlots)
Holger Brandl's avatar
Holger Brandl committed
305 306 307 308
#    pathway_id=(basename(pngFile) %>% str_split_fixed("[.]", 2))[,1]
    pngFile <- plotData$plotfile
    pathway_id=plotData$pathway_id

309
    ## create link for image map
Holger Brandl's avatar
Holger Brandl committed
310 311 312
    keggNodes <- plotData$plot.data.gene %>%
        ## remove box offset
        mutate(x=x-22, y=y-8) %>% ## todo does not work for non-gene elements
313 314 315 316 317
        mutate(link=paste0("http://www.kegg.jp/dbget-bin/www_bget?", keggOrCode, ":", kegg.names )) %>%
        rowwise() %>% do({ curNode=.; mutate(as.df(curNode), tooltip=makeTooltip(as.integer(curNode$kegg.names)))})

    ## create tooltip by using mapping to

Holger Brandl's avatar
Holger Brandl committed
318 319 320 321 322

#    "http://www.kegg.jp/dbget-bin/www_bget?mmu:16412

    ## first the image itself
#    cat(paste0("<img usemap='#", pathway_id,"' src='", pngFile, "'><br>"))
323
    cat(paste0("<p><div style='width: 2000px'><img style='float: left' usemap='#", pathway_id,"' src='", pngFile, "'></div><br>"))
Holger Brandl's avatar
Holger Brandl committed
324 325 326 327 328 329
    cat(paste0("<map name='", pathway_id,"'>"))
    #keggNodes %>% rowwise() %>% {curNode=.; cat(curNode$name)}

    #see http://www.html-world.de/180/image-map/
    keggNodes %>% a_ply(1, function(curNode){
        rectDef=with(curNode, paste(x, y, x+width, y+height, sep=","))
330 331 332
#        paste0("<area href='", curNode$link, "' alt='Text' coords='", rectDef , "' shape='rect'>") %>% cat()
        paste0("<area href='", curNode$link, "' title='", curNode$tooltip, "' alt='Text' coords='",rectDef , "' shape='rect'>") %>% cat()

Holger Brandl's avatar
Holger Brandl committed
333
    })
334
    cat("</map></p><br>")
Holger Brandl's avatar
Holger Brandl committed
335 336
})

337 338
## respin it for cild inclusion
# require(knitr); setwd("/Volumes/projects/bioinfo/scripts/ngs_tools/dev/common"); spin("david_enrichment.R", knit=F)