started improved dge report

bfbb3ba6 · Holger Brandl · 79277551 · bfbb3ba6
Commit bfbb3ba6 authored 10 years ago by Holger Brandl
--- a/dge_workflow/dge_analysis_new.R
+++ b/dge_workflow/dge_analysis_new.R
+#!/usr/bin/env Rscript
+#+ setup, cache=FALSE
+
+suppressMessages(require(docopt))
+
+doc <- '
+Convert a cuffdiff into a dge-report
+Usage: dge_analysis.R [options] <cuffdb_directory>
+
+Options:
+--constrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
+--undirected           Perform directed dge-steps in a directed manner
+--qcutoff <qcutoff>    Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
+--pcutoff <pcutoff>    Override q-value filter and filter by p-value instead
+--minfpkm <minfpkm>   Minimal expression in any of the sample to be included in the final result list [default: 1]
+-S                     Dont do general statistics and qc analysis
+'
+
+opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
+# opts <- docopt(doc, "--undirected  ..")
+# opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), ".."))
+## todo use commandArgs here!!
+#opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), paste(commandArgs(TRUE), collapse=" ")))
+#opts
+
+skipQC <<- opts$S
+split_hit_list <- !opts$undirected
+constrasts_file <- opts$constrasts
+pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff)
+qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff)
+minFPKM <- as.numeric(opts$minfpkm)
+
+stopifnot(is.numeric(qcutoff) | is.numeric(pcutoff))
+
+cuffdb_directory <- normalizePath(opts$cuffdb_directory)
+
+if(is.na(file.info(cuffdb_directory)$isdir)){
+    stop(paste("db directory does not exist", doc))
+}
+
+
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.5/R/core_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.5/R/ggplot_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.5/R/bio/diffex_commons.R")
+
+require(cummeRbund)
+
+require(knitr)
+
+
+#knitr::opts_knit$set(root.dir = getwd())
+
+########################################################################################################################
+#' # Differential Gene Expression Analysis
+
+print("Configuration")
+vec2df(unlist(opts)) %>% filter(!str_detect(name, "^[<-]")) %>% kable()
+
+#' [CuffDiff](http://cufflinks.cbcb.umd.edu/manual.html) and [cummeRbund](http://compbio.mit.edu/cummeRbund/) were used to perform this analysis. For details about how the test for differntial expression works, refer to [Differential analysis of gene regulation at transcript resolution with RNA-seq](http://www.nature.com/nbt/journal/v31/n1/full/nbt.2450.html).
+
+#print(paste("db dir is", cuffdb_directory))
+
+#' Used cuffdiff database in `r cuffdb_directory`
+
+#+ load_db, cache=FALSE
+
+isCluster=Sys.getenv("LSF_SERVERDIR")!=""
+dbTempCopyRequired=isCluster | str_detect(normalizePath(cuffdb_directory), fixed("/Volumes/projects"))
+
+if(dbTempCopyRequired){
+    ## workaround until sqlite  is fixed on lustre
+    tmpdir <- tempfile("cuffdb_dir")
+    system(paste("cp -r", cuffdb_directory, tmpdir))
+    cuff <- readCufflinks(dir=tmpdir)
+}else{
+    cuff <- readCufflinks(dir="..")
+}
+
+
+cuff
+runInfo(cuff)
+replicateInfo <- replicates(cuff) %>% mutate(file=basename(file)) %>% select(-c(total_mass, norm_mass, internal_scale, external_scale))
+replicateInfo
+
+write.delim(replicateInfo, file="replicate_info.txt")
+# replicateInfo <- read.delim("replicate_info.txt")
+#'  [replicateInfo](replicate_info.txt)
+
+## add gene description
+martName <- guess_mart(fpkm(genes(cuff))$gene_id)
+cacheFile <- paste0("geneInfo.",martName, ".RData")
+if(!file.exists(cacheFile)){
+    require(biomaRt)
+
+    mousemart = useDataset(martName, mart=useMart("ensembl"))
+    geneInfo <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'), mart=mousemart);
+    save(geneInfo, file=cacheFile)
+    unloadNamespace('biomaRt')
+}else{
+    geneInfo <- local(get(load(cacheFile)))
+}
+
+#+ eval=skipQC
+#hello hidden field
+
+#######################################################################################################################
+#' ## Count and Expression Score Tables
+
+#colnames(gFpkmMatrix) <- ifelse(is.na(sampleDic[colnames(gFpkmMatrix)]), colnames(gFpkmMatrix), sampleDic[colnames(gFpkmMatrix)])
+fpkmMatrix(genes(cuff)) %>%
+    rownames2column("ensembl_gene_id") %>%
+    merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
+    print_head() %>%
+    write.delim(file="gene_fpkms.txt")
+# gene_fpkms <- read.delim("gene_fpkms.txt")
+#' [Annotated Expresssion Table](gene_fpkms.txt)
+
+## export the same but now including replicate information
+
+repFpkmMatrix(genes(cuff)) %>%
+    rownames2column("ensembl_gene_id") %>%
+    merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
+    print_head() %>%
+    write.delim(file="gene_fpkms_by_replicate.txt")
+# gene_fpkms_by_replicate <- read.delim("gene_fpkms_by_replicate.txt")
+#' [Annotated Expresssion Table](gene_fpkms_by_replicate.txt)
+
+## also export count tables
+countMatrix(genes(cuff)) %>%
+    rownames2column("ensembl_gene_id") %>%
+    merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
+    write.delim(file="gene_counts.txt")
+
+#'  [Gene Counts](gene_counts.txt)
+
+repCountMatrix(genes(cuff)) %>%
+    rownames2column("ensembl_gene_id") %>%
+    merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
+    write.delim(file="gene_counts_by_replicate.txt")
+# repCounts <- read.delim("replicateCounts.txt")
+#'  [Gene Counts by Replicate](replicateCounts.txt)
+
+##ggplot(geneFpkm, aes(fpkm)) + geom_histogram() + scale_x_log10()
+
+
+#######################################################################################################################
+#' ## Score Distributions
+
+
+#if(!as.logical(opts$S)){
+
+# create some global summary statistic plots
+dispersionPlot(genes(cuff)) #+ aes(alpha=0.01)
+
+csDensity(genes(cuff))
+#ggsave2(csBoxplot(genes(cuff))+ ggtitle("fpkm boxplots"))
+
+csDensity(genes(cuff),features=T)
+csBoxplot(genes(cuff),replicates=T)
+
+fpkmSCVPlot(genes(cuff))
+
+
+#csScatter(genes(cuff), "a", "d", smooth=F) + aes(color=abs(log2(a/b))>2) + scale_colour_manual(values=c("darkgreen", "red"))
+
+#MAplot(genes(cuff), "a", "d")+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: a/d")
+#MAplot(genes(cuff), "a", "b", useCount=T) #+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: Luc/KD")
+
+
+#csVolcano(genes(cuff),"aRGm", "N") + ggtitle("odd line-shaped fan-out in volcano plot")
+
+## todo consider to add more pretty volcano plots (see stefania's extra plots
+
+
+#csVolcano(genes(cuff),"a", "b")
+## todo replace or complement with marta plot
+csVolcanoMatrix(genes(cuff))
+#csVolcano(genes(cuff),"a", "b")
+
+#######################################################################################################################
+#' ## Replicate Correlation and Clustering
+
+require.auto(edgeR)
+plotMDS(repFpkmMatrix(genes(cuff)), top=500, main="top500 MDS")
+
+
+if(unlen(replicates(cuff)$sample)>2){  ## >2 only, because MDS will fail with just 2 samples
+    MDSplot(genes(cuff)) + ggtitle("mds plot")
+}
+
+MDSplot(genes(cuff), replicates=T) + ggtitle("mds plot")
+
+## todo replace with marta plot
+#PCAplot(genes(cuff),"PC1","PC2",replicates=T)
+
+noTextLayer <- function(gg){ gg$layers[[2]] <- NULL;; gg}
+
+
+## todo use fix scaled here
+noTextLayer(csDistHeat(genes(cuff)) +  ggtitle("Sample correlation"))
+
+noTextLayer(csDistHeat(genes(cuff),replicates=T) +  ggtitle("Replicate Correlation")) #+ scale_fill_gradientn(colours=c(low='lightyellow',mid='orange',high='darkred'), limits=c(0,0.15))
+
+csDendro(genes(cuff),replicates=T)
+
+#pdf(paste0(basename(getwd()),"_rep_clustering.pdf"))
+#csDendro(genes(cuff),replicates=T)
+#dev.off()
+
+
+#' Also cluster expression filtered data
+exprGenes <- repFpkmMatrix(genes(cuff)) %>%
+    rownames2column("ensembl_gene_id") %>%
+    melt(value.name="fpkm") %>%
+    group_by(ensembl_gene_id) %>%
+    ## apply log trafo
+    mutate(log_fpkm=log10(fpkm+1)) %>%
+    ## apply expression filter
+    filter(max(log_fpkm)>1) %>%
+    dcast( ensembl_gene_id ~ variable, value.var="log_fpkm") %>%
+    column2rownames("ensembl_gene_id")
+
+res<-JSdist(makeprobs(exprGenes)) %>% hclust() %>% as.dendrogram()
+
+par(mar=c(10,4,4,4))
+plot(res, main="Replicate Clustering of Expressed Genes")
+
+
+#######################################################################################################################
+#' ## Extract lists of differentially expressed genes
+
+if(!is.null(constrasts_file)){
+    echo("using contrasts matrix from: ", basename(constrasts_file))
+
+    compPairsUniDir <- read.csv(constrasts_file, header=F) %>% set_names(c("sample_1", "sample_2"))
+
+    ## add other direction for completeness
+    compPairs <- rbind_list(compPairsUniDir, compPairsUniDir %>% select(sample_1=sample_2, sample_2=sample_1))
+}else{
+    echo("comparing all samples against each other")
+    compPairs <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct()
+}
+
+
+allDiff <- diffData(genes(cuff)) %>%
+    ## just keep pairs of interest
+    merge(compPairs) %>%
+
+    ##  discard all genes that are not expressed in any of the cell types (>1)
+    filter(gene_id %in% getExpressedGenes(cuff, minFPKM=minFPKM)) %>%
+    dplyr::rename(ensembl_gene_id=gene_id) %>%
+    mutate(sample_1_overex=log2_fold_change<0)
+
+#getExpressedGenes(cuff, minFPKM=minFPKM) %>% writeLines(paste0("genes_expressed_gt_", minFPKM[1], "_in_at_least_one_sample.txt"))
+
+# diffData <- read.delim("diffData.txt")
+#'  [diffData](diffData.txt)
+
+## ' Used cutoff criterion was `r if(use_pcutoff) echo("p_value<0.01") else echo("q_value<=0.01")`
+
+#+ results='asis'
+if(!is.null(qcutoff)){
+    echo("Using q-value cutoff of", qcutoff)
+    allDiff <- transform(allDiff, is_hit=q_value<=qcutoff)
+}else{
+    echo("Using p-value cutoff of", pcutoff)
+    allDiff <- transform(allDiff, is_hit=p_value<=pcutoff)
+}
+
+## save as reference
+merge(allDiff, geneInfo, by="ensembl_gene_id", all.x=T) %>% write.delim(file="diffData.txt")
+
+
+#+
+# #' Used cutoff criterion was: q_value<0.01
+# allDiff <- transform(allDiff, is_hit=q_value<0.01)
+
+
+degs <- subset(allDiff, is_hit)
+#degs <- subset(allDiff, significant=="yes")
+
+
+## todo embedd as paginated DataTable
+
+with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0)
+
+## add gene info
+degs <- merge(degs, geneInfo, by="ensembl_gene_id", all.x=T)
+
+
+write.delim(degs, file="degs.txt")
+# degs <- read.delim("degs.txt")
+#' [dge table](degs.txt)
+
+ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DGE count summary") + coord_flip()
+
+ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DGEs") + ggtitle("DGE counts by direction") + coord_flip()
+
+## just needed to restore environment easily
+save(degs, file=".degs.RData")
+# degs <- local(get(load(".degs.RData")))
+
+
+
+########################################################################################################################
+#' ## Term enrichment
+#+ echo=FALSE
+
+#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
+
+geneLists <- degs %>%
+#    transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2, "ovex", sample_1_overex, sep="_"))
+    transmute(ensembl_gene_id, list_id=paste(sample_1, "vs", sample_2))
+
+grpdDegs <- if(split_hit_list){
+    degs %>% group_by(sample_1, sample_2, sample_1_overex)
+}else{
+    degs %>% group_by(sample_1, sample_2)
+}
+
+enrResults <- grpdDegs %>% do(davidAnnotationChart(.$ensembl_gene_id))
+
+
+write.delim(enrResults, file="enrResults.txt")
+# enrResults <- read.delim("enrResults.txt")
+#'  [Enrichment Results](enrResults.txt)
+
+sigEnrResults <- subset(enrResults, Bonferroni <0.01)
+
+write.delim(sigEnrResults, file="sigEnrResults.txt")
+# sigEnrResults <- read.delim("sigEnrResults.txt")
+#'  [Very Significant Terms](sigEnrResults.txt)
+
+
+## plot the enrichment results
+#sigEnrResults %>% group_by(Category, add=T) %>% do({
+#    logPlot <- . %>% ggplot(aes(Term, PValue)) +
+#	    geom_bar(stat="identity")+coord_flip() +
+#	    xlab("Enriched Terms") +
+#	    ggtitle(.$Category[1]) +
+#	    scale_y_log10()
+#	    print(logPlot)
+#})
+
+sigEnrResults %>% do({
+    enrResultsGrp <- .
+    ## DEBUG enrResultsGrp <- sigEnrResults
+    label <-
+    dplyr::select(enrResultsGrp, matches(ifelse(split_hit_list, "sample_1|sample_2|sample_1_overex", "sample_1|sample_2"))) %>%
+      head(1) %>% apply(1, paste, collapse="_vs_")
+
+    echo("processing", label)
+
+    logPlot <- enrResultsGrp %>%
+        ## fix factor order
+        mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(Category))) %>%
+        ggplot(aes(Term, PValue, fill=Category)) +
+	    geom_bar(stat="identity")+
+	    coord_flip() +
+	    xlab("Enriched Terms") +
+	    ggtitle(label) +
+	    scale_y_log10()
+
+	    ggsave(paste0(label, " enrichmed terms.pdf"))
+	    print(logPlot)
+})
+#ggsave2()
+
+#ggsave2(ggplot(sigEnrResults, aes(set)) + geom_bar() + ggtitle("enriched term frequencies")) # + facet_wrap(~site_type))
+
+#sigEnrResults <- arrange(sigEnrResults, site_type, set, -Bonferroni)
+#sigEnrResults$Term <-lockFactorOrder(sigEnrResults$Term)
+
+#ggplot(sigEnrResults, aes(Term, -log10(Bonferroni))) +coord_flip() + geom_bar() + facet_wrap(~site_type + set)
+#ggplot(sigEnrResults, aes(reorder(set, -Bonferroni), -log10(Bonferroni), label=Term)) + geom_text(alpha=0.6, size=3) + ggtitle("enriched terms by set") + scale_y_log10()
+#ggsave2(width=14, height=12)
+
+#write.table(sigEnrResults, file=paste0("sigEnrTerms.txt"), row.names=FALSE,  sep="\t")
+
+#with(sigEnrResults, as.data.frame(table(Category)))
+keggPathways <- sigEnrResults %>%
+    filter(Category=="KEGG_PATHWAY") %>%
+    separate(Term, c('kegg_pathway_id', 'pathway_description'), sep="\\:", remove=F) %>%
+    with(kegg_pathway_id) %>% ac()
+
+########################################################################################################################
+#' ## Enriched KEGG Pathways
+
+require(pathview)
+require(png)
+
+#if(nrow(keggPathways)==0){
+#    echo("no enriched kegg pathways were found in the dataset")
+#}else{
+
+keggOrCode <- "mmu"
+
+## prepare p-value data
+sliceData <- allDiff %>%
+    mutate(
+        ## define slice labels (needed for casting)
+        comparison=paste0(sample_1, "__vs__", sample_2),
+        ## create normalized linear score for slice plotting
+        sample_1_overex=log2_fold_change<0,
+        plot_score=-log10(q_value)*ifelse(sample_1_overex, 1, -1)
+    ) %>%
+    dcast(ensembl_gene_id ~ comparison, value.var="plot_score") %>%
+    column2rownames("ensembl_gene_id")
+
+#sliceData %>% head %>% kable()
+
+data.frame(set=names(sliceData)) %>% mutate(slice_index=row_number()) %>% kable()
+
+
+plot_pathway <- function(pathwayID, overlayData){
+#    pathwayID="mmu04015"
+#    browser()
+    echo("processing pathway", pathwayID)
+    pv.out <- pathview(
+            gene.data = overlayData,
+            pathway.id = pathwayID,
+            species = keggOrCode,
+#            out.suffix = pathwayID$kegg.description,
+#            out.suffix = pathwayID,
+            multi.state = ncol(overlayData)>1,
+#            kegg.native=F,
+#            node.sum = "mean", # the method name to calculate node summary given that multiple genes or compounds are mapped to it
+            limit = list(gene = c(-4,4)),
+            gene.idtype="ensembl"
+    )
+
+    outfile <- paste0(pathwayID, ".pathview", ifelse(ncol(overlayData)>1, ".multi", ""), ".png")
+
+    ## interactive plotting
+#    ima <- readPNG(outfile)
+#    plot.new()
+#    lim <- par()
+#    rasterImage(ima, lim$usr[1], lim$usr[3], lim$usr[2], lim$usr[4])
+
+    pv.out$plotfile=outfile
+    pv.out$pathway_id=pathwayID
+
+    return(pv.out)
+}
+
+#keggPathways <- c("mmu04976", "mmu04972", "mmu04810", "mmu04520", "mmu04530", "mmu04270", "mmu04015")
+#keggPathways <- c("mmu04015")
+
+
+
+pathwayPlots <- keggPathways %>% llply(function(pathwayID){
+    plot_pathway(pathwayID, sliceData)
+})
+
+
+save(pathwayPlots, file=".pathwayPlots.RData")
+# pathwayPlots <- local(get(load("pathwayPlots.RData")))
+
+
+#+ results="asis"
+
+## simple non-clickable plots
+## http://stackoverflow.com/questions/12588323/r-how-to-extract-values-for-the-same-named-elements-across-multiple-objects-of
+#unlist(lapply(pathwayPlots, "[[", "plotfile"))
+# unlist(lapply(pathwayPlots, "[[", "plotfile")) %>% l_ply(function(pngFile){ cat(paste0("<img src='", pngFile, "'><br>"))})
+
+#cat("
+#<style type='text/css'>
+# img {
+#   max-width: 100%;
+# }
+#</style>
+#")
+
+## todo add tooltips with additinal info
+## http://stackoverflow.com/questions/5478940/how-to-create-a-html-tooltip-on-top-of-image-map
+
+## extended version with clickable links
+pathwayPlots %>% l_ply(function(plotData){
+    #pngFile="mmu04015.pathview.png"
+#    pathway_id=(basename(pngFile) %>% str_split_fixed("[.]", 2))[,1]
+    pngFile <- plotData$plotfile
+    pathway_id=plotData$pathway_id
+
+    keggNodes <- plotData$plot.data.gene %>%
+        ## remove box offset
+        mutate(x=x-22, y=y-8) %>% ## todo does not work for non-gene elements
+        mutate(link=paste0("http://www.kegg.jp/dbget-bin/www_bget?", keggOrCode, ":", kegg.names ))
+
+#    "http://www.kegg.jp/dbget-bin/www_bget?mmu:16412
+
+    ## first the image itself
+#    cat(paste0("<img usemap='#", pathway_id,"' src='", pngFile, "'><br>"))
+    cat(paste0("<div style='width: 2000px'><img style='float: left' usemap='#", pathway_id,"' src='", pngFile, "'></div><br>"))
+    cat(paste0("<map name='", pathway_id,"'>"))
+    #keggNodes %>% rowwise() %>% {curNode=.; cat(curNode$name)}
+
+    #see http://www.html-world.de/180/image-map/
+    keggNodes %>% a_ply(1, function(curNode){
+        rectDef=with(curNode, paste(x, y, x+width, y+height, sep=","))
+        paste0("<area href='", curNode$link,"' alt='Text' coords='",rectDef , "' shape='rect'>") %>% cat()
+    })
+    cat("</map>")
+})
+
+#}
+
+
+
+#+ include=FALSE
+# ########################################################################################################################
+# #' ## Protein-Protein Interaction Clusters using String
+#
+# ## see http://www.bioconductor.org/packages/release/bioc/vignettes/STRINGdb/inst/doc/STRINGdb.pdf
+#
+# #' We can use the string-db web service to look for protein-protein interactions.
+# #' This normally results in a large hairball and many unconnected nodes and so we can look for enriched clusters.
+# #' Here we plot using (high confidence interactions) enriched clusters with a node size greater than 5 proteins for our list of candidate genes showing up in at least two replicates
+#
+# #+ PPI plots , fig.width=30, fig.height=30, fig.retina=2, warning=FALSE, tidy=TRUE
+#
+# #taxTable <- list(ENSG=9606,  ENSMUSG=10090)
+# #taxId <- taxTable[[degs$gene_id[1] %>% ac() %>% str_extract("^([A-z]+)")]]
+# #
+# #require.auto(STRINGdb)
+# #
+# #string_db <- STRINGdb$new(score_threshold=400, species=taxId, input_directory="/local2/user_data/brandl/data/stringr") ## score threshold default is 400
+# #example1_mapped <- string_db$map( summary_allgids_df_sighit_gt1, "supplier_gene_symbol", species= removeUnmappedRows = TRUE )
+# #clustersList <- string_db$get_clusters(example1_mapped$STRING_id)
+# #for(network in clustersList)
+# #{
+# #	if(length(network)>5)
+# #	{
+# #		string_db$plot_network(network, add_link=FALSE)
+# #	}
+# #}
+#
+
+#' Created `r format(Sys.Date(), format="%B-%d-%Y")` by `r readLines(pipe('whoami'))`
\ No newline at end of file