cont better dge report

2bc38c7e · Holger Brandl · f9f0b9cd · 2bc38c7e
Commit 2bc38c7e authored 10 years ago by Holger Brandl
--- a/dge_workflow/dge_analysis_new.R
+++ b/dge_workflow/dge_analysis_new.R
@@ -5,22 +5,22 @@ suppressMessages(require(docopt))

 doc <- '
 Convert a cuffdiff into a dge-report
-Usage: dge_analysis.R [options] <cuffdb_directory>
+Usage: cuffdiff_report.R [options] <cuffdb_directory>

 Options:
 --constrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
--directed           Perform directed dge-steps in a directed manner
 --qcutoff <qcutoff>    Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
 --pcutoff <pcutoff>    Override q-value filter and filter by p-value instead
 --minfpkm <minfpkm>   Minimal expression in any of the sample to be included in the final result list [default: 1]
 '
+#--directed           Perform directed dge-steps in a directed manner
 #-S                     Dont do general statistics and qc analysis

 #print("args ars")
 #print(commandArgs(TRUE))

 opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
-# florio_11b: setwd("/lustre/projects/bioinfo/holger/projects/florio_11b/cuffdiff/dge_report_new"); opts <- docopt(doc, "--constrasts contrasts.txt --pcutoff 0.01 /projects/bioinfo/holger/projects/florio_11b/cuffdiff")
+## florio_11b: setwd("/lustre/projects/bioinfo/holger/projects/florio_11b/cuffdiff/dge_report_new"); opts <- docopt(doc, "--constrasts contrasts.txt --pcutoff 0.01 /projects/bioinfo/holger/projects/florio_11b/cuffdiff")
 # opts <- docopt(doc, "--undirected  ..")
 # opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), ".."))
 ## todo use commandArgs here!!
@@ -28,10 +28,12 @@ opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
 #print(opts)

 skipQC <<- opts$S
-split_hit_list <- !opts$undirected
+#split_hit_list <- opts$directed
 constrasts_file <- opts$constrasts
 pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff)
 qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff)
+if(is.numeric(pcutoff)) opts$qcutoff <- NULL
+
 minFPKM <- as.numeric(opts$minfpkm)


@@ -45,9 +47,9 @@ if(is.na(file.info(cuffdb_directory)$isdir)){
 }


-devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.5/R/core_commons.R")
-devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.5/R/ggplot_commons.R")
-devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.5/R/bio/diffex_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.7/R/core_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.7/R/ggplot_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.7/R/bio/diffex_commons.R")

 #+ results='asis', echo=FALSE
 cat('
@@ -73,22 +75,24 @@ geneInfo <- quote({
    biomaRt::getBM(c('ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'), mart=mart)
 }) %>% cache_it("gene_info")

+geneLoci <- quote({
+    martName <- guess_mart(fpkm(genes(cuff))$gene_id)
+    mart <- biomaRt::useDataset(martName, mart = biomaRt::useMart("ensembl"))
+    biomaRt::getBM(c("ensembl_gene_id", "chromosome_name", "start_position", "end_position"), mart=mart)
+}) %>% cache_it("gene_loci")
 #knitr::opts_knit$set(root.dir = getwd())

+#' Used DGE Analysis Paramters
+vec2df(unlist(opts)) %>% filter(!str_detect(name, "^[<-]")) %>% kable()
+
+
 ########################################################################################################################
 #' # Differential Gene Expression Analysis

-print("Configuration")
-vec2df(unlist(opts)) %>% filter(!str_detect(name, "^[<-]")) %>% kable()
-
 #' [CuffDiff](http://cufflinks.cbcb.umd.edu/manual.html) and [cummeRbund](http://compbio.mit.edu/cummeRbund/) were used to perform this analysis. For details about how the test for differntial expression works, refer to [Differential analysis of gene regulation at transcript resolution with RNA-seq](http://www.nature.com/nbt/journal/v31/n1/full/nbt.2450.html).

-#print(paste("db dir is", cuffdb_directory))
-
-#' Used cuffdiff database in `r cuffdb_directory`

 #+ load_db, cache=FALSE, warning=FALSE
-
 isCluster=Sys.getenv("LSF_SERVERDIR")!=""
 dbTempCopyRequired=isCluster | str_detect(normalizePath(cuffdb_directory), fixed("/Volumes/projects"))

@@ -101,37 +105,42 @@ if(dbTempCopyRequired){
    cuff <- readCufflinks(dir="..")
 }

-
+#' The expression tracking reference contained:
 cuff
+
+#' ### Samples
 runInfo(cuff) %>% filter(param!="cmd_line") %>% kable()
 #' cuffdiff cmd was
-#' `r runInfo(cuff)$cmd_line`
+#+ results='asis'
+cat((runInfo(cuff) %>% filter(param=="cmd_line"))$value %>% gsub('(.{1,80})(,|$|\\s)', '\\1\n', .))

+#+
 replicateInfo <- replicates(cuff) %>%
    mutate(file=basename(file)) %>%
    select(-c(total_mass, norm_mass, internal_scale, external_scale))

-replicateInfo %>% kable("html", table.attr = "class='dtable'", escape=F)
+replicateInfo %>% kable() # kable("html", table.attr = "class='dtable'", escape=F)

 write.delim(replicateInfo, file="replicate_info.txt")
 # replicateInfo <- read.delim("replicate_info.txt")
 #'  [replicateInfo](replicate_info.txt)

+#+ results='asis'
 ## define or load the contrasts
 dbContrasts <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct()
 if(!is.null(constrasts_file)){
-    echo("using contrasts matrix from: ", basename(constrasts_file))
+    echo("Using contrasts matrix from: ", basename(constrasts_file))

    contrasts <- read.csv(constrasts_file, header=F) %>% set_names(c("sample_1", "sample_2"))

    ## filter for the direction present in cuffdb
    contrasts <- rbind_list(contrasts, contrasts %>% select(sample_1=sample_2, sample_2=sample_1)) %>%  inner_join(dbContrasts)
 }else{
-    echo("comparing all samples against each other")
+    echo("Comparing all samples against each other")
    contrasts <- diffData(genes(cuff)) %>% select(sample_1, sample_2) %>% distinct()
 }

-#' The contrasts are
+#' The used contrasts model is
 contrasts %>% kable()


@@ -140,20 +149,22 @@ contrasts %>% kable()

 #' All data has been annotated with ensembl gene information and was exprorted into . Both normalized (FPKM) and raw counts were exported.

+#+ results='asis'
 fpkmMatrix(genes(cuff)) %>%
    rownames2column("ensembl_gene_id") %>%
-    merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
-    print_head() %>%
+    merge(geneInfo, by="ensembl_gene_id", all.x=T) %T>%
+    head_html() %>%
    write.delim(file="gene_fpkms.txt")
 # gene_fpkms <- read.delim("gene_fpkms.txt")
 #' [Normalized Counts](gene_fpkms.txt)

 ## export the same but now including replicate information

+#+ results='asis'
 repFpkmMatrix(genes(cuff)) %>%
    rownames2column("ensembl_gene_id") %>%
    merge(geneInfo, by="ensembl_gene_id", all.x=T) %>%
-    print_head() %>%
+    head_html() %>%
    write.delim(file="gene_fpkms_by_replicate.txt")
 # gene_fpkms_by_replicate <- read.delim("gene_fpkms_by_replicate.txt")
 #' [Normalized Counts by Replicate](gene_fpkms_by_replicate.txt)
@@ -196,38 +207,37 @@ csBoxplot(genes(cuff),replicates=T)

 csScatterMatrix(genes(cuff))

+#csVolcano(genes(cuff),"a", "b")
+#' For the volcano plot matrix we calculate fold-changes as x/y (aka column/row). For details see [here](http://rpackages.ianhowson.com/bioc/cummeRbund/man/csVolcano.html)
+csVolcanoMatrix(genes(cuff))

+#' We also plot more detailed scatter plots for just the contrasts for interest. Signifcance coloring is based on cuffdiff's internal score an not our hit criterion.
 #csVolcano(genes(cuff),"hESC","Fibroblasts")
 csVolcPlots <- alply(contrasts, 1,  splat(function(sample_1, sample_2) {
    csVolcano(genes(cuff),sample_1,sample_2)
 }))
-#' fig.width=16
+
+#+ fig.width=16
 multiplot(plotlist=csVolcPlots, cols=min(3, length(csVolcPlots)))

-#' We also plot more detailed scatter plots for contrasts for interest
 #csScatter(genes(cuff),"hESC","Fibroblasts",smooth=T)
-
 #csScatter(genes(cuff), "a", "d", smooth=F) + aes(color=abs(log2(a/b))>2) + scale_colour_manual(values=c("darkgreen", "red"))

 #MAplot(genes(cuff), "a", "d")+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: a/d")
 #MAplot(genes(cuff), "a", "b", useCount=T) #+ aes(color=abs(log2(M))>2)+ ggtitle("MA-plot: Luc/KD")

-
 #csVolcano(genes(cuff),"aRGm", "N") + ggtitle("odd line-shaped fan-out in volcano plot")

 ## todo consider to add more pretty volcano plots (see stefania's extra plots
 #getGenes(cuff,c("ENSMUSG00000097853")) %>%  expressionBarplot()

-#csVolcano(genes(cuff),"a", "b")
-## todo replace or complement with marta plot
-csVolcanoMatrix(genes(cuff))
-#csVolcano(genes(cuff),"a", "b")

 #######################################################################################################################
 #' ## Replicate Correlation and Clustering

+#' For most type of experiments we expect replicates to cluster together. Second similar biological conditions should cluster together. As clustering results depend on the method being used, we try a few here to check if we get consistent results.

-## Map replicates into 2-dimensional space to highlight replicate clusters (or the lack of them)
+#' Map replicates into 2-dimensional space to highlight replicate clusters (or the lack of them)
 require.auto(edgeR)
 plotMDS(repFpkmMatrix(genes(cuff)), top=500, main="top500 MDS")

@@ -274,14 +284,16 @@ plot(res, main="Replicate Clustering of Expressed Genes")
 #######################################################################################################################
 #' ## Differentially expressed genes

+#' Here we are using [cuffdiff](http://cole-trapnell-lab.github.io/cufflinks/cuffdiff) to test for differential expression. Estimate variance-mean dependence in count data from high-throughput sequencing assays and test for differential expression based on a model using the negative binomial distribution.
+
 allDiff <- diffData(genes(cuff)) %>%
    ## just keep pairs of interest
    merge(contrasts) %>%

    ##  discard all genes that are not expressed in any of the cell types (>1)
    filter(gene_id %in% getExpressedGenes(cuff, minFPKM=minFPKM)) %>%
-    dplyr::rename(ensembl_gene_id=gene_id) %>%
-    mutate(sample_1_overex=log2_fold_change<0)
+    dplyr::rename(ensembl_gene_id=gene_id, s2_over_s1_log2fc=log2_fold_change) %>%
+    mutate(sample_1_overex=s2_over_s1_log2fc<0)

 #+ results='asis'
 if(!is.null(qcutoff)){
@@ -295,9 +307,9 @@ if(!is.null(qcutoff)){
 merge(allDiff, geneInfo, by="ensembl_gene_id", all.x=T) %>% write.delim(file="diff_data.txt")
 # diff_data <- read.delim("diff_data.txt")
 #'  [Unfiltered CuffDiff Results](diff_data.txt)
+#

-
-#' Filter for expressed genes
+# Filter for DEGs (differentially expressed genes)
 degs <- allDiff %>%
    ## Filter for expressed genes
    filter(is_hit) %>%
@@ -306,28 +318,82 @@ degs <- allDiff %>%
    left_join(geneInfo, by="ensembl_gene_id")


-
+#'  DEGs (differentially expressed genes) are contained in the following table
 write.delim(degs, file="degs.txt")
 # degs <- read.delim("degs.txt")
 #' [Differentially Expressed Genes](degs.txt)

+#' ### DEG Counts
+#with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0) %>% kable()
+degs %>% dplyr::count( sample_1, sample_2, sample_1_overex) %>% kable()
+
+
+#+ fig.height=nrow(contrasts)+2
 ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=status)) + geom_bar() +xlab(NULL) + ylab("# DGEs") +ggtitle("DEGs by contrast") + coord_flip()
 ggplot(degs, aes(paste(sample_1, "vs",  sample_2), fill=sample_1_overex)) + geom_bar(position="dodge") + xlab(NULL) + ylab("# DGEs") + ggtitle("DEGs by direction") + coord_flip()


-#' Hit Browser:
-#+ results='as.is'
+#' DEGs can be browsed in Excel using the exported table or via the embedded table browser. To enable the IGV links, you need to set the port in the IGV preferences to 3334.
+#+ results='asis'
 degs %>%
-    mutate(ensembl=paste0("<a href='http://www.ensembl.org/Search/Results?facet_species=Elephant;page=1;facet_feature_type=Gene;q=",ensembl_gene_id, "'>",ensembl_gene_id, "</a>")) %>%
-    transmute(sample_1, sample_2, value_1, value_2, s2_over_s1_logfc=log2_fold_change, p_value, q_value, ensembl, external_gene_name, description, gene_biotype) %>%
+    inner_join(geneLoci) %>%
+    mutate(
+        ensembl=paste0("<a href='http://www.ensembl.org/Multi/Results?page=1;facet_feature_type=Gene;q=",ensembl_gene_id, "'>",ensembl_gene_id, "</a>"),
+        igv=paste0("<a href='http://localhost:3334/goto?locus=", chromosome_name,":", start_position, "-", end_position, "'>IGV</a>")
+    ) %>%
+    select(sample_1, sample_2, value_1, value_2, s2_over_s1_log2fc, p_value, q_value, ensembl, external_gene_name, description, gene_biotype, igv) %>%
    kable("html", table.attr = "class='dtable'", escape=F)

-with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0) %>% kable()
-
 ## just needed to restore environment easily
 save(degs, file=".degs.RData")
 # degs <- local(get(load(".degs.RData")))

+#' Also redo volcano plots with out hit overlays
+
+#+ fig.width=16, fig.height=14
+csVolcPlots <- alply(contrasts, 1,  splat(function(sample_1, sample_2) {
+    csVolcano(genes(cuff),sample_1,sample_2)
+ }))
+
+maPlots <- allDiff %>% group_by(sample_1, sample_2) %>% do(gg={ maData <-.
+    ## todo why not s2_over_s1_log2fc
+    maData %>% ggplot(aes(0.5*log2(value_1*value_2), log2(value_1/value_2), color=is_hit)) +
+        geom_point(alpha=0.3) +
+        geom_hline(yintercept=0, color="red") +
+        ggtitle(with(maData[1,], paste(sample_1, "vs", sample_2)))
+}) %$% gg
+
+
+#+ fig.width=16
+multiplot(plotlist=maPlots, cols=min(2, length(maPlots)))
+
+
+##Volcano plots
+hitCounts <- degs %>%
+        dplyr::count(sample_1, sample_2, sample_1_overex) %>%
+        dplyr::rename(hits=n) %>%
+        merge(data.frame(sample_1_overex=c(T,F), x_pos=c(-3,3)))
+
+#' For the volcano plot matrix we calculate fold-changes as x/y (aka column/row). For details see [here](http://rpackages.ianhowson.com/bioc/cummeRbund/man/csVolcano.html)
+#+ fig.width=16, fig.height=14
+allDiff %>% ggplot(aes(s2_over_s1_log2fc, -log10(p_value), color=is_hit)) +
+#    geom_jitter(alpha=0.3, position = position_jitter(height = 0.2)) +
+    geom_point(alpha=0.3) +
+#    theme_bw() +
+    xlim(-3.5,3.5) +
+    scale_color_manual(values = c("TRUE"="red", "FALSE"="black")) +
+#    ggtitle("Insm1/Ctrl") +
+    ## http://stackoverflow.com/questions/19764968/remove-point-transparency-in-ggplot2-legend
+    guides(colour = guide_legend(override.aes = list(alpha=1))) +
+
+    ## tweak axis labels
+    xlab(expression(log[2]("x/y"))) +
+    ylab(expression(-log[10]("p value"))) +
+
+    ## add hit couts
+    geom_text(aes(label=hits, x=x_pos), y=2, color="red", size=10, data=hitCounts) +
+    facet_grid(sample_1 ~ sample_2)
+

 ########################################################################################################################
 #' ## Term enrichment
@@ -346,15 +412,17 @@ if(nrow(contrasts)<3){
    )
 }

-#+ eval=F
-enrResults <- geneLists %>% group_by(list_id) %>% do(davidAnnotationChart(.$ensembl_gene_id))
+#+ eval=T
+enrResults <- quote(geneLists %>% group_by(list_id) %>% do(davidAnnotationChart(.$ensembl_gene_id))) %>%
+    cache_it(paste0("enrdata_", digest(geneLists)))

 write.delim(enrResults, file="enrResults.txt")
 # enrResults <- read.delim("enrResults.txt")
 #'  [Enrichment Results](enrResults.txt)

+#' Because David is not too strigent by default we extract just those terms for which Bonferroni<0.01
 sigEnrResults <- subset(enrResults, Bonferroni <0.01)
-#' results='as.is'
+#' results='asis'
 sigEnrResults %>% kable("html", table.attr = "class='dtable'", escape=F)

 write.delim(sigEnrResults, file="sigEnrResults.txt")
@@ -372,33 +440,49 @@ write.delim(sigEnrResults, file="sigEnrResults.txt")
 #	    print(logPlot)
 #})

-sigEnrResults %>% do({
+#+ include=FALSE
+term_category_colors <- create_palette(unique(ac(sigEnrResults$Category)))
+
+term_barplot_files <- sigEnrResults %>% do({
    enrResultsGrp <- .
    ## DEBUG enrResultsGrp <- sigEnrResults
    label <- enrResultsGrp$list_id[1]
-#    dplyr::select(enrResultsGrp, matches(ifelse(split_hit_list, "sample_1|sample_2|sample_1_overex", "sample_1|sample_2"))) %>%
-#      head(1) %>% apply(1, paste, collapse="_vs_")
-
-    echo("processing", label)

+#    echo("processing", label)
    logPlot <- enrResultsGrp %>%
        ## fix factor order
-        mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(Category))) %>%
+        mutate(Term=reorder(Term, -PValue) %>% reorder(as.integer(as.factor(Category)))) %>%
        ggplot(aes(Term, PValue, fill=Category)) +
-	    geom_bar(stat="identity")+
-	    coord_flip() +
-	    xlab("Enriched Terms") +
-	    ggtitle(label) +
-	    scale_y_log10()
-
-	    ggsave(paste0(label, " enrichmed terms.pdf"))
-	    print(logPlot)
+        geom_bar(stat="identity")+
+        scale_fill_manual(values = term_category_colors, drop=F, name="Ontology") +
+        coord_flip() +
+        xlab("Enriched Terms") +
+        ggtitle(label) +
+        scale_y_log10()
+
+    fileNameLabel <- label %>%
+        str_replace_all("!=", "ne") %>%
+        str_replace_all(">", "gt") %>%
+        str_replace_all("<", "lt") %>%
+        str_replace_all(" ", "_")
+
+#        ggsave(paste0("enrichmed_terms__", fileNameLabel, ".pdf"))
+#        print(logPlot)
+
+     tmpPng <- paste0("enrterms__", fileNameLabel, ".png")
+     ggsave(tmpPng, logPlot, width=10, height = round(2+nrow(enrResultsGrp)/4), limitsize=FALSE)
+     data.frame(file=tmpPng)
 })

+#+ results="asis"
+l_ply(term_barplot_files$file, function(pngFile){ cat(paste0("<img src='", pngFile, "'><br>"))})
+

 ########################################################################################################################
 #' ## Enriched KEGG Pathways

+#' To understand spatio-temporal changes in gene expression better we now overlay
+
 keggPathways <- sigEnrResults %>%
    filter(Category=="KEGG_PATHWAY") %>%
    separate(Term, c('kegg_pathway_id', 'pathway_description'), sep="\\:", remove=F) %>%
@@ -420,7 +504,7 @@ sliceData <- allDiff %>%
        ## define slice labels (needed for casting)
        comparison=paste0(sample_1, "__vs__", sample_2),
        ## create normalized linear score for slice plotting
-        sample_1_overex=log2_fold_change<0,
+        sample_1_overex=s2_over_s1_log2fc<0,
        plot_score=-log10(q_value)*ifelse(sample_1_overex, 1, -1)
    ) %>%
    dcast(ensembl_gene_id ~ comparison, value.var="plot_score") %>%
@@ -434,7 +518,7 @@ data.frame(set=names(sliceData)) %>% mutate(slice_index=row_number()) %>% kable(
 plot_pathway <- function(pathwayID, overlayData){
 #    pathwayID="mmu04015"
 #    browser()
-    echo("processing pathway", pathwayID)
+#    echo("processing pathway", pathwayID)
    pv.out <- pathview(
            gene.data = overlayData,
            pathway.id = pathwayID,