From e4d97c24043ab20fc205a55c6819da7c80384fbe Mon Sep 17 00:00:00 2001
From: Holger Brandl <brandl@mpi-cbg.de>
Date: Mon, 8 Dec 2014 18:05:06 +0100
Subject: [PATCH] cont. dge analysis

---
 dge_workflow/dge_analysis.R | 46 +++++++++++++++++++++++++++++--------
 dge_workflow/dge_utils.sh   |  1 +
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/dge_workflow/dge_analysis.R b/dge_workflow/dge_analysis.R
index c042602..3740226 100755
--- a/dge_workflow/dge_analysis.R
+++ b/dge_workflow/dge_analysis.R
@@ -10,19 +10,26 @@ Usage: dge_analysis.R [options] <cuffdb_directory>
 Options:
 --constrasts=<tab_delim_table> Table with sample pairs for which dge analysis should be performed
 --undirected           Perform directed dge-steps in a directed manner
+--qcutoff <qcutoff>    Use a q-value cutoff of 0.01 instead of a q-value cutoff [default: 0.01]
+--pcutoff <pcutoff>    Override q-value filter and filter by p-value instead
 -S                     Dont do general statistics and qc analysis
 '
 
 #opts <- docopt(doc, commandArgs(TRUE)) ## does not work when spining
-#opts <- docopt(doc, "--undirected ..")
+# Stefania: opts <- docopt(doc, "--undirected --pcutoff 0.05  ..")
+# opts <- docopt(doc, "--undirected  ..")
 # opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), ".."))
 
 opts <- docopt(doc, paste(Sys.getenv("DGE_PARAMS"), paste(commandArgs(TRUE), collapse=" ")))
-# opts
+#opts
 
 skipQC <<- opts$S
 split_hit_list <- !opts$undirected
 constrasts_file <- opts$constrasts
+pcutoff <- if(is.null(opts$pcutoff)) NULL else as.numeric(opts$pcutoff)
+qcutoff <- if(is.numeric(pcutoff)) NULL else as.numeric(opts$qcutoff)
+
+stopifnot(is.numeric(qcutoff) | is.numeric(pcutoff))
 
 cuffdb_directory <- normalizePath(opts$cuffdb_directory)
 
@@ -61,6 +68,8 @@ cuff <- readCufflinks(dir=tmpdir)
 cuff
 runInfo(cuff)
 replicateInfo <- replicates(cuff) %>% mutate(file=basename(file)) %>% select(-c(total_mass, norm_mass, internal_scale, external_scale))
+replicateInfo
+
 write.delim(replicateInfo, file="replicateInfo.txt")
 # replicateInfo <- read.delim("replicateInfo.txt")
 #'  [replicateInfo](replicateInfo.txt)
@@ -82,6 +91,7 @@ if(!file.exists(cacheFile)){
     mousemart = useDataset(martName, mart=useMart("ensembl"))
     geneInfo <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name', 'description', 'gene_biotype'), mart=mousemart);
     save(geneInfo, file=cacheFile)
+    unloadNameSpace(biomaRt)
 }else{
     geneInfo <- local(get(load(cacheFile)))
 }
@@ -184,9 +194,9 @@ noTextLayer <- function(gg){ gg$layers[[2]] <- NULL;; gg}
 
 
 ## todo use fix scaled here
-noTextLayer(csDistHeat(genes(cuff)) +  ggtitle("mouse zone correlation"))
+noTextLayer(csDistHeat(genes(cuff)) +  ggtitle("Sample correlation"))
 
-noTextLayer(csDistHeat(genes(cuff),replicates=T) +  ggtitle("mouse replicate correlation")) #+ scale_fill_gradientn(colours=c(low='lightyellow',mid='orange',high='darkred'), limits=c(0,0.15))
+noTextLayer(csDistHeat(genes(cuff),replicates=T) +  ggtitle("Replicate Correlation")) #+ scale_fill_gradientn(colours=c(low='lightyellow',mid='orange',high='darkred'), limits=c(0,0.15))
 
 csDendro(genes(cuff),replicates=T)
 
@@ -226,18 +236,27 @@ allDiff <- diffData(genes(cuff)) %>%
     mutate(sample_1_overex=log2_fold_change<0)
 
 
+## ' Used cutoff criterion was `r if(use_pcutoff) echo("p_value<0.01") else echo("q_value<=0.01")`
 
+#+ results='asis'
+if(!is.null(qcutoff)){
+    echo("Using q-value cutoff of ", qcutoff)
+    allDiff <- transform(allDiff, isHit=q_value<=qcutoff)
+}else{
+    echo("Using p-value cutoff of ", pcutoff)
+    allDiff <- transform(allDiff, isHit=p_value<=pcutoff)
+}
 
-##' Used cutoff criterion was: p_value<0.01
-#allDiff <- transform(allDiff, isHit=p_value<=0.01)
 
-#' Used cutoff criterion was: q_value<0.01
-allDiff <- transform(allDiff, isHit=q_value<0.01)
+#+
+# #' Used cutoff criterion was: q_value<0.01
+# allDiff <- transform(allDiff, isHit=q_value<0.01)
 
 
 degs <- subset(allDiff, isHit)
 #degs <- subset(allDiff, significant=="yes")
 
+
 with(degs, as.data.frame(table(sample_1, sample_2, sample_1_overex))) %>% filter(Freq >0)
 
 ## add gene info
@@ -278,7 +297,7 @@ ontologies <- c(
     "GOTERM_BP_FAT"
 )
 
-#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=',')`
+#' This analysis was performed using [David](http://david.abcc.ncifcrf.gov/). The following ontologies were tested: `r paste(ontologies, collapse=', ')`
 
 require(RDAVIDWebService) ## just works if installed on non-network-drive (e.g. /tmp/)
 
@@ -317,7 +336,14 @@ davidAnnotationChart <- function(someGenes){ ## expexted to have a column with g
 }
 
 
-enrResults <- degs %>% group_by(sample_1, sample_2, sample_1_overex) %>% do(davidAnnotationChart(.$ensembl_gene_id))
+
+grpdDegs <- if(split_hit_list){
+    degs %>% group_by(sample_1, sample_2, sample_1_overex)
+}else{
+    degs %>% group_by(sample_1, sample_2)
+}
+
+enrResults <- grpdDegs %>% do(davidAnnotationChart(.$ensembl_gene_id))
 
 write.delim(enrResults, file="enrResults.txt")
 # enrResults <- read.delim("enrResults.txt")
diff --git a/dge_workflow/dge_utils.sh b/dge_workflow/dge_utils.sh
index 646ca00..09789b0 100755
--- a/dge_workflow/dge_utils.sh
+++ b/dge_workflow/dge_utils.sh
@@ -209,6 +209,7 @@ fi
 allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | sort)
 ## todo use optional argument here --> default "unmapped" --> allow user to add others
 #allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "1029" | sort)
+#allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "Ctrl_1029" | sort)
 
 bamsSplit=""
 for label in $(echo $labels | tr ", " " "); do
-- 
GitLab