cont. rna-seq workflow

4407c214 · Holger Brandl · 9f749a60 · 4407c214 · 4407c214 · 4407c214
Commit 4407c214 authored 10 years ago by Holger Brandl
--- a/dge_workflow/bam_qc.R
+++ b/dge_workflow/bam_qc.R
+#!/usr/bin/env Rscript
+#+ echo=FALSE, message=FALSE
+
+suppressMessages(library(docopt))
+
+# retrieve and parse the command-line arguments
+doc <- '
+Create a small summary for bam-files in a directory
+Usage: bam_qc.R <base_directory>
+
+Options:
+-c        Peform correlation analysis
+'
+
+#print(paste("args are:", commandArgs(TRUE)))
+opts <- docopt(doc, commandArgs(TRUE))
+#opts <- docopt(doc, ". .")
+
+if(!exists("opts")){ stop(doc); return }
+#print("doc opts are")
+#print(opts)
+
+baseDir <- normalizePath(opts$base_directory)
+
+if(is.na(file.info(baseDir)$isdir)){
+    stop(paste("base directory", baseDir, "does not exist"))
+}
+
+devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
+devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
+
+
+########################################################################################################################
+#' # Mapping Summary for: `r baseDir`
+
+parseAlgnSummary_T2_0_11 <- function(alignSummary){
+    #alignSummary="/projects/bioinfo/holger/projects/marta_rnaseq/human_leipzig/mapping/S5382_aRG_1b_rep1/align_summary.txt"
+    algnData <- readLines(alignSummary)
+
+    data.frame(
+        condition=basename(dirname(alignSummary)),
+        num_reads=as.numeric(str_match(algnData[2], " ([0-9]*$)")[,2]),
+        mapped_reads=as.numeric(str_match(algnData[3], ":[ ]*([0-9]*) ")[,2][1])
+    ) %>% transform(mapping_efficiency=100*mapped_reads/num_reads)
+}
+
+
+algnSummary <- ldply(list.files(".", "align_summary.txt", full.names=TRUE, recursive=T), parseAlgnSummary_T2_0_11)
+write.delim(algnSummary, file="tophat_mapping_stats.txt")
+# algnSummary <- read.delim("algnSummary.txt")
+#'  [Tophat Mapping Statistics](tophat_mapping_stats.txt)
+
+scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }
+
+
+ggplot(algnSummary, aes(condition, mapping_efficiency)) +
+    geom_bar(stat="identity") +
+    coord_flip() +
+    ylim(0,100) +
+    ggtitle("mapping efficiency")
+
+ggplot(algnSummary, aes(condition, num_reads)) +
+    geom_bar(stat="identity") +
+    coord_flip() +
+    ggtitle("read counts") +scale_y_continuous(labels=comma)
+
+ggplot(algnSummary, aes(condition, mapped_reads)) +
+    geom_bar(stat="identity") + coord_flip() +
+    ggtitle("alignments counts") +
+    scale_y_continuous(labels=comma)
+
+
+#ggplot(melt(algnSummary), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma)  + theme(axis.text.x=element_text(angle=90, hjust=0))
+#ggsave2(w=10, h=10, p="mapstats")
+
+
+########################################################################################################################
+#' ## Alignment Correlation
+
+#' Without using any transcriptome as reference, the genome can be binned and alignment counts per bin can be used to perform a correlation analysis.
+#' Used tool (deepTools)[https://github.com/fidelram/deepTools]
+
+### todo integrate bam correlation analysis
\ No newline at end of file
--- a/dge_workflow/dge_analysis.R
+++ b/dge_workflow/dge_analysis.R
@@ -38,17 +38,20 @@ Usage: dge_analysis.R <report_name>

 Options:
 --cuffdir <DIR>        Cache results [default: .]
+--dge-pairs <dgepairs> Txt files with sample pairs for which dge analysis should be performed
+-S                     Dont do general statistics and qc analysis
 '

 #opts <- docopt(doc)
 opts <- docopt(doc, "test_report")

-cuffdb_directory=normalizePath(opts$cuffdir)

-mcdir(opts$report_name)
+print("options were:")
+print(opts)

-# print(opts)
+cuffdb_directory <- normalizePath(opts$cuffdir)

+mcdir(opts$report_name)

 if(is.na(file.info(cuffdb_directory)$isdir)){
    stop(paste("db directory does not exist", doc))
@@ -70,8 +73,7 @@ cuff <- readCufflinks(dir=tmpdir)

 cuff
 runInfo(cuff)
-replicates(cuff)
-
+replicates(cuff) %>% mutate(file=basename(file)) %>% select(-c(total_mass, norm_mass, internal_scale, external_scale))


 #######################################################################################################################
@@ -108,7 +110,10 @@ csVolcanoMatrix(genes(cuff))
 require.auto(edgeR)
 plotMDS(repFpkmMatrix(genes(cuff)), top=500, main="top500 MDS")

-MDSplot(genes(cuff)) + ggtitle("mds plot")
+
+if(unlen(replicates(cuff)$sample)>2){  ## >2 only, because MDS will fail with just 2 samples
+    MDSplot(genes(cuff)) + ggtitle("mds plot")
+}

 MDSplot(genes(cuff), replicates=T) + ggtitle("mds plot")

@@ -128,7 +133,9 @@ csDendro(genes(cuff),replicates=T)


 #######################################################################################################################
-#' ## Extract and save tables
+#' ## Raw data tables
+
+## merge in basic gene info

 #xls(runInfo(cuff))
 #setwd("/Volumes/project-zeigerer/Rab5_NGS/results/")
@@ -136,17 +143,25 @@ csDendro(genes(cuff),replicates=T)
 geneFpkm<-fpkm(genes(cuff))
 write.delim(geneFpkm, file="geneFpkm.txt")
 # gene.fpkm <- read.delim("gene.fpkm.txt")
+#'  [FPKMs](geneFpkm.txt)

-geneCounts<-count(genes(cuff))
-write.delim(geneCounts, file="geneCounts.txt")
-geneCgene.matrix<-fpkmMatrix(genes(cuff))

 repGeneFPKMs <- repFpkmMatrix(genes(cuff))
-save(repGeneFPKMs, file="repGeneFPKMs.RData")
 write.delim(repGeneFPKMs, file="repGeneFPKMs.txt")
 # repGeneFPKMs <- read.delim("repGeneFPKMs.txt")
 # repGeneFPKMs <- local(get(load("repGeneFPKMs.RData")))
+#'  [FPKMs by Replicate](repGeneFPKMs.txt)
+
+geneCounts<-count(genes(cuff))
+max(geneCounts$count)
+write.delim(geneCounts, file="geneCounts.txt")
+geneCgene.matrix<-fpkmMatrix(genes(cuff))
+#'  [Raw Counts](geneCounts.txt)

+repCounts <- countMatrix(genes(cuff))
+write.delim(repCounts, file="repCounts.txt")
+# repCounts <- read.delim("repCounts.txt")
+#'  [Raw Counts by Replicate](repCounts.txt)


 #######################################################################################################################

--- a/dge_workflow/dge_master.sh
+++ b/dge_workflow/dge_master.sh
@@ -26,7 +26,7 @@ export PATH=/home/brandl/bin/cufflinks-2.2.1.Linux_x86_64:$PATH


 #fastqFiles=$(ls /projects/bioinfo/holger/projects/helin/all_trep_pooled/dog_*)
-fastqFiles=<<TBD>>
+fastqFiles=<<<<TBD>>>>
 #ll $fastqFiles

 ## do some qc for the reads
@@ -39,18 +39,22 @@ mailme "$project: fastqc done"


 ########################################################################################################################
-### Map the data and do cuffdiff
+### Align the reads

 mcdir $baseDir/mapped

 #igenome=/projects/bioinfo/igenomes/Canis_familiaris/Ensembl/CanFam3.1
-igenome=<<TBD>>
+igenome=<<<<TBD>>>>


 dge_tophat_se -i $igenome $fastqFiles 2>&1 | tee mapped.log

 mailme "$project: mapping done"

+########################################################################################################################
+#### Basic Alginment QC and technical replicate grouping
+
+##TBD

 ########################################################################################################################
 ### Do the differential expression analysis
@@ -61,7 +65,7 @@ gtfFile=$igenome/Annotation/Genes/genes.gtf

 ## define labels to split bam files into replicate groups
 #labels="big_cyst,small_cyst"
-labels=<<TBD>>
+labels=<<<<TBD>>>>

 dge_cuffdiff $gtfFile $baseDir/mapped $labels
 MakeCuffDB $gtfFile "NAN"

--- a/misc/spin.R
+++ b/misc/spin.R
 #!/usr/bin/env Rscript

+# similar http://stackoverflow.com/questions/10943695/what-is-the-knitr-equivalent-of-r-cmd-sweave-myfile-rnw
+
 #http://stackoverflow.com/questions/3433603/parsing-command-line-arguments-in-r-scripts
 #https://github.com/edwindj/docopt.R
 #http://www.slideshare.net/EdwindeJonge1/docopt-user2014
@@ -23,10 +25,10 @@ Options:
 '
 #docopt(doc, "-w test  a b c ")$"-w"

-opts <- docopt(doc)
-#print(opts)
+spin_opts <- docopt(doc)
+#print(spin_opts)

-r_script <- opts$r_script
+r_script <- spin_opts$r_script

 if(!file.exists(r_script)){
     stop(paste("file does not exist\n", doc))
@@ -42,9 +44,11 @@ options(width=150)
 #https://groups.google.com/forum/#!topic/knitr/ojcnq5Nm298

 ## better table css: http://www.stat.ubc.ca/~jenny/STAT545A/topic10_tablesCSS.html
-commandArgs <- function(trailingOnly = FALSE){ return(opts$script_args) }
-print("trimmed args are")
-print(commandArgs())
+commandArgs <- function(trailingOnly = FALSE){ return(as.character(spin_opts$script_args)) }
+#print("trimmed args are")
+#print(commandArgs())
+#print("end args")
+
 spin(r_script, knit=F)

 mdScript <- str_replace(r_script, "[.]R$", ".Rmd")
@@ -63,10 +67,10 @@ cssHeader='

 ## custom title http://stackoverflow.com/questions/14124022/setting-html-meta-elements-with-knitr
 opts_chunk$set(
-    cache = opts$"-c",
-    message= opts$"-m",
-    warning= opts$"-w",
-    echo= opts$"-e",
+    cache = spin_opts$"-c",
+    message= spin_opts$"-m",
+    warning= spin_opts$"-w",
+    echo= spin_opts$"-e",
    fig.width=15,
    width=200
 )