initial commit of dge workflow

614e8317 · Holger Brandl · 614e8317 · 614e8317 · 614e8317
Commit 614e8317 authored 10 years ago by Holger Brandl
--- a/dge_workflow/README.md
+++ b/dge_workflow/README.md
+# RNA-Seq DGE Workflow
+
+http://stackoverflow.com/questions/600079/is-there-any-way-to-clone-a-git-repositorys-sub-directory-only
+http://stackoverflow.com/questions/160608/do-a-git-export-like-svn-export
+git checkout-index -a -f --prefix=/destination/path/
+
+
+
+## How to use it?
+
+git checkout-index -a -f --prefix=/destination/path/
+
+1) Optionally rename dge_master.sh to something like dge_mouse_helin.sh
+2) Adjust paths in master script
+3) Run on madmax
+4) Adjust reports if necessary
+
+
+## How to backport changes into it?
+
+Clone it, change it, push and commit it.
\ No newline at end of file
--- a/dge_workflow/fastqc_summary.R
+++ b/dge_workflow/fastqc_summary.R
+#' # Read Summary Report
+#+ echo=FALSE, message=FALSE
+
+## Note This script is supposed to be knitr::spin'ed
+
+devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
+devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
+
+argv = commandArgs(TRUE)
+if(length(argv) != 1){
+    stop("Usage: fastqc_summmary.R <directory with fastqc results>")
+}
+
+#baseDir="fastqc"
+#baseDir="/Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc"
+baseDir=argv[1]
+
+fastqDataFiles <- list.files(path=baseDir, pattern="^fastqc_data.txt", full.names=TRUE, recursive=T)
+
+
+#' ## Number of reads per run
+
+readCount <- function(statsFile){
+    data.frame(
+        run=trim_ext(basename(dirname(statsFile)), c("_fastqc")),
+        num_reads=as.numeric(readLines(pipe( paste("grep -F 'Total Sequences	' ", statsFile, " | cut -f2 -d'\t'") )))
+    )
+}
+
+readCounts <- fastqDataFiles %>% ldply(readCount)
+
+require.auto(scales)
+ggplot(readCounts, aes(run, num_reads)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=comma)
+
+
+### Create a faill/pass matrix
+
+readSummary <- function(statsFile){
+    # echo("reading", statsFile)
+    # statsFile="./fastqc/mouse_big_cyst_rep1_fastqc/summary.txt"
+
+    fileMapStats <- read.delim(statsFile, header=F) %>%
+        set_names(c("flag", "score", "run")) %>%
+        mutate(run=trim_ext(run, c(".fastq.gz", "fastq")))
+
+    fileMapStats
+}
+
+qcSummary <- list.files(path=baseDir, pattern="^summary.txt", full.names=TRUE, recursive=T) %>%  ldply(readSummary)
+
+
+qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
+    geom_tile() +
+    rotXlab() +
+    scale_fill_manual(values = c(fail="darkred", pass="darkgreen", warn="orange"))
+
+
+
+readBaseQualDist <- function(statsFile){
+    # statsFile="./fastqc/mouse_big_cyst_rep2_fastqc/fastqc_data.txt"
+    # statsFile="./fastqc/mouse_liver_polar_stage3_rep2_fastqc/fastqc_data.txt"
+    # grep -A30 -F '>>Per base sequence quality' /Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc/mouse_big_cyst_rep1_fastqc/fastqc_data.txt | grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '
+    echo("reading", statsFile)
+
+    baseStats <- read.delim(pipe(
+            paste("grep -A30 -F '>>Per base sequence quality' ", statsFile, "| grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '")
+        )) %>% mutate(
+            run=trim_ext(basename(dirname(statsFile)), c("_fastqc"))
+        )
+
+    baseStats %>% mutate(base_order=1:n())
+}
+
+baseQualities <- fastqDataFiles %>%  ldply(readBaseQualDist)
+
+#with(baseQualities, as.data.frame(table(run)))
+
+
+baseQualities %>% ggplot(aes(reorder(Base, base_order), Mean, group=run, color=run)) + geom_line() + scale_y_continuous(limits=c(2, 40))
+
+
+#' # Base Quality Distribution Summary
+
+runs <- with(baseQualities, as.data.frame(table(run)))
+
+## http://stackoverflow.com/questions/12518387/can-i-create-an-empty-ggplot2-plot-in-r
+baseQualities %>%
+    head(30) %>%
+    ggplot() +
+    geom_blank(mapping=aes(reorder(Base, base_order))) +
+    geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=20), data=runs, alpha=0.05, fill="red") +
+    geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=20, ymax=28), data=runs, alpha=0.05, fill=colors()[654]) +
+    geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=28, ymax=Inf), data=runs, alpha=0.05, fill="green") +
+    geom_boxplot(
+            mapping=aes(x=reorder(Base, base_order), ymin = X10th.Percentile, lower = Lower.Quartile , middle = Median, upper = Upper.Quartile , ymax =  X90th.Percentile),
+             stat = "identity"
+     ) +
+    facet_wrap(~run) +
+    theme(axis.text.x=element_blank()) +
+    scale_y_continuous(limits=c(2, 40)) +
+    xlab("")
+
+
--- a/dge_workflow/lsf_rna_seq.sh
+++ b/dge_workflow/lsf_rna_seq.sh
+## docs
+## http://blog.joncairns.com/2013/08/what-you-need-to-know-about-bash-functions/
+
+
+## create fastq report for all fastq and fastq.gz files in the current directory
+dge_fastqc(){
+
+while getopts "o:" curopt; do
+    case $curopt in
+    o) outputDir=$OPTARG;
+    esac
+done
+shift $(($OPTIND - 1))
+
+local fastqFiles=$*
+
+#if [ -z "$fastqFiles" ]; then
+if [ $# -lt 1 ]; then
+     echo "Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+" >&2 ; return;
+fi
+
+## use default directory if not specified
+if [ -z "$outputDir" ]; then
+     outputDir="fastqc_reports"
+fi
+
+if [ ! -d "$outputDir" ]; then
+    echo "creating output directory '$outputDir'"
+    mkdir $outputDir
+fi
+
+
+for fastqFile in $fastqFiles ; do
+    echo "fastqcing $fastqFile"
+
+    if [ ! -f $fastqFile ]; then
+        continue;
+    fi
+    
+
+    mysub "fastqc__$(basename $fastqFile)" "fastqc -j /sw/bin/java -o $outputDir -f fastq $fastqFile" -q medium  | joblist .fastqc_jobs
+done
+
+
+wait4jobs .fastqc_jobs
+
+mailme "fastqc done for $outputDir"
+
+ziprm fastqc_logs fastqc__*
+
+## create a small report
+source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/R/utils/spinr.sh 2>&1 2>/dev/null)
+spinr $(which fastqc_summary.R) $outputDir
+
+# todo create some summary report here
+}
+export -f dge_fastqc
+
+
+#http://wiki.bash-hackers.org/howto/getopts_tutorial
+dge_tophat_se(){
+
+# http://stackoverflow.com/questions/18414054/bash-getopts-reading-optarg-for-optional-flags
+echo $*
+
+while getopts "i:" curopt; do
+    case $curopt in
+    i) IGENOME="$OPTARG";
+    esac
+done
+shift $(($OPTIND - 1))
+
+
+local fastqFiles=$*
+
+echo fastq $fastqFiles
+echo igenomes "$IGENOME"
+
+if [ -z "$IGENOME" ] || [ -z "$fastqFiles" ];
+     then echo "Usage: dge_tophat_se -i <path to igenome> [<fastq.gz file>]+" >&2 ; return;
+fi
+
+
+
+export bowtie_gindex="$IGENOME/Sequence/Bowtie2Index/genome"
+export gtfFile="$IGENOME/Annotation/Genes/genes.gtf"
+#head $gtfFile
+
+if [ ! -f $gtfFile ]; then
+    >&2 echo "gtf '$gtfFile' does not exis"; return;
+fi
+
+if [ -z "$(which tophat)" ]; then
+    >&2 echo "no tomcat binary in PATH"; return;
+fi
+
+
+echo "running tophat using igenome '$IGENOME' for the following files"
+ll $fastqFiles
+
+#fastqFiles=$(ls $baseDir/treps_pooled/*fastq.gz)
+
+for fastqFile in $fastqFiles ; do
+    echo "submitting tophat job for $fastqFile"
+
+    # DEBUG fastqFile=/projects/bioinfo/holger/projects/eric/trimmed/a1_ca.fastq.gz
+    fastqBaseName=$(basename ${fastqFile%%.fastq.gz})
+    outputdir=$fastqBaseName
+
+    ## uniquely mapping reads only:   http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
+    mysub "${project}__tophat__${fastqBaseName}" "
+    tophat -p6  -G $gtfFile -g1 -o $outputdir $bowtie_gindex $fastqFile
+
+    mv $outputdir/accepted_hits.bam $outputdir/$(basename $outputdir).bam
+    samtools index $outputdir/$(basename $outputdir).bam
+    " -n 5 -R span[hosts=1] -q long | joblist .tophatjobs
+done
+
+wait4jobs .tophatjobs
+
+## create tophat mapping report
+source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/bash/bioinfo_utils.sh 2>&1 2>/dev/null)
+TophatMappingReport
+}
+export -f dge_tophat_se
+
+# dge_tophat_se
+# dge_tophat_se -i
+
+
+dge_cuffdiff(){
+
+local gtfFile=$1
+local bamDir=$2
+local labels=$3
+
+if [ $# -ne 3 ]; then
+ echo "Usage: dge_fastqc <gtf_file>  <bam directory> <labels>" >&2 ; return;
+fi
+
+if [ -z "$(which cuffdiff)" ]; then
+    >&2 echo "no cuffdiff binary in PATH"; return;
+fi
+
+if [ ! -f $gtfFile ]; then
+    >&2 echo "gtf '$gtfFile' does not exis"; return;
+fi
+
+## collect all bam-files and group them
+allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | sort)
+
+bamsSplit=""
+for label in $(echo $labels | tr ", " " "); do
+    echo $label
+    labelBams=$(echo  "$allBams" | grep $label | xargs echo -n | tr ' ' ',')
+    bamsSplit+=$labelBams" "
+done
+#echo $bamsSplit
+
+## make sure (again that all files are there
+echo $gtfFile $bamsSplit | tr "," " "  | xargs ls -la
+
+
+#mysub ${project}_cuffdiff "cuffdiff -L aRGm,aRGp,aRG,bRG,IPC,N -o . -p 8 mm10_ensGene_ccds.gtf $amBams $apBams $aBams $bBams $cBams $dBams  2>&1 | tee cuffdiff.log" -q long -n 8 -R span[hosts=1]   | blockScript
+
+cdCmd="cuffdiff -L $labels -o . -p 10 $gtfFile $bamsSplit"
+#echo "cmd is: $cdCmd"
+mysub ${project}__cuffdiff "$cdCmd"  -q long -n 8 -R span[hosts=1] | blockScript
+
+
+MakeCuffDB $gtfFile "NAN"
+
+}
\ No newline at end of file