From b2966a0b2e5f574bba0dcd3bb4b358221cdcbba8 Mon Sep 17 00:00:00 2001
From: Holger Brandl <brandl@mpi-cbg.de>
Date: Wed, 26 Nov 2014 09:23:32 +0100
Subject: [PATCH] cont. rna-seq workflow

---
 dge_workflow/bam_qc.R       |  1 +
 dge_workflow/lsf_rna_seq.sh | 72 +++++++++++++++++++++++++++++--------
 dge_workflow/todo.txt       |  1 +
 3 files changed, 59 insertions(+), 15 deletions(-)
 create mode 100755 dge_workflow/todo.txt

diff --git a/dge_workflow/bam_qc.R b/dge_workflow/bam_qc.R
index 222bb7b..9f790ac 100755
--- a/dge_workflow/bam_qc.R
+++ b/dge_workflow/bam_qc.R
@@ -73,6 +73,7 @@ ggplot(algnSummary, aes(condition, mapped_reads)) +
 #ggplot(melt(algnSummary), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma)  + theme(axis.text.x=element_text(angle=90, hjust=0))
 #ggsave2(w=10, h=10, p="mapstats")
 
+##todo multimapper counting
 
 ########################################################################################################################
 #' ## Alignment Correlation
diff --git a/dge_workflow/lsf_rna_seq.sh b/dge_workflow/lsf_rna_seq.sh
index af02ade..8addcae 100755
--- a/dge_workflow/lsf_rna_seq.sh
+++ b/dge_workflow/lsf_rna_seq.sh
@@ -19,9 +19,9 @@ if [ $# -lt 1 ]; then
      echo "Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+" >&2 ; return;
 fi
 
-## use default directory if not specified
+## use current directory if not specified
 if [ -z "$outputDir" ]; then
-     outputDir="fastqc_reports"
+     outputDir="fastqc"
 fi
 
 if [ ! -d "$outputDir" ]; then
@@ -44,24 +44,41 @@ done
 
 wait4jobs .fastqc_jobs
 
-mailme "fastqc done for $outputDir"
+mailme "fastqc done for $outputDir"dge_tophat_se
 
 ziprm fastqc_logs fastqc__*
 
-## create a small report
-source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/R/utils/spinr.sh 2>&1 2>/dev/null)
-spinr $(which fastqc_summary.R) $outputDir
-
 # todo create some summary report here
+spin.R $DGE_HOME/fastqc_summary.R $outputDir
+
 }
 export -f dge_fastqc
 
 
+dge_cutadapt(){
+
+export Ill_ADAPTERS=/projects/bioinfo/common/illumina_universal_index.fasta
+
+for fastqFile in $* ; do
+    # DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/treps_pooled/mouse_liver_polar_stage1_rep4.fastq.gz
+    caFastq=$(basename $fastqFile .fastq.gz)_ca.fastq.gz
+    echo "cutadapting $caFastq into $caFastq"
+
+    #todo use a more specific trimming model (trim just correct part on each side without using reverse complements
+    mysub "$project__ca__$caFastq" "cutadapt -b file:$Ill_ADAPTERS -m 20 -q 25 -o $caFastq $fastqFile > $caFastq.ca.log"  -q long  | joblist .cajobs
+done
+
+wait4jobs .cajobs
+
+}
+export -f dge_cutadapt
+
+
 #http://wiki.bash-hackers.org/howto/getopts_tutorial
 dge_tophat_se(){
 
 # http://stackoverflow.com/questions/18414054/bash-getopts-reading-optarg-for-optional-flags
-echo $*
+#echo $*
 
 while getopts "i:" curopt; do
     case $curopt in
@@ -73,6 +90,7 @@ shift $(($OPTIND - 1))
 
 local fastqFiles=$*
 
+# IGENOME=/projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38
 echo fastq $fastqFiles
 echo igenomes "$IGENOME"
 
@@ -103,11 +121,12 @@ ll $fastqFiles
 for fastqFile in $fastqFiles ; do
     echo "submitting tophat job for $fastqFile"
 
-    # DEBUG fastqFile=/projects/bioinfo/holger/projects/eric/trimmed/a1_ca.fastq.gz
+    # DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/trimmed/mouse_big_cyst_rep4_ca.fastq.gz
     fastqBaseName=$(basename ${fastqFile%%.fastq.gz})
     outputdir=$fastqBaseName
 
     ## uniquely mapping reads only:   http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
+    ###     tophat -p6  -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile
     mysub "${project}__tophat__${fastqBaseName}" "
     tophat -p6  -G $gtfFile -g1 -o $outputdir $bowtie_gindex $fastqFile
 
@@ -119,8 +138,8 @@ done
 wait4jobs .tophatjobs
 
 ## create tophat mapping report
-source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/bash/bioinfo_utils.sh 2>&1 2>/dev/null)
-TophatMappingReport
+spin.R $DGE_HOME/bam_qc.R .
+
 }
 export -f dge_tophat_se
 
@@ -165,9 +184,32 @@ echo $gtfFile $bamsSplit | tr "," " "  | xargs ls -la
 
 cdCmd="cuffdiff -L $labels -o . -p 10 $gtfFile $bamsSplit"
 #echo "cmd is: $cdCmd"
-mysub ${project}__cuffdiff "$cdCmd"  -q long -n 8 -R span[hosts=1] | blockScript
-
+# eval $cdCmd
+mysub "${project}__cuffdiff" "$cdCmd"  -q long -n 4 -R span[hosts=1] | blockScript
+
+
+## create a cuffdiff db using cummerbund in a temporary directory to avoid locking problems
+tmpDbDir=$(mktemp -d)
+cp -r . $tmpDbDir
+
+## todo remove this hack
+genome=$(echo $gtfFile | cut -f7 -d'/'); echo "genome is $genome"
+R_LIBS=/tmp/r_index
+echo '
+require(cummeRbund)
+dbDir=commandArgs(T)[1]
+gtfFile=commandArgs(T)[2]
+genome=commandArgs(T)[3]
+## note without providing the gtf the db is much smaller
+readCufflinks(dir=dbDir, rebuild=T, gtf=gtfFile, genome=genome)
+' | R -q --no-save --no-restore --args $tmpDbDir $gtfFile $genome
+
+if [ ! -f $tmpDbDir/cuffData.db ]; then
+    >&2 echo "cummerbund failed to create cuffidff sqlite db"; return;
+fi
 
-MakeCuffDB $gtfFile "NAN"
+cp $tmpDbDir/cuffData.db .
+rm -rf $tmpDbDir ## because it's no longer needed
 
-}
\ No newline at end of file
+}
+export -f dge_cuffdiff
diff --git a/dge_workflow/todo.txt b/dge_workflow/todo.txt
new file mode 100755
index 0000000..66f0ff3
--- /dev/null
+++ b/dge_workflow/todo.txt
@@ -0,0 +1 @@
+- cuffdbs change dramatically in size if gtf is provided when building them, but what impact does it have on the results
-- 
GitLab