cont. rna-seq workflow

b2966a0b · Holger Brandl · 625bd201 · b2966a0b · b2966a0b · b2966a0b
Commit b2966a0b authored 10 years ago by Holger Brandl
--- a/dge_workflow/bam_qc.R
+++ b/dge_workflow/bam_qc.R
@@ -73,6 +73,7 @@ ggplot(algnSummary, aes(condition, mapped_reads)) +
 #ggplot(melt(algnSummary), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma)  + theme(axis.text.x=element_text(angle=90, hjust=0))
 #ggsave2(w=10, h=10, p="mapstats")
+##todo multimapper counting
 ########################################################################################################################
 #' ## Alignment Correlation

--- a/dge_workflow/lsf_rna_seq.sh
+++ b/dge_workflow/lsf_rna_seq.sh
@@ -19,9 +19,9 @@ if [ $# -lt 1 ]; then
     echo "Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+" >&2 ; return;
 fi
-## use default directory if not specified
+## use current directory if not specified
 if [ -z "$outputDir" ]; then
-     outputDir="fastqc_reports"
+     outputDir="fastqc"
 fi
 if [ ! -d "$outputDir" ]; then
@@ -44,24 +44,41 @@ done
 wait4jobs .fastqc_jobs
-mailme "fastqc done for $outputDir"
+mailme "fastqc done for $outputDir"dge_tophat_se
 ziprm fastqc_logs fastqc__*
-## create a small report
-source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/R/utils/spinr.sh 2>&1 2>/dev/null)
-spinr $(which fastqc_summary.R) $outputDir
 # todo create some summary report here
+spin.R $DGE_HOME/fastqc_summary.R $outputDir
 }
 export -f dge_fastqc
+dge_cutadapt(){
+export Ill_ADAPTERS=/projects/bioinfo/common/illumina_universal_index.fasta
+for fastqFile in $* ; do
+    # DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/treps_pooled/mouse_liver_polar_stage1_rep4.fastq.gz
+    caFastq=$(basename $fastqFile .fastq.gz)_ca.fastq.gz
+    echo "cutadapting $caFastq into $caFastq"
+    #todo use a more specific trimming model (trim just correct part on each side without using reverse complements
+    mysub "$project__ca__$caFastq" "cutadapt -b file:$Ill_ADAPTERS -m 20 -q 25 -o $caFastq $fastqFile > $caFastq.ca.log"  -q long  | joblist .cajobs
+done
+wait4jobs .cajobs
+}
+export -f dge_cutadapt
 #http://wiki.bash-hackers.org/howto/getopts_tutorial
 dge_tophat_se(){
 # http://stackoverflow.com/questions/18414054/bash-getopts-reading-optarg-for-optional-flags
-echo $*
+#echo $*
 while getopts "i:" curopt; do
    case $curopt in
@@ -73,6 +90,7 @@ shift $(($OPTIND - 1))
 local fastqFiles=$*
+# IGENOME=/projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38
 echo fastq $fastqFiles
 echo igenomes "$IGENOME"
@@ -103,11 +121,12 @@ ll $fastqFiles
 for fastqFile in $fastqFiles ; do
    echo "submitting tophat job for $fastqFile"
-    # DEBUG fastqFile=/projects/bioinfo/holger/projects/eric/trimmed/a1_ca.fastq.gz
+    # DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/trimmed/mouse_big_cyst_rep4_ca.fastq.gz
    fastqBaseName=$(basename ${fastqFile%%.fastq.gz})
    outputdir=$fastqBaseName
    ## uniquely mapping reads only:   http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
+    ###     tophat -p6  -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile
    mysub "${project}__tophat__${fastqBaseName}" "
    tophat -p6  -G $gtfFile -g1 -o $outputdir $bowtie_gindex $fastqFile
@@ -119,8 +138,8 @@ done
 wait4jobs .tophatjobs
 ## create tophat mapping report
-source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/bash/bioinfo_utils.sh 2>&1 2>/dev/null)
+spin.R $DGE_HOME/bam_qc.R .
-TophatMappingReport
 }
 export -f dge_tophat_se
@@ -165,9 +184,32 @@ echo $gtfFile $bamsSplit | tr "," " "  | xargs ls -la
 cdCmd="cuffdiff -L $labels -o . -p 10 $gtfFile $bamsSplit"
 #echo "cmd is: $cdCmd"
-mysub ${project}__cuffdiff "$cdCmd"  -q long -n 8 -R span[hosts=1] | blockScript
+# eval $cdCmd
+mysub "${project}__cuffdiff" "$cdCmd"  -q long -n 4 -R span[hosts=1] | blockScript
+## create a cuffdiff db using cummerbund in a temporary directory to avoid locking problems
+tmpDbDir=$(mktemp -d)
+cp -r . $tmpDbDir
+## todo remove this hack
+genome=$(echo $gtfFile | cut -f7 -d'/'); echo "genome is $genome"
+R_LIBS=/tmp/r_index
+echo '
+require(cummeRbund)
+dbDir=commandArgs(T)[1]
+gtfFile=commandArgs(T)[2]
+genome=commandArgs(T)[3]
+## note without providing the gtf the db is much smaller
+readCufflinks(dir=dbDir, rebuild=T, gtf=gtfFile, genome=genome)
+' | R -q --no-save --no-restore --args $tmpDbDir $gtfFile $genome
+if [ ! -f $tmpDbDir/cuffData.db ]; then
+    >&2 echo "cummerbund failed to create cuffidff sqlite db"; return;
+fi
-MakeCuffDB $gtfFile "NAN"
+cp $tmpDbDir/cuffData.db .
+rm -rf $tmpDbDir ## because it's no longer needed
 }
\ No newline at end of file
+export -f dge_cuffdiff
--- a/dge_workflow/todo.txt
+++ b/dge_workflow/todo.txt
+- cuffdbs change dramatically in size if gtf is provided when building them, but what impact does it have on the results