From b2966a0b2e5f574bba0dcd3bb4b358221cdcbba8 Mon Sep 17 00:00:00 2001 From: Holger Brandl <brandl@mpi-cbg.de> Date: Wed, 26 Nov 2014 09:23:32 +0100 Subject: [PATCH] cont. rna-seq workflow --- dge_workflow/bam_qc.R | 1 + dge_workflow/lsf_rna_seq.sh | 72 +++++++++++++++++++++++++++++-------- dge_workflow/todo.txt | 1 + 3 files changed, 59 insertions(+), 15 deletions(-) create mode 100755 dge_workflow/todo.txt diff --git a/dge_workflow/bam_qc.R b/dge_workflow/bam_qc.R index 222bb7b..9f790ac 100755 --- a/dge_workflow/bam_qc.R +++ b/dge_workflow/bam_qc.R @@ -73,6 +73,7 @@ ggplot(algnSummary, aes(condition, mapped_reads)) + #ggplot(melt(algnSummary), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma) + theme(axis.text.x=element_text(angle=90, hjust=0)) #ggsave2(w=10, h=10, p="mapstats") +##todo multimapper counting ######################################################################################################################## #' ## Alignment Correlation diff --git a/dge_workflow/lsf_rna_seq.sh b/dge_workflow/lsf_rna_seq.sh index af02ade..8addcae 100755 --- a/dge_workflow/lsf_rna_seq.sh +++ b/dge_workflow/lsf_rna_seq.sh @@ -19,9 +19,9 @@ if [ $# -lt 1 ]; then echo "Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+" >&2 ; return; fi -## use default directory if not specified +## use current directory if not specified if [ -z "$outputDir" ]; then - outputDir="fastqc_reports" + outputDir="fastqc" fi if [ ! -d "$outputDir" ]; then @@ -44,24 +44,41 @@ done wait4jobs .fastqc_jobs -mailme "fastqc done for $outputDir" +mailme "fastqc done for $outputDir"dge_tophat_se ziprm fastqc_logs fastqc__* -## create a small report -source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/R/utils/spinr.sh 2>&1 2>/dev/null) -spinr $(which fastqc_summary.R) $outputDir - # todo create some summary report here +spin.R $DGE_HOME/fastqc_summary.R $outputDir + } export -f dge_fastqc +dge_cutadapt(){ + +export Ill_ADAPTERS=/projects/bioinfo/common/illumina_universal_index.fasta + +for fastqFile in $* ; do + # DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/treps_pooled/mouse_liver_polar_stage1_rep4.fastq.gz + caFastq=$(basename $fastqFile .fastq.gz)_ca.fastq.gz + echo "cutadapting $caFastq into $caFastq" + + #todo use a more specific trimming model (trim just correct part on each side without using reverse complements + mysub "$project__ca__$caFastq" "cutadapt -b file:$Ill_ADAPTERS -m 20 -q 25 -o $caFastq $fastqFile > $caFastq.ca.log" -q long | joblist .cajobs +done + +wait4jobs .cajobs + +} +export -f dge_cutadapt + + #http://wiki.bash-hackers.org/howto/getopts_tutorial dge_tophat_se(){ # http://stackoverflow.com/questions/18414054/bash-getopts-reading-optarg-for-optional-flags -echo $* +#echo $* while getopts "i:" curopt; do case $curopt in @@ -73,6 +90,7 @@ shift $(($OPTIND - 1)) local fastqFiles=$* +# IGENOME=/projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38 echo fastq $fastqFiles echo igenomes "$IGENOME" @@ -103,11 +121,12 @@ ll $fastqFiles for fastqFile in $fastqFiles ; do echo "submitting tophat job for $fastqFile" - # DEBUG fastqFile=/projects/bioinfo/holger/projects/eric/trimmed/a1_ca.fastq.gz + # DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/trimmed/mouse_big_cyst_rep4_ca.fastq.gz fastqBaseName=$(basename ${fastqFile%%.fastq.gz}) outputdir=$fastqBaseName ## uniquely mapping reads only: http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464 + ### tophat -p6 -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile mysub "${project}__tophat__${fastqBaseName}" " tophat -p6 -G $gtfFile -g1 -o $outputdir $bowtie_gindex $fastqFile @@ -119,8 +138,8 @@ done wait4jobs .tophatjobs ## create tophat mapping report -source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/bash/bioinfo_utils.sh 2>&1 2>/dev/null) -TophatMappingReport +spin.R $DGE_HOME/bam_qc.R . + } export -f dge_tophat_se @@ -165,9 +184,32 @@ echo $gtfFile $bamsSplit | tr "," " " | xargs ls -la cdCmd="cuffdiff -L $labels -o . -p 10 $gtfFile $bamsSplit" #echo "cmd is: $cdCmd" -mysub ${project}__cuffdiff "$cdCmd" -q long -n 8 -R span[hosts=1] | blockScript - +# eval $cdCmd +mysub "${project}__cuffdiff" "$cdCmd" -q long -n 4 -R span[hosts=1] | blockScript + + +## create a cuffdiff db using cummerbund in a temporary directory to avoid locking problems +tmpDbDir=$(mktemp -d) +cp -r . $tmpDbDir + +## todo remove this hack +genome=$(echo $gtfFile | cut -f7 -d'/'); echo "genome is $genome" +R_LIBS=/tmp/r_index +echo ' +require(cummeRbund) +dbDir=commandArgs(T)[1] +gtfFile=commandArgs(T)[2] +genome=commandArgs(T)[3] +## note without providing the gtf the db is much smaller +readCufflinks(dir=dbDir, rebuild=T, gtf=gtfFile, genome=genome) +' | R -q --no-save --no-restore --args $tmpDbDir $gtfFile $genome + +if [ ! -f $tmpDbDir/cuffData.db ]; then + >&2 echo "cummerbund failed to create cuffidff sqlite db"; return; +fi -MakeCuffDB $gtfFile "NAN" +cp $tmpDbDir/cuffData.db . +rm -rf $tmpDbDir ## because it's no longer needed -} \ No newline at end of file +} +export -f dge_cuffdiff diff --git a/dge_workflow/todo.txt b/dge_workflow/todo.txt new file mode 100755 index 0000000..66f0ff3 --- /dev/null +++ b/dge_workflow/todo.txt @@ -0,0 +1 @@ +- cuffdbs change dramatically in size if gtf is provided when building them, but what impact does it have on the results -- GitLab