continued star wrapper

80340742 · Holger Brandl · 3c89c2c6 · 80340742 · 80340742 · 80340742
Commit 80340742 authored 9 years ago by Holger Brandl
--- a/dge_workflow/dge_master_template.sh
+++ b/dge_workflow/dge_master_template.sh
@@ -124,7 +124,7 @@ mailme "$project: mapping done"


 #########################################################################################################################
-#### Basic Alginment QC and technical replicate grouping
+#### Technical replicate grouping
 #
 #mcdir $baseDir/trep_pooled
 #

--- a/dge_workflow/dge_utils.sh
+++ b/dge_workflow/dge_utils.sh
@@ -12,6 +12,7 @@ source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.10/R/s
 ## todo tweak this to work on bioinfo as well
 export PATH=/projects/bioinfo/holger/bin/bowtie2-2.2.2:$PATH
 export PATH=/projects/bioinfo/holger/bin/tophat-2.0.13.Linux_x86_64:$PATH
+export PATH=/home/brandl/bin/STAR/STAR-STAR_2.4.1d/source:$PATH
 export PATH=/home/brandl/bin/cufflinks-2.2.1.Linux_x86_64:$PATH
 export PATH=/sw/apps/python/current/bin:$PATH
 export PATH=/home/brandl/bin/deepTools/bin:$PATH
@@ -157,7 +158,7 @@ for fastqFile in $fastqFiles ; do

    mv $outputdir/accepted_hits.bam $outputdir/$(basename $outputdir).bam
    samtools index $outputdir/$(basename $outputdir).bam
-    " -n 5 -R span[hosts=1] -q long | joblist .tophatjobs
+    " -n 5 -R span[hosts=1] -q medium | joblist .tophatjobs
 done

 wait4jobs .tophatjobs

--- a/dge_workflow/star_align.sh
+++ b/dge_workflow/star_align.sh
@@ -5,60 +5,64 @@

 usage='
 Use star to align fastq files against a genome
-Usage: spin.R <igenome> <fastq_files>...
+Usage: star_align.sh <igenome> <fastq_files>...

 Options:
 '
-c        Cache results
+#-c        Cache results

-#eval $(echo "$usage" | ~/bin/docopts/docopts -h - -A dopts : asdf asdf.fastq)
-eval $(echo "$usage" | ~/bin/docopts/docopts -h -  : asdf asdf.fastq another.fastq)
-for fastqFile in ; do
-    echo processing $fastqFile
-done
+eval $(echo "$usage" | ~/bin/docopts/docopts -h - -A dopts : "$@")
+#eval $(echo "$usage" | ~/bin/docopts/docopts -h -  : /projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38 /projects/bioinfo/holger/projects/florio_11b_2nd_batch/lanereps_pooled/arhgap11b_1.fastq.gz)

-echo $igenome
+#for fastqFile in ${fastq_files[@]} ; do
+#    echo processing $fastqFile
+#done
+#
+#echo $igenome


-gtfFile=$1
-bamDir=$2
-labels=$3
+## build index if not present

-echo "${!dopts[@]}"
-echo "${dopts[-c]}"
-echo "${dopts[igenome]}"


-#http://www.artificialworlds.net/blog/2012/10/17/bash-associative-array-examples/
-#args=$(echo "$usage" | ~/bin/docopts/docopts -h -  : "test.R")
+export star_index="$igenome/Sequence/StarIndex/genome"
+export gtfFile="$igenome/Annotation/Genes/genes.gtf"
+head $gtfFile

-## this will define one environment variable per parameter (c,e,m,v)
-eval $(echo "$usage" | ~/bin/docopts/docopts -h -  : -e test.R)
+if [ ! -f $gtfFile ]; then
+    >&2 echo "gtf '$gtfFile' does not exis"; exit 1;
+fi

-fastqFiles=${fastq_files[@]}
+## basic 
+#http://www.homolog.us/blogs/blog/2012/11/02/star-really-kick-ass-rna-seq-aligner/

-# IGENOME=/projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38
+if ! -d "${igenome}/Sequence/StarIndex" ]; then

-if [ -z "$IGENOME" ] || [ -z "$fastqFiles" ];
-     then echo "Usage: dge_tophat_se -i <path to igenome> [<fastq.gz file>]+" >&2 ; return;
-fi
+mailme "${project}: creating STAR index for $igenome"
+mkdir ${igenome}/Sequence/StarIndex

+cmd="STAR --runMode genomeGenerate --genomeDir ${igenome}/Sequence/StarIndex --genomeFastaFiles ${igenome}/Sequence/WholeGenomeFasta/genome.fa --runThreadN 10"
+#eval $cmd
+#STAR --runMode genomeGenerate --genomeDir ${igenome}/Sequence/StarIndex --genomeFastaFiles ${igenome}/Sequence/Chromosomes/*.fa --runThreadN 10
+mysub "${project}_star_index" "$cmd" -n 5 -R span[hosts=1] -q medium  | blockScript
+
+## prevent modification
+chmod -R -w ${igenome}/Sequence/StarIndex
+
+fi


-export bowtie_gindex="$IGENOME/Sequence/Bowtie2Index/genome"
-export gtfFile="$IGENOME/Annotation/Genes/genes.gtf"
-#head $gtfFile

 if [ ! -f $gtfFile ]; then
-    >&2 echo "gtf '$gtfFile' does not exis"; return;
+    >&2 echo "gtf '$gtfFile' does not exis"; exit 1;
 fi

-if [ -z "$(which tophat)" ]; then
-    >&2 echo "no tomcat binary in PATH"; return;
+if [ -z "$(which STAR)" ]; then
+    >&2 echo "no tomcat binary in PATH"; exit 1;
 fi


-echo "running tophat using igenome '$IGENOME' for the following files"
+echo "running STAR using igenome '${igenome}' for the following files"
 ll $fastqFiles

 #fastqFiles=$(ls $baseDir/treps_pooled/*fastq.gz)
@@ -72,11 +76,11 @@ for fastqFile in $fastqFiles ; do

    ## uniquely mapping reads only:   http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
    ###     tophat -p6  -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile
-    mysub "${project}__tophat__${fastqBaseName}" "
-    tophat -p6  -G $gtfFile -g1 -o $outputdir $bowtie_gindex $fastqFile

-    mv $outputdir/accepted_hits.bam $outputdir/$(basename $outputdir).bam
-    samtools index $outputdir/$(basename $outputdir).bam
+    ## note --outSAMstrandField intronMotif is required for cuffdiff compatiblity (xs flag)
+    mysub "${project}__star__${fastqBaseName}" "
+#    tophat -p6  -G $gtfFile -g1 -o $outputdir $bowtie_gindex $fastqFile
+    STAR --genomeDir $star_index --readFilesIn /path/to/read1 $fastqFile --runThreadN 6 --outFileNamePrefix $fastqBaseName --outSAMtype BAM SortedByCoordinate --outSAMstrandField intronMotif --sjdbGTFfile
    " -n 5 -R span[hosts=1] -q long | joblist .tophatjobs
 done

@@ -87,6 +91,7 @@ wait4jobs .tophatjobs
 dge_bam_correlate .

 ## create tophat mapping report
-spin.R ${NGS_TOOLS}/dge_workflow/tophat_qc.R .
+## todo adjust report to STAR
+#spin.R ${NGS_TOOLS}/dge_workflow/tophat_qc.R .

-mailme "$project: tophat done in $(pwd)"
\ No newline at end of file
+mailme "$project: STAR done in $(pwd)"
\ No newline at end of file