star_align.sh

#!/usr/bin/env bash


#https://www.biostars.org/p/91020/

usage='
Use star to align fastq files against a genome
Usage: star_align.sh [options] <igenome> <fastq_files>...

Options:
--gtf <gtfFile>     Custom gtf file instead of igenome bundled copy
--pc-only           Use protein coding genes only for mapping and quantification
'


#eval $(echo "$usage" | ~/bin/docopts/docopts -h - -A dopts : "$@")
#echo "$usage" | ~/bin/docopts/docopts -h -  : "$@"
#echo "$usage" | ~/bin/docopts/docopts -h - : "hallo dfds"
#eval $(echo  $usage | ~/bin/docopts/docopts -h - : "$@")
#eval(exit 64)
eval "$(echo  "$usage" | ~/bin/docopts/docopts -h - : "$@")"

# v0.7 style
#eval $(echo  "$usage" | /home/brandl/bin/docopts_v0.7/docopts "hallo")

#eval "$(echo "$usage" | ~/bin/docopts/docopts -h -  : /projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38 /projects/bioinfo/holger/projects/florio_11b_2nd_batch/lanereps_pooled/arhgap11b_1.fastq.gz fastq2)"
#eval "$(echo "$usage" | ~/bin/docopts/docopts -h -  : --pc-only /projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38 /projects/bioinfo/holger/projects/florio_11b_2nd_batch/lanereps_pooled/arhgap11b_1.fastq.gz fastq2)"
#eval "$(echo "$usage" | ~/bin/docopts/docopts -h -  : /projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38 /projects/bioinfo/holger/projects/florio_11b_2nd_batch/lanereps_pooled/arhgap11b_1.fastq.gz fastq2)"


## extract all configuration parameters
fastqFiles=${fastq_files[@]}
export star_index="${igenome}/Sequence/StarIndex"
export gtf="${gtfFile}"


if [ -z "${gtfFile}" ]; then
    export gtfFile="$igenome/Annotation/Genes/genes.gtf"
fi


## check if gtf file exists
if [ ! -f $gtfFile ]; then
    >&2 echo "gtf '$gtfFile' does not exis"; exit 1;
fi


## make sure that STAR is in the PATH
if [ -z "$(which STAR)" ]; then
    >&2 echo "no STAR binary in PATH"; exit 1;
fi

## basic usage tutorial
#http://www.homolog.us/blogs/blog/2012/11/02/star-really-kick-ass-rna-seq-aligner/

## Check if STAR index is not present
if [ ! -f "${star_index}/SA" ]; then
    echo "Missing STAR for ${star_index}/SA; use 'dge_create_star_index ${igenome}' to create it"; exit 1;
#    dge_create_star_index ${igenome}'
fi


if [ "${pc_only}" -eq "true" ]; then
    echo "pc mode is not implemented yet. " >&2; exit 1
## use pc only gtf
### create a filtered gtf containing only protein coding ccds transcripts for quanitification
#gtfFilePC=mm10_igenomes_pc.gtf
#echo '
#require(biomaRt)
#require(dplyr)
#require(ggplot2)
#
##mart <- useDataset("hsapiens_gene_ensembl", mart = useMart("ensembl"))
#mart <- useDataset("mmusculus_gene_ensembl", mart = useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org"))
##ccdsTx <- getBM(attributes=c("ensembl_gene_id", "ensembl_transcript_id", "gene_biotype", "transcript_biotype"), filters=c("with_ccds"), values=TRUE, mart=mart)
#pcTx <- getBM(attributes=c("ensembl_gene_id", "ensembl_transcript_id", "gene_biotype", "transcript_biotype"),  mart=mart) %>% filter(transcript_biotype=="protein_coding")
#
##ggplot(pcTx, aes(gene_biotype)) + geom_bar() + coord_flip()
##ggplot(pcTx, aes(transcript_biotype)) + geom_bar() + coord_flip()
#
#write.table(with(pcTx, data.frame(ensembl_transcript_id)), col.names=F, file="mm10_pc_tx.txt",quote=F,row.names=F)
#' | R --no-save --no-restore
#
#grep -Ff mm10_pc_tx.txt $gtfFile > $gtfFilePC
#wc -l $gtfFile $gtfFilePC
fi


echo "running STAR using igenome '${igenome}' for the following files"


rm .starjobs

#fastqFiles=$(ls $baseDir/treps_pooled/*fastq.gz)

for fastqFile in $fastqFiles ; do
    echo "submitting STAR job for $fastqFile"

    # DEBUG fastqFile=/projects/bioinfo/holger/projects/florio_11b_2nd_batch/lanereps_pooled/arhgap11b_1.fastq.gz
    fastqBaseName=$(basename ${fastqFile%%.fastq.gz})
    outputdir=$fastqBaseName

    ## uniquely mapping reads only:   http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
    ###     tophat -p6  -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile

    ## params:
    # --outSAMstrandField intronMotif is required for cuffdiff compatiblity (xs flag)
    # --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout to get rid of artifical junctions
    # --quantMode GeneCounts see https://groups.google.com/forum/#!searchin/rna-star/GeneCounts/rna-star/gZRJx3ElRNo/p5FjBYKuY00J
    # --outSJfilterCountUniqueMin see https://groups.google.com/forum/#!topic/rna-star/_1BeAlGUmpA

    jl submit -j .starjobs -q medium -t 5 -n "${project}__star__${fastqBaseName}" "
    STAR --genomeDir $star_index --readFilesIn $fastqFile --runThreadN 6 --readFilesCommand zcat --outFileNamePrefix ${fastqBaseName}. --outSAMtype BAM SortedByCoordinate --outSAMstrandField intronMotif --sjdbGTFfile $gtfFile --outFilterIntronMotifs   RemoveNoncanonicalUnannotated --outFilterType BySJout --quantMode GeneCounts --outFilterMultimapNmax 1 --outSJfilterCountUniqueMin 8 3 3 3
    mv ${fastqBaseName}.Aligned.sortedByCoord.out.bam ${fastqBaseName}.bam
    samtools index ${fastqBaseName}.bam
    "
done


jl wait .starjobs

if [ -n "$(jl failed .starjobs)" ]; then
    echo "some jobs failed"; exit 1
fi

jl status --report .starjobs


rm -rf *STARgenome *.Log.progress.out _STARtmp *.SJ.out.tab *Log.out
#ziprm star_outlogs *Log.out


## Test for bam correlation (but don't wait for the results)
dge_bam_correlate . &


## estimate expresssion with http://bioinf.wehi.edu.au/featureCounts/
##Summarize multiple datasets at the same time:
#featureCounts -t exon -g gene_id -a annotation.gtf -o counts.txt library1.bam library2.bam library3.bam

## FeatureCounts --> Commented out because now part of STAR
#bamFile=control_3.bam
#featureCounts -t exon -g gene_id -a ${gtfFile} -o feature_counts.txt ${bamFile} -T 5
#mysub "${project}__feature_counts" "featureCounts -t exon -g gene_id -a ${gtfFile} -o gene_counts.txt $(ls *bam) -T 5" -q medium | blockScript

## create tophat mapping report
spin.R ${NGS_TOOLS}/dge_workflow/star_qc.R .


## Condense counts into matrix
# cd /projects/bioinfo/holger/projects/dye_rnaseq/star/mapped
dge_star_counts2matrix

mailme "${project}: STAR done in $(pwd)"