## (removed "Newer"/"Older" web-scrape navigation artifacts — they are not shell code)
## docs
## http://blog.joncairns.com/2013/08/what-you-need-to-know-about-bash-functions/

## Source shared cluster helpers (mysub, joblist, wait4jobs, ziprm, blockScript,
## mailme are expected to come from these) straight from a remote URL.
## NOTE(review): '2>&1 2>/dev/null' first duplicates stderr onto stdout and then
## re-points stderr to /dev/null, so the net effect is just '2>/dev/null';
## 'curl -s' would be the clearer spelling.
source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/bash/lsf_utils.sh 2>&1 2>/dev/null)
source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/R/utils/spinr.sh 2>&1 2>/dev/null)

## Prepend pinned tool installations (bowtie2, tophat, cufflinks, python, deepTools)
## so they win over any system-wide versions.
export PATH=/projects/bioinfo/holger/bin/bowtie2-2.2.2:$PATH
export PATH=/projects/bioinfo/holger/bin/tophat-2.0.13.Linux_x86_64:$PATH
export PATH=/home/brandl/bin/cufflinks-2.2.1.Linux_x86_64:$PATH
export PATH=/sw/apps/python/current/bin:$PATH
export PATH=/home/brandl/bin/deepTools/bin:$PATH
export R_LIBS=/tmp/r_index ## export to make sure that packages are loaded from the local repository, otherwise sqlite won't work
## Create a fastqc report for the given fastq/fastq.gz files.
##
## Usage:   dge_fastqc [-o <output_directory>] [<fastq.gz file>]+
## Globals: submits jobs via mysub/joblist and blocks on wait4jobs (.fastqc_jobs)
## Returns: 1 on usage error, otherwise the status of wait4jobs.
dge_fastqc(){
    ## BUGFIX: make getopts state local so the function works on repeated calls
    ## in a sourced shell; original also had a stray unmatched 'fi' (syntax error).
    local OPTIND OPTARG curopt
    local outputDir="."   ## default to cwd; original left it empty -> mkdir ""

    while getopts "o:" curopt; do
        case $curopt in
            o) outputDir=$OPTARG;;
        esac
    done
    shift $((OPTIND - 1))

    local fastqFiles=$*

    if [ $# -lt 1 ]; then
        echo "Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+" >&2
        return 1
    fi

    if [ ! -d "$outputDir" ]; then
        echo "creating output directory '$outputDir'"
        mkdir -p "$outputDir"
    fi

    local fastqFile
    for fastqFile in $fastqFiles ; do
        ## silently skipping was confusing; warn instead (still non-fatal)
        if [ ! -f "$fastqFile" ]; then
            echo "skipping missing file '$fastqFile'" >&2
            continue
        fi

        echo "fastqcing $fastqFile"
        mysub "fastqc__$(basename "$fastqFile")" "fastqc -j /sw/bin/java -o $outputDir -f fastq $fastqFile" -q medium | joblist .fastqc_jobs
    done

    wait4jobs .fastqc_jobs
}
export -f dge_fastqc
## Adapter- and quality-trim the given fastq.gz files with cutadapt.
## Each <name>.fastq.gz is trimmed into <name>_ca.fastq.gz in the cwd.
##
## Usage:   dge_cutadapt [<fastq.gz file>]+
## Globals: exports Ill_ADAPTERS; submits jobs via mysub/joblist (.cajobs);
##          uses $project for job naming.
dge_cutadapt(){
    export Ill_ADAPTERS=/projects/bioinfo/common/illumina_universal_index.fasta

    if [ $# -lt 1 ]; then
        echo "Usage: dge_cutadapt [<fastq.gz file>]+" >&2
        return 1
    fi

    local fastqFile caFastq
    for fastqFile in "$@" ; do
        caFastq=$(basename "$fastqFile" .fastq.gz)_ca.fastq.gz

        ## BUGFIX: message printed the output name twice; report input -> output
        echo "cutadapting $fastqFile into $caFastq"

        ## todo use a more specific trimming model (trim just the correct part
        ## on each side without using reverse complements)
        ## BUGFIX: was "$project__ca__..." which expands the (undefined) variable
        ## 'project__ca__'; ${project} braces are required here.
        mysub "${project}__ca__$(basename "$fastqFile" .fastq.gz)" "cutadapt -b file:$Ill_ADAPTERS -m 20 -q 25 -o $caFastq $fastqFile" -q long | joblist .cajobs
        ## todo do a small report here about what has been trimmed away and why
    done
}
export -f dge_cutadapt
#http://wiki.bash-hackers.org/howto/getopts_tutorial
## Align the given single-end fastq.gz files against an igenome with tophat,
## rename and index the per-sample bam, then run dge_bam_correlate on the cwd.
##
## Usage:   dge_tophat_se -i <path to igenome> [<fastq.gz file>]+
## Globals: exports bowtie_gindex and gtfFile; uses $project for job naming;
##          submits via mysub/joblist, blocks on wait4jobs (.tophatjobs).
## Returns: 1 on usage/validation error.
dge_tophat_se(){
    # http://stackoverflow.com/questions/18414054/bash-getopts-reading-optarg-for-optional-flags
    ## local getopts state so repeated calls in a sourced shell re-parse correctly
    local OPTIND OPTARG curopt
    while getopts "i:" curopt; do
        case $curopt in
            i) IGENOME="$OPTARG";;
        esac
    done
    shift $((OPTIND - 1))

    local fastqFiles=$*
    # IGENOME=/projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38

    if [ -z "$IGENOME" ] || [ -z "$fastqFiles" ]; then
        echo "Usage: dge_tophat_se -i <path to igenome> [<fastq.gz file>]+" >&2
        return 1
    fi

    export bowtie_gindex="$IGENOME/Sequence/Bowtie2Index/genome"
    export gtfFile="$IGENOME/Annotation/Genes/genes.gtf"

    if [ ! -f "$gtfFile" ]; then
        ## BUGFIX: message said "does not exis"
        >&2 echo "gtf '$gtfFile' does not exist"
        return 1
    fi

    ## BUGFIX: message said "no tomcat binary"; also use 'command -v' over 'which'
    if ! command -v tophat >/dev/null; then
        >&2 echo "no tophat binary in PATH"
        return 1
    fi

    echo "running tophat using igenome '$IGENOME' for the following files"
    ll $fastqFiles

    local fastqFile fastqBaseName outputdir
    for fastqFile in $fastqFiles ; do
        echo "submitting tophat job for $fastqFile"

        fastqBaseName=$(basename "${fastqFile%%.fastq.gz}")
        outputdir=$fastqBaseName

        ## uniquely mapping reads only: http://seqanswers.com/forums/showthread.php?t=3464
        ## tophat writes accepted_hits.bam; rename it after the sample and index it
        mysub "${project}__tophat__${fastqBaseName}" "
tophat -p6 -G $gtfFile -g1 -o $outputdir $bowtie_gindex $fastqFile
mv $outputdir/accepted_hits.bam $outputdir/$(basename $outputdir).bam
samtools index $outputdir/$(basename $outputdir).bam
" -n 5 -R span[hosts=1] -q long | joblist .tophatjobs
    done

    wait4jobs .tophatjobs

    ziprm tophat_logs ${project}__tophat__*.log

    dge_bam_correlate .
}
export -f dge_tophat_se
## Compute pairwise correlations between all (mapped) bam files below the given
## directory using deepTools' bamCorrelate; writes bc.pdf and bc.txt into the
## cwd, blocks until the job finished, then sends a notification mail.
##
## Usage:   dge_bam_correlate <bam_directory>
## Globals: uses $project for job naming; mysub/blockScript/mailme helpers.
dge_bam_correlate(){
    if [ $# -ne 1 ]; then
        echo "Usage: dge_bam_correlate <bam_directory>" >&2
        return 1   ## BUGFIX: usage error used to return 0
    fi

    local bamDir=$1
    local bamFiles bamLabels bcCmd

    ## BUGFIX: '\.bam$' instead of '.bam$' so that names merely *ending* in
    ## 'bam' (e.g. 'foo_bam') are not picked up; 'unmapped' bams are excluded
    bamFiles=$(find "$bamDir" | grep '\.bam$' | grep -v "unmapped" | sort)

    ## derive plot labels from file names: strip directories and bam suffixes
    bamLabels=$(echo "$bamFiles" | sed 's!.*/!!' | sed 's/_ca.bam//g' | sed 's/.bam//g' | xargs echo)
    echo $bamLabels

    ## see how well bam files correlate using untrimmed data
    bcCmd="bamCorrelate bins --bamfiles $(echo $bamFiles | xargs echo) --labels $bamLabels --plotFile='bc.pdf' --outFileCorMatrix='bc.txt' --numberOfProcessors=6 --corMethod spearman --zMin 0.5 --zMax 1"

    mysub "${project}__bamcorrelate" "$bcCmd" -q long -n 4 -R span[hosts=1] | blockScript

    mailme "$project: bamcorrelate done in $(pwd)"
}
export -f dge_bam_correlate
dge_cuffdiff(){
local gtfFile=$1
local bamDir=$2
local labels=$3
if [ $# -ne 3 ]; then
echo "Usage: dge_fastqc <gtf_file> <bam_directory> <labels>" >&2 ; return;
## (removed scraped line-number artifacts "197"–"225" that were interleaved here
## by the web extraction; they are not shell code)
fi
if [ -z "$(which cuffdiff)" ]; then
>&2 echo "no cuffdiff binary in PATH"; return;
fi
if [ ! -f $gtfFile ]; then
>&2 echo "gtf '$gtfFile' does not exis"; return;
fi
## collect all bam-files and group them
allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | sort)
bamsSplit=""
for label in $(echo $labels | tr ", " " "); do
echo $label
labelBams=$(echo "$allBams" | grep $label | xargs echo -n | tr ' ' ',')
bamsSplit+=$labelBams" "
done
#echo $bamsSplit
## make sure (again that all files are there
echo $gtfFile $bamsSplit | tr "," " " | xargs ls -la
#mysub ${project}_cuffdiff "cuffdiff -L aRGm,aRGp,aRG,bRG,IPC,N -o . -p 8 mm10_ensGene_ccds.gtf $amBams $apBams $aBams $bBams $cBams $dBams 2>&1 | tee cuffdiff.log" -q long -n 8 -R span[hosts=1] | blockScript
cdCmd="cuffdiff -L $labels -o . -p 10 $gtfFile $bamsSplit"
#echo "cmd is: $cdCmd"
# eval $cdCmd
mysub "${project}__cuffdiff" "$cdCmd" -q long -n 4 -R span[hosts=1] | blockScript
## create a cuffdiff db using cummerbund in a temporary directory to avoid locking problems
tmpDbDir=$(mktemp -d)
cp -r . $tmpDbDir
## todo remove this hack
genome=$(echo $gtfFile | cut -f7 -d'/'); echo "genome is $genome"
## make sure to use temp-r to avoid file locking problems
echo '
require(cummeRbund)
dbDir=commandArgs(T)[1]
gtfFile=commandArgs(T)[2]
genome=commandArgs(T)[3]
## note without providing the gtf the db is much smaller
readCufflinks(dir=dbDir, rebuild=T, gtf=gtfFile, genome=genome)
' | R -q --no-save --no-restore --args $tmpDbDir $gtfFile $genome
if [ ! -f $tmpDbDir/cuffData.db ]; then
>&2 echo "cummerbund failed to create cuffidff sqlite db"; return;
fi
cp $tmpDbDir/cuffData.db .
rm -rf $tmpDbDir ## because it's no longer needed