## (removed "Newer"/"Older" web-scrape navigation artifacts — they are not shell code)
## docs
## http://blog.joncairns.com/2013/08/what-you-need-to-know-about-bash-functions/

## Source shared cluster helpers (mysub, joblist, wait4jobs, ziprm, blockScript,
## mailme are expected to come from these) straight from a remote URL.
## NOTE(review): '2>&1 2>/dev/null' first duplicates stderr onto stdout and then
## re-points stderr to /dev/null, so the net effect is just '2>/dev/null';
## 'curl -s' would be the clearer spelling.
source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/bash/lsf_utils.sh 2>&1 2>/dev/null)
source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/R/utils/spinr.sh 2>&1 2>/dev/null)

## Prepend pinned tool installations (bowtie2, tophat, cufflinks, python, deepTools)
## so they win over any system-wide versions.
export PATH=/projects/bioinfo/holger/bin/bowtie2-2.2.2:$PATH
export PATH=/projects/bioinfo/holger/bin/tophat-2.0.13.Linux_x86_64:$PATH
export PATH=/home/brandl/bin/cufflinks-2.2.1.Linux_x86_64:$PATH
export PATH=/sw/apps/python/current/bin:$PATH
export PATH=/home/brandl/bin/deepTools/bin:$PATH
export R_LIBS=/tmp/r_index ## export to make sure that packages are loaded from the local repository, otherwise sqlite won't work
## Create a fastqc report for the given fastq/fastq.gz files.
##
## Usage:   dge_fastqc [-o <output_directory>] [<fastq.gz file>]+
## Globals: submits jobs via mysub/joblist and blocks on wait4jobs (.fastqc_jobs)
## Returns: 1 on usage error, otherwise the status of wait4jobs.
dge_fastqc(){
    ## BUGFIX: make getopts state local so the function works on repeated calls
    ## in a sourced shell; original also had a stray unmatched 'fi' (syntax error).
    local OPTIND OPTARG curopt
    local outputDir="."   ## default to cwd; original left it empty -> mkdir ""

    while getopts "o:" curopt; do
        case $curopt in
            o) outputDir=$OPTARG;;
        esac
    done
    shift $((OPTIND - 1))

    local fastqFiles=$*

    if [ $# -lt 1 ]; then
        echo "Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+" >&2
        return 1
    fi

    if [ ! -d "$outputDir" ]; then
        echo "creating output directory '$outputDir'"
        mkdir -p "$outputDir"
    fi

    local fastqFile
    for fastqFile in $fastqFiles ; do
        ## silently skipping was confusing; warn instead (still non-fatal)
        if [ ! -f "$fastqFile" ]; then
            echo "skipping missing file '$fastqFile'" >&2
            continue
        fi

        echo "fastqcing $fastqFile"
        mysub "fastqc__$(basename "$fastqFile")" "fastqc -j /sw/bin/java -o $outputDir -f fastq $fastqFile" -q medium | joblist .fastqc_jobs
    done

    wait4jobs .fastqc_jobs
}
export -f dge_fastqc
## Adapter- and quality-trim the given fastq.gz files with cutadapt.
## Each <name>.fastq.gz is trimmed into <name>_ca.fastq.gz in the cwd.
##
## Usage:   dge_cutadapt [<fastq.gz file>]+
## Globals: exports Ill_ADAPTERS; submits jobs via mysub/joblist (.cajobs);
##          uses $project for job naming.
dge_cutadapt(){
    export Ill_ADAPTERS=/projects/bioinfo/common/illumina_universal_index.fasta

    if [ $# -lt 1 ]; then
        echo "Usage: dge_cutadapt [<fastq.gz file>]+" >&2
        return 1
    fi

    local fastqFile caFastq
    for fastqFile in "$@" ; do
        caFastq=$(basename "$fastqFile" .fastq.gz)_ca.fastq.gz

        ## BUGFIX: message printed the output name twice; report input -> output
        echo "cutadapting $fastqFile into $caFastq"

        ## todo use a more specific trimming model (trim just the correct part
        ## on each side without using reverse complements)
        ## BUGFIX: was "$project__ca__..." which expands the (undefined) variable
        ## 'project__ca__'; ${project} braces are required here.
        mysub "${project}__ca__$(basename "$fastqFile" .fastq.gz)" "cutadapt -b file:$Ill_ADAPTERS -m 20 -q 25 -o $caFastq $fastqFile" -q long | joblist .cajobs
        ## todo do a small report here about what has been trimmed away and why
    done
}
export -f dge_cutadapt
#http://wiki.bash-hackers.org/howto/getopts_tutorial
## Align the given single-end fastq.gz files against an igenome with tophat,
## rename and index the per-sample bam, then run dge_bam_correlate on the cwd.
##
## Usage:   dge_tophat_se -i <path to igenome> [<fastq.gz file>]+
## Globals: exports bowtie_gindex and gtfFile; uses $project for job naming;
##          submits via mysub/joblist, blocks on wait4jobs (.tophatjobs).
## Returns: 1 on usage/validation error.
dge_tophat_se(){
    # http://stackoverflow.com/questions/18414054/bash-getopts-reading-optarg-for-optional-flags
    ## local getopts state so repeated calls in a sourced shell re-parse correctly
    local OPTIND OPTARG curopt
    while getopts "i:" curopt; do
        case $curopt in
            i) IGENOME="$OPTARG";;
        esac
    done
    shift $((OPTIND - 1))

    local fastqFiles=$*
    # IGENOME=/projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38

    if [ -z "$IGENOME" ] || [ -z "$fastqFiles" ]; then
        echo "Usage: dge_tophat_se -i <path to igenome> [<fastq.gz file>]+" >&2
        return 1
    fi

    export bowtie_gindex="$IGENOME/Sequence/Bowtie2Index/genome"
    export gtfFile="$IGENOME/Annotation/Genes/genes.gtf"

    if [ ! -f "$gtfFile" ]; then
        ## BUGFIX: message said "does not exis"
        >&2 echo "gtf '$gtfFile' does not exist"
        return 1
    fi

    ## BUGFIX: message said "no tomcat binary"; also use 'command -v' over 'which'
    if ! command -v tophat >/dev/null; then
        >&2 echo "no tophat binary in PATH"
        return 1
    fi

    echo "running tophat using igenome '$IGENOME' for the following files"
    ll $fastqFiles

    local fastqFile fastqBaseName outputdir
    for fastqFile in $fastqFiles ; do
        echo "submitting tophat job for $fastqFile"

        fastqBaseName=$(basename "${fastqFile%%.fastq.gz}")
        outputdir=$fastqBaseName

        ## uniquely mapping reads only: http://seqanswers.com/forums/showthread.php?t=3464
        ## tophat writes accepted_hits.bam; rename it after the sample and index it
        mysub "${project}__tophat__${fastqBaseName}" "
tophat -p6 -G $gtfFile -g1 -o $outputdir $bowtie_gindex $fastqFile
mv $outputdir/accepted_hits.bam $outputdir/$(basename $outputdir).bam
samtools index $outputdir/$(basename $outputdir).bam
" -n 5 -R span[hosts=1] -q long | joblist .tophatjobs
    done

    wait4jobs .tophatjobs

    ziprm tophat_logs ${project}__tophat__*.log

    dge_bam_correlate .
}
export -f dge_tophat_se
## Compute pairwise correlations between all (mapped) bam files below the given
## directory using deepTools' bamCorrelate; writes bc.pdf and bc.txt into the
## cwd, blocks until the job finished, then sends a notification mail.
##
## Usage:   dge_bam_correlate <bam_directory>
## Globals: uses $project for job naming; mysub/blockScript/mailme helpers.
dge_bam_correlate(){
    if [ $# -ne 1 ]; then
        echo "Usage: dge_bam_correlate <bam_directory>" >&2
        return 1   ## BUGFIX: usage error used to return 0
    fi

    local bamDir=$1
    local bamFiles bamLabels bcCmd

    ## BUGFIX: '\.bam$' instead of '.bam$' so that names merely *ending* in
    ## 'bam' (e.g. 'foo_bam') are not picked up; 'unmapped' bams are excluded
    bamFiles=$(find "$bamDir" | grep '\.bam$' | grep -v "unmapped" | sort)

    ## derive plot labels from file names: strip directories and bam suffixes
    bamLabels=$(echo "$bamFiles" | sed 's!.*/!!' | sed 's/_ca.bam//g' | sed 's/.bam//g' | xargs echo)
    echo $bamLabels

    ## see how well bam files correlate using untrimmed data
    bcCmd="bamCorrelate bins --bamfiles $(echo $bamFiles | xargs echo) --labels $bamLabels --plotFile='bc.pdf' --outFileCorMatrix='bc.txt' --numberOfProcessors=6 --corMethod spearman --zMin 0.5 --zMax 1"

    mysub "${project}__bamcorrelate" "$bcCmd" -q long -n 4 -R span[hosts=1] | blockScript

    mailme "$project: bamcorrelate done in $(pwd)"
}
export -f dge_bam_correlate
dge_cuffdiff(){
local gtfFile=$1
local bamDir=$2
local labels=$3
if [ $# -ne 3 ]; then
echo "Usage: dge_fastqc <gtf_file> <bam_directory> <labels>" >&2 ; return;
## (removed scraped line-number artifacts "197"–"225" that were interleaved here
## by the web extraction; they are not shell code)
fi
if [ -z "$(which cuffdiff)" ]; then
>&2 echo "no cuffdiff binary in PATH"; return;
fi
if [ ! -f $gtfFile ]; then
>&2 echo "gtf '$gtfFile' does not exis"; return;
fi
## collect all bam-files and group them
allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | sort)
bamsSplit=""
for label in $(echo $labels | tr ", " " "); do
echo $label
labelBams=$(echo "$allBams" | grep $label | xargs echo -n | tr ' ' ',')
bamsSplit+=$labelBams" "
done
#echo $bamsSplit
## make sure (again that all files are there
echo $gtfFile $bamsSplit | tr "," " " | xargs ls -la
#mysub ${project}_cuffdiff "cuffdiff -L aRGm,aRGp,aRG,bRG,IPC,N -o . -p 8 mm10_ensGene_ccds.gtf $amBams $apBams $aBams $bBams $cBams $dBams 2>&1 | tee cuffdiff.log" -q long -n 8 -R span[hosts=1] | blockScript
cdCmd="cuffdiff -L $labels -o . -p 10 $gtfFile $bamsSplit"
#echo "cmd is: $cdCmd"
# eval $cdCmd
mysub "${project}__cuffdiff" "$cdCmd" -q long -n 4 -R span[hosts=1] | blockScript
## create a cuffdiff db using cummerbund in a temporary directory to avoid locking problems
tmpDbDir=$(mktemp -d)
cp -r . $tmpDbDir
## todo remove this hack
genome=$(echo $gtfFile | cut -f7 -d'/'); echo "genome is $genome"
## make sure to use temp-r to avoid file locking problems
echo '
require(cummeRbund)
dbDir=commandArgs(T)[1]
gtfFile=commandArgs(T)[2]
genome=commandArgs(T)[3]
## note without providing the gtf the db is much smaller
readCufflinks(dir=dbDir, rebuild=T, gtf=gtfFile, genome=genome)
' | R -q --no-save --no-restore --args $tmpDbDir $gtfFile $genome
if [ ! -f $tmpDbDir/cuffData.db ]; then
>&2 echo "cummerbund failed to create cuffidff sqlite db"; return;
fi
cp $tmpDbDir/cuffData.db .
rm -rf $tmpDbDir ## because it's no longer needed