Newer
Older
## docs
## http://blog.joncairns.com/2013/08/what-you-need-to-know-about-bash-functions/
source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/bash/lsf_utils.sh 2>&1 2>/dev/null)
source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.12/R/spinr/spin_utils.sh 2>&1 2>/dev/null)
export PATH=/projects/bioinfo/holger/bin/bowtie2-2.2.2:$PATH
export PATH=/projects/bioinfo/holger/bin/tophat-2.0.13.Linux_x86_64:$PATH
export PATH=/home/brandl/bin/cufflinks-2.2.1.Linux_x86_64:$PATH
export PATH=/sw/apps/python/current/bin:$PATH
export PATH=/home/brandl/bin/deepTools/bin:$PATH
export PATH=/projects/bioinfo/holger/bin/FastQC_0.11.2:$PATH
export PATH=/projects/bioinfo/holger/bin/bedtools-2.23.0/bin/:$PATH
export PATH=/home/brandl/bin/spinr:$PATH
export PATH=/home/brandl/bin/subread-1.4.6-p3-Linux-x86_64/bin:$PATH
#export PATH=/home/brandl/bin/STAR/STAR-STAR_2.4.1d/source:$PATH
export PATH=/home/brandl/bin/STAR/STAR_2.4.2a/source:$PATH
## no longer needed because packages are no kept in home
#export R_LIBS=/tmp/r_index ## export to make sure that packages are load from local repository, otherwise sqlite won't work
## create fastq report for all fastq and fastq.gz files in the current directory
dge_fastqc(){
while getopts "o:" curopt; do
case $curopt in
o) outputDir=$OPTARG;
esac
done
shift $(($OPTIND - 1))
local fastqFiles=$*
#if [ -z "$fastqFiles" ]; then
if [ $# -lt 1 ]; then
echo "Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+" >&2 ; return;
fi
fi
if [ ! -d "$outputDir" ]; then
echo "creating output directory '$outputDir'"
mkdir $outputDir
fi
for fastqFile in $fastqFiles ; do
echo "fastqcing $fastqFile"
if [ ! -f $fastqFile ]; then
continue;
fi
mysub "fastqc__$(basename $fastqFile)" "fastqc -j /sw/bin/java -o $outputDir -f fastq $fastqFile" -q medium | joblist .fastqc_jobs
done
wait4jobs .fastqc_jobs
spin.R ${NGS_TOOLS}/dge_workflow/fastqc_summary.R $outputDir
export Ill_ADAPTERS=/projects/bioinfo/common/cutadapt_patterns.fasta
for fastqFile in $* ; do
# DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/treps_pooled/mouse_liver_polar_stage1_rep4.fastq.gz
caFastq=$(basename $fastqFile .fastq.gz)_ca.fastq.gz
echo "cutadapting $caFastq into $caFastq"
#todo use a more specific trimming model (trim just correct part on each side without using reverse complements
mysub "${project}__ca__$(basename $fastqFile .fastq.gz)" "cutadapt -b file:$Ill_ADAPTERS -m 20 -q 25 -o $caFastq $fastqFile" -q long | joblist .cajobs
spin.R ${NGS_TOOLS}/dge_workflow/cutadapt_summary.R .
## todo do a small report here about what has been trimmed away and why
#http://wiki.bash-hackers.org/howto/getopts_tutorial
dge_tophat_se(){
# http://stackoverflow.com/questions/18414054/bash-getopts-reading-optarg-for-optional-flags
while getopts "i:" curopt; do
case $curopt in
i) IGENOME="$OPTARG";
esac
done
shift $(($OPTIND - 1))
local fastqFiles=$*
# IGENOME=/projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
if [ -z "$IGENOME" ] || [ -z "$fastqFiles" ];
then echo "Usage: dge_tophat_se -i <path to igenome> [<fastq.gz file>]+" >&2 ; return;
fi
export bowtie_gindex="$IGENOME/Sequence/Bowtie2Index/genome"
export gtfFile="$IGENOME/Annotation/Genes/genes.gtf"
#head $gtfFile
if [ ! -f $gtfFile ]; then
>&2 echo "gtf '$gtfFile' does not exis"; return;
fi
if [ -z "$(which tophat)" ]; then
>&2 echo "no tomcat binary in PATH"; return;
fi
echo "running tophat using igenome '$IGENOME' for the following files"
ll $fastqFiles
#fastqFiles=$(ls $baseDir/treps_pooled/*fastq.gz)
for fastqFile in $fastqFiles ; do
echo "submitting tophat job for $fastqFile"
# DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/trimmed/mouse_big_cyst_rep4_ca.fastq.gz
fastqBaseName=$(basename ${fastqFile%%.fastq.gz})
outputdir=$fastqBaseName
## uniquely mapping reads only: http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
### tophat -p6 -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile
mysub "${project}__tophat__${fastqBaseName}" "
tophat -p6 -G $gtfFile -g1 -o $outputdir $bowtie_gindex $fastqFile
mv $outputdir/accepted_hits.bam $outputdir/$(basename $outputdir).bam
samtools index $outputdir/$(basename $outputdir).bam
" -n 5 -R span[hosts=1] -q medium | joblist .tophatjobs
spin.R ${NGS_TOOLS}/dge_workflow/tophat_qc.R .
dge_bam_correlate(){
if [ $# -ne 1 ]; then
echo "Usage: dge_bam_correlate <bam_directory>" >&2 ; return;
fi
local bamDir=$1
bamFiles=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | sort)
bamLabels=$(echo "$bamFiles" |sed 's!.*/!!' | sed 's/_ca.bam//g' | sed 's/.bam//g' | xargs echo); echo $bamLabels
## see how well bam files correlate using untrimmed data
bcCmd="bamCorrelate bins --bamfiles $(echo $bamFiles | xargs echo) --labels $bamLabels --plotFile='bc.pdf' --outFileCorMatrix='bc.txt' --numberOfProcessors=6 --corMethod spearman --zMin 0.5 --zMax 1"
mysub "${project}__bamcorrelate" "$bcCmd" -q long -n 4 -R span[hosts=1] | blockScript .bamcorrelate
mailme "$project: bamcorrelate done in $(pwd)"
}
export -f dge_bam_correlate
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
dge_bigwig(){
usage="Usage: dge_bigwig <genome_fai> [<bam_file>]+"
if [ $# -lt 2 ]; then
echo ${usage} >&2 ; return;
fi
#bamFiles=$(find . -name "*.bam" | grep -v unmapped | xargs echo)
genomeFai=$1
bamFiles="${@:2}"
if [ ! -f "${genomeFai}" ]; then
echo "could not find fai index $1! ${usage}" >&2 ; return;
fi
## add bigwig to PATH
export PATH=~/bin:${PATH}
if [ -z "$(which wigToBigWig 2>/dev/null)" ]; then
echo "could not find wigToBigWig in PATH! ${usage}" >&2 ; #return;
fi
## create big wig files
for bamFile in $bamFiles; do
sample=$(basename $bamFile .bam)
echo "converting $bamFile to bigwig format"
mysub "${project}__bw__${sample}" "genomeCoverageBed -bg -ibam $bamFile -g ${genomeFai} | wigToBigWig -clip stdin ${genomeFai} ${sample}.bw" -q short | joblist .bigwig
done
wait4jobs .bigwig
}
export -f dge_bigwig
## http://stackoverflow.com/questions/6916856/can-bash-show-a-functions-definition
#type dge_bigwig
## Merge technical replicatews
dge_merge_treps(){
usage="Usage: dge_merge_treps <bam_directory> <comma_sep_biosample_spec>"
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
if [ $# -ne 2 ]; then
echo $usage >&2 ; return;
fi
local bam_dir=$1; # bam_dir=$baseDir/mapped_trim/
local bio_reps=$2; # bio_samples="test,lala"
if [ ! -d "$bam_dir" ]; then
echo "bam file directory does not exist! $usage'" >&2 ; return;
fi
if [ $(echo "$bio_samples" | grep "," | wc -l ) -ne 1 ]; then
echo "Invalid biosample spec! $usage'" >&2 ; return;
fi
#for sample in aRG bRG N; do
for sample in $(echo $bio_samples | tr ",", " "); do
# DEBUG sample=Insm1_1103
## todo use bsub instead here
(
sampleBams=$(find $bam_dir -name '*bam' | grep -v unmapped | grep $sample)
echo "merging $sample with $sampleBams"
if [ 1 -eq $(echo "$sampleBams" | wc -l) ]; then
cp $sampleBams $sample.bam
else
samtools merge - $(echo $sampleBams | xargs echo) | samtools sort - $sample
fi
samtools index $sample.bam
)&
done
}
export -f dge_merge_treps
## tester
#dge_merge_treps $baseDir/mapped_trim/ "Ctrl_0703,Ctrl_1029,Ctrl_1103,Insm1_0703,Insm1_1029,Insm1_1103"
#dge_cd(){
#( ## required to work around docopt sys.exit
#usage='
#Use cuffdiff2 to performa a differntial expression analysis
#Usage: dge_cuffdiff [--exclude=<regex>] <gtfFile> <bamDir> <labels>
#
#Options:
#--exclude=<regex> Exclude patterns (grep regex matching bam file paths to be excluded)
#'
#echo "$usage" | ~/bin/docopts/docopts -h - : $*
#eval "$(echo "$usage" | ~/bin/docopts/docopts -h - : $*)"
#
#)
#}
#dge_cd $gtfFilePC $baseDir/mapped $labels
#dge_cd --help
dge_cuffdiff(){
local gtfFile=$1
local bamDir=$2
local labels=$3
if [ $# -ne 3 ]; then
echo "Usage: dge_cuffdiff <gtf_file> <bam_directory> <labels>" >&2 ; return;
fi
if [ -z "$(which cuffdiff)" ]; then
>&2 echo "no cuffdiff binary in PATH"; return;
fi
if [ ! -f $gtfFile ]; then
>&2 echo "gtf '$gtfFile' does not exis"; return;
fi
## collect all bam-files and group them
allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | sort)
if [ -n "$EXCLUDE_BAMS_PATTERN" ]; then
echo "excluding bams with $EXCLUDE_BAMS_PATTERN"
allBams=$(echo "$allBams" | grep -v "$EXCLUDE_BAMS_PATTERN")
fi
## todo use optional argument here --> default "unmapped" --> allow user to add others
#allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "1029" | sort)
#allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "Ctrl_1029" | sort)
bamsSplit=""
for label in $(echo $labels | tr ", " " "); do
# DEBUG label="liver"; label="cyst"
## grep the bam names excluding the path
echo "$allBams" | xargs -n1 basename | grep $label > ${label}.cuff_bamlist
labelBams=$(echo "$allBams" | grep -Ff ${label}.cuff_bamlist | xargs echo -n | tr ' ' ',')
# echo "$allBams" | grep -Ff ${label}.bamlist | xargs -n1 echo
bamsSplit+=$labelBams" "
done
#echo $bamsSplit
## make sure (again that all files are there
echo $gtfFile $bamsSplit | tr "," " " | xargs ls -la
#mysub ${project}_cuffdiff "cuffdiff -L aRGm,aRGp,aRG,bRG,IPC,N -o . -p 8 mm10_ensGene_ccds.gtf $amBams $apBams $aBams $bBams $cBams $dBams 2>&1 | tee cuffdiff.log" -q long -n 8 -R span[hosts=1] | blockScript
cdCmd="cuffdiff -L $labels -o . -p 10 $gtfFile $bamsSplit"
#eval $cdCmd
mysub "${project}__cuffdiff" "$cdCmd" -q long -n 4 -R span[hosts=1] | blockScript
## create a cuffdiff db using cummerbund in a temporary directory to avoid locking problems
tmpDbDir=$(mktemp -d)
cp -r . $tmpDbDir
## todo remove this hack
genome=$(echo $gtfFile | cut -f8 -d'/' | tr '[:upper:]' '[:lower:]'); echo "genome is $genome"
## make sure to use temp-r to avoid file locking problems
#export R_LIBS=/tmp/r_index
echo '
require(cummeRbund)
dbDir=commandArgs(T)[1]
gtfFile=commandArgs(T)[2]
genome=commandArgs(T)[3]
## note without providing the gtf the db is much smaller
readCufflinks(dir=dbDir, rebuild=T, gtf=gtfFile, genome=genome)
' | R -q --no-save --no-restore --args "$tmpDbDir" "$gtfFile" "$genome"
if [ ! -f $tmpDbDir/cuffData.db ]; then
>&2 echo "cummerbund failed to create cuffidff sqlite db"; return;
fi
cp $tmpDbDir/cuffData.db .
rm -rf $tmpDbDir ## because it's no longer needed