Newer
Older
## docs
## http://blog.joncairns.com/2013/08/what-you-need-to-know-about-bash-functions/
source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.14/bash/lsf_utils.sh 2>&1 2>/dev/null)
source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.14/R/spinr/spin_utils.sh 2>&1 2>/dev/null)
export PATH=/projects/bioinfo/holger/bin/bowtie2-2.2.5:$PATH
export PATH=/projects/bioinfo/holger/bin/tophat-2.0.13.Linux_x86_64:$PATH
export PATH=/home/brandl/bin/cufflinks-2.2.1.Linux_x86_64:$PATH
export PATH=/sw/apps/python/current/bin:$PATH
export PATH=/home/brandl/bin/deepTools/bin:$PATH
export PATH=/projects/bioinfo/holger/bin/FastQC_0.11.2:$PATH
export PATH=/projects/bioinfo/holger/bin/bedtools-2.23.0/bin/:$PATH
export PATH=/home/brandl/bin/spinr:$PATH
export PATH=/home/brandl/bin/subread-1.4.6-p3-Linux-x86_64/bin:$PATH
#export PATH=/home/brandl/bin/STAR/STAR-STAR_2.4.1d/source:$PATH
export PATH=/home/brandl/bin/STAR/STAR_2.4.2a/source:$PATH
## no longer needed because packages are no kept in home
#export R_LIBS=/tmp/r_index ## export to make sure that packages are load from local repository, otherwise sqlite won't work
## create fastq report for all fastq and fastq.gz files in the current directory
dge_fastqc(){
while getopts "o:" curopt; do
case $curopt in
o) outputDir=$OPTARG;
esac
done
shift $(($OPTIND - 1))
local fastqFiles=$*
#if [ -z "$fastqFiles" ]; then
if [ $# -lt 1 ]; then
echo "Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+" >&2 ; return;
fi
fi
if [ ! -d "$outputDir" ]; then
echo "creating output directory '$outputDir'"
mkdir $outputDir
fi
for fastqFile in $fastqFiles ; do
echo "fastqcing $fastqFile"
if [ ! -f $fastqFile ]; then
continue;
fi
mysub "fastqc__$(basename $fastqFile)" "fastqc -j /sw/bin/java -o $outputDir -f fastq $fastqFile" -q medium | joblist .fastqc_jobs
done
wait4jobs .fastqc_jobs
spin.R ${NGS_TOOLS}/dge_workflow/fastqc_summary.R $outputDir
export Ill_ADAPTERS=/projects/bioinfo/common/cutadapt_patterns.fasta
for fastqFile in $* ; do
# DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/treps_pooled/mouse_liver_polar_stage1_rep4.fastq.gz
caFastq=$(basename $fastqFile .fastq.gz)_ca.fastq.gz
echo "cutadapting $caFastq into $caFastq"
#todo use a more specific trimming model (trim just correct part on each side without using reverse complements
mysub "${project}__ca__$(basename $fastqFile .fastq.gz)" "cutadapt -b file:$Ill_ADAPTERS -m 20 -q 25 -o $caFastq $fastqFile" -q long | joblist .cajobs
spin.R ${NGS_TOOLS}/dge_workflow/cutadapt_summary.R .
## todo do a small report here about what has been trimmed away and why
#http://wiki.bash-hackers.org/howto/getopts_tutorial
dge_tophat_se(){
# http://stackoverflow.com/questions/18414054/bash-getopts-reading-optarg-for-optional-flags
while getopts "i:" curopt; do
case $curopt in
i) IGENOME="$OPTARG";
esac
done
shift $(($OPTIND - 1))
local fastqFiles=$*
# IGENOME=/projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
if [ -z "$IGENOME" ] || [ -z "$fastqFiles" ];
then echo "Usage: dge_tophat_se -i <path to igenome> [<fastq.gz file>]+" >&2 ; return;
fi
export bowtie_gindex="$IGENOME/Sequence/Bowtie2Index/genome"
export gtfFile="$IGENOME/Annotation/Genes/genes.gtf"
#head $gtfFile
if [ ! -f $gtfFile ]; then
>&2 echo "gtf '$gtfFile' does not exis"; return;
fi
if [ -z "$(which tophat)" ]; then
>&2 echo "no tomcat binary in PATH"; return;
fi
echo "running tophat using igenome '$IGENOME' for the following files"
ll $fastqFiles
#fastqFiles=$(ls $baseDir/treps_pooled/*fastq.gz)
for fastqFile in $fastqFiles ; do
echo "submitting tophat job for $fastqFile"
# DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/trimmed/mouse_big_cyst_rep4_ca.fastq.gz
fastqBaseName=$(basename ${fastqFile%%.fastq.gz})
outputdir=$fastqBaseName
## uniquely mapping reads only: http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
### tophat -p6 -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile
mysub "${project}__tophat__${fastqBaseName}" "
tophat -p6 -G $gtfFile -g1 -o $outputdir $bowtie_gindex $fastqFile
mv $outputdir/accepted_hits.bam $outputdir/$(basename $outputdir).bam
samtools index $outputdir/$(basename $outputdir).bam
" -n 5 -R span[hosts=1] -q medium | joblist .tophatjobs
## see https://github.com/fidelram/deepTools/wiki/QC#bamCorrelate
spin.R ${NGS_TOOLS}/dge_workflow/tophat_qc.R .
if [ $# -eq 0 ]; then
echo "Usage: dge_bam_correlate <bam_directory>" >&2 ;
echo "Usage: dge_bam_correlate <bam_files...>" >&2 ;
return;
if [ $# -e 1 ]; then
bamFiles=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | sort)
else
bamFiles=$*
fi
bamLabels=$(echo "$bamFiles" | xargs -n1 basename | sed 's!.*/!!' | sed 's/_mmf.bam//g' | sed 's/_ca.bam//g' | sed 's/.bam//g' | xargs echo); echo $bamLabels
## see how well bam files correlate using untrimmed data
bcCmd="bamCorrelate bins --bamfiles $(echo $bamFiles | xargs echo) --labels $bamLabels --plotFile='bc.pdf' --outFileCorMatrix='bc.txt' --numberOfProcessors=6 --corMethod spearman --zMin 0.5 --zMax 1"
echo $bcCmd
#mysub "${project}__bamcorrelate" "$bcCmd" -q long -n 4 -R span[hosts=1] | blockScript .bamcorrelate
#mailme "$project: bamcorrelate done in $(pwd)"
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
dge_bigwig(){
usage="Usage: dge_bigwig <genome_fai> [<bam_file>]+"
if [ $# -lt 2 ]; then
echo ${usage} >&2 ; return;
fi
#bamFiles=$(find . -name "*.bam" | grep -v unmapped | xargs echo)
genomeFai=$1
bamFiles="${@:2}"
if [ ! -f "${genomeFai}" ]; then
echo "could not find fai index $1! ${usage}" >&2 ; return;
fi
## add bigwig to PATH
export PATH=~/bin:${PATH}
if [ -z "$(which wigToBigWig 2>/dev/null)" ]; then
echo "could not find wigToBigWig in PATH! ${usage}" >&2 ; #return;
fi
## create big wig files
for bamFile in $bamFiles; do
sample=$(basename $bamFile .bam)
echo "converting $bamFile to bigwig format"
mysub "${project}__bw__${sample}" "genomeCoverageBed -split -bg -ibam $bamFile -g ${genomeFai} | wigToBigWig -clip stdin ${genomeFai} ${sample}.bw" -q short | joblist .bigwig
done
wait4jobs .bigwig
}
export -f dge_bigwig
## http://stackoverflow.com/questions/6916856/can-bash-show-a-functions-definition
#type dge_bigwig
## Merge technical replicatews
dge_merge_treps(){
usage="Usage: dge_merge_treps <bam_directory> <comma_sep_biosample_spec>"
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
if [ $# -ne 2 ]; then
echo $usage >&2 ; return;
fi
local bam_dir=$1; # bam_dir=$baseDir/mapped_trim/
local bio_reps=$2; # bio_samples="test,lala"
if [ ! -d "$bam_dir" ]; then
echo "bam file directory does not exist! $usage'" >&2 ; return;
fi
if [ $(echo "$bio_samples" | grep "," | wc -l ) -ne 1 ]; then
echo "Invalid biosample spec! $usage'" >&2 ; return;
fi
#for sample in aRG bRG N; do
for sample in $(echo $bio_samples | tr ",", " "); do
# DEBUG sample=Insm1_1103
## todo use bsub instead here
(
sampleBams=$(find $bam_dir -name '*bam' | grep -v unmapped | grep $sample)
echo "merging $sample with $sampleBams"
if [ 1 -eq $(echo "$sampleBams" | wc -l) ]; then
cp $sampleBams $sample.bam
else
samtools merge - $(echo $sampleBams | xargs echo) | samtools sort - $sample
fi
samtools index $sample.bam
)&
done
}
export -f dge_merge_treps
## tester
#dge_merge_treps $baseDir/mapped_trim/ "Ctrl_0703,Ctrl_1029,Ctrl_1103,Insm1_0703,Insm1_1029,Insm1_1103"
#dge_cd(){
#( ## required to work around docopt sys.exit
#usage='
#Use cuffdiff2 to performa a differntial expression analysis
#Usage: dge_cuffdiff [--exclude=<regex>] <gtfFile> <bamDir> <labels>
#
#Options:
#--exclude=<regex> Exclude patterns (grep regex matching bam file paths to be excluded)
#'
#echo "$usage" | ~/bin/docopts/docopts -h - : $*
#eval "$(echo "$usage" | ~/bin/docopts/docopts -h - : $*)"
#
#)
#}
#dge_cd $gtfFilePC $baseDir/mapped $labels
#dge_cd --help
dge_cuffdiff(){
local gtfFile=$1
local bamDir=$2
local labels=$3
if [ $# -ne 3 ]; then
echo "Usage: dge_cuffdiff <gtf_file> <bam_directory> <labels>" >&2 ; return;
fi
if [ -z "$(which cuffdiff)" ]; then
>&2 echo "no cuffdiff binary in PATH"; return;
fi
if [ ! -f $gtfFile ]; then
>&2 echo "gtf '$gtfFile' does not exis"; return;
fi
## collect all bam-files and group them
allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | sort)
if [ -n "$EXCLUDE_BAMS_PATTERN" ]; then
echo "excluding bams with $EXCLUDE_BAMS_PATTERN"
allBams=$(echo "$allBams" | grep -v "$EXCLUDE_BAMS_PATTERN")
fi
## todo use optional argument here --> default "unmapped" --> allow user to add others
#allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "1029" | sort)
#allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "Ctrl_1029" | sort)
bamsSplit=""
for label in $(echo $labels | tr ", " " "); do
# DEBUG label="liver"; label="cyst"
## grep the bam names excluding the path
echo "$allBams" | xargs -n1 basename | grep $label > ${label}.cuff_bamlist
labelBams=$(echo "$allBams" | grep -Ff ${label}.cuff_bamlist | xargs echo -n | tr ' ' ',')
# echo "$allBams" | grep -Ff ${label}.bamlist | xargs -n1 echo
bamsSplit+=$labelBams" "
done
#echo $bamsSplit
## make sure (again that all files are there
echo $gtfFile $bamsSplit | tr "," " " | xargs ls -la
#mysub ${project}_cuffdiff "cuffdiff -L aRGm,aRGp,aRG,bRG,IPC,N -o . -p 8 mm10_ensGene_ccds.gtf $amBams $apBams $aBams $bBams $cBams $dBams 2>&1 | tee cuffdiff.log" -q long -n 8 -R span[hosts=1] | blockScript
cdCmd="cuffdiff -L $labels -o . -p 10 $gtfFile $bamsSplit"
#eval $cdCmd
mysub "${project}__cuffdiff" "$cdCmd" -q long -n 4 -R span[hosts=1] | blockScript
## create a cuffdiff db using cummerbund in a temporary directory to avoid locking problems
tmpDbDir=$(mktemp -d)
cp -r . $tmpDbDir
## todo remove this hack
genome=$(echo $gtfFile | cut -f8 -d'/' | tr '[:upper:]' '[:lower:]'); echo "genome is $genome"
## make sure to use temp-r to avoid file locking problems
#export R_LIBS=/tmp/r_index
echo '
require(cummeRbund)
dbDir=commandArgs(T)[1]
gtfFile=commandArgs(T)[2]
genome=commandArgs(T)[3]
## note without providing the gtf the db is much smaller
readCufflinks(dir=dbDir, rebuild=T, gtf=gtfFile, genome=genome)
' | R -q --no-save --no-restore --args "$tmpDbDir" "$gtfFile" "$genome"
if [ ! -f $tmpDbDir/cuffData.db ]; then
>&2 echo "cummerbund failed to create cuffidff sqlite db"; return;
fi
cp $tmpDbDir/cuffData.db .
rm -rf $tmpDbDir ## because it's no longer needed
# Create a star index for a given igenome
dge_create_star_index(){
if [ $# -ne 1 ]; then
echo "Usage: dge_create_star_index <igenome>" >&2 ; return;
fi
igenome=$1
if [ ! -d "$igenome" ] | [ ! -d "${igenome}/Sequence" ]; then
echo "igenome directory '$igenome' does not exist" >&2 ; return;
fi
export star_index="${igenome}/Sequence/StarIndex"
## stop if index exists already
if [ -d "$star_index" ]; then
echo "Error: Index directory ${star_index} already exists." >&2 ; return;
fi
chmod +w $(dirname ${star_index})
mailme "${project}: creating STAR index in ${star_index}"
mkdir ${star_index}
cmd="STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/WholeGenomeFasta/genome.fa --runThreadN 10"
#eval $cmd
#STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/Chromosomes/*.fa --runThreadN 10
mysub "${project}_star_index" "$cmd" -n 5 -R span[hosts=1] -q medium | blockScript
## prevent further modification
chmod -w $(dirname ${star_index})
mailme "created star index for $igenome"
}
export -f dge_create_star_index
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
dge_star_counts2matrix(){
echo '
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/core_commons.R")
## STAR count file format is
#column 1: gene ID
#column 2: counts for unstranded RNA-seq
#column 3: counts for the 1st read strand aligned with RNA (htseq-count option -s yes)
exprCounts <- list.files(".", "ReadsPerGene.out.tab") %>% ldply(function(countFile){
read.delim(countFile, header=F) %>%
select(V1, V2) %>%
set_names("gene_id", "num_alignments") %>%
filter(!str_detect(gene_id, "^N_")) %>%
mutate(sample=trim_ext(countFile, ".ReadsPerGene.out.tab"))
}, .progress="text")
countMatrix <- spread(exprCounts, sample, num_alignments)
write.delim(countMatrix, "star_count_matrix.txt")
' | R --vanilla -q
}
export -f dge_star_counts2matrix