## docs
## http://blog.joncairns.com/2013/08/what-you-need-to-know-about-bash-functions/
#source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.20/bash/lsf_utils.sh 2>&1 2>/dev/null)
source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.20/R/spinr/spin_utils.sh 2>&1 2>/dev/null)
source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.20/R/rendr/rendr_utils.sh 2>&1 2>/dev/null)
export PATH=/projects/bioinfo/holger/bin/bowtie2-2.2.5:$PATH
export PATH=/projects/bioinfo/holger/bin/tophat-2.0.13.Linux_x86_64:$PATH
export PATH=/home/brandl/bin/cufflinks-2.2.1.Linux_x86_64:$PATH
export PATH=/sw/apps/python/current/bin:$PATH
export PATH=/home/brandl/bin/deepTools/bin:$PATH
export PATH=/projects/bioinfo/holger/bin/FastQC_0.11.2:$PATH
export PATH=/projects/bioinfo/holger/bin/bedtools2-2.25.0/bin/:$PATH
export PATH=/projects/bioinfo/holger/bin/samtools-1.3:$PATH
export PATH=/home/brandl/bin/subread-1.4.6-p3-Linux-x86_64/bin:$PATH
#export PATH=/home/brandl/bin/STAR/STAR-STAR_2.4.1d/source:$PATH
export PATH=/home/brandl/bin/STAR-2.5.0b/source:$PATH
## add cluster job manager
export PATH=/projects/bioinfo/scripts/joblist_v0.5:$PATH
## backward compatibility wrappers for old shell cluster utils
joblist(){
jl add $*
}
export -f joblist
wait4jobs(){
jl wait $*
}
export -f wait4jobs
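## tester for the wrappers (example only; job name and joblist file are placeholders)
# mysub test_job "sleep 10" -q short | joblist .test_jobs
# wait4jobs .test_jobs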
## also add spinr
export PATH=$PATH:/projects/bioinfo/scripts/spinr
mcdir(){
if [ ! -d "$1" ]; then
mkdir "$1";
fi;
cd "$1";
}
export -f mcdir
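## tester (example only; path is a placeholder): creates the directory if missing, then cds into it
# mcdir /tmp/my_analysis_run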
## no longer needed because packages are now kept in home
#export R_LIBS=/tmp/r_index ## export to make sure that packages are load from local repository, otherwise sqlite won't work
## create fastq report for all fastq and fastq.gz files in the current directory
dge_fastqc(){
local OPTIND=1 outputDir="" ## reset getopts state and output directory for repeated calls
while getopts "o:" curopt; do
case $curopt in
o) outputDir=$OPTARG;
esac
done
shift $(($OPTIND - 1))
local fastqFiles=$*
#if [ -z "$fastqFiles" ]; then
if [ $# -lt 1 ]; then
echo "Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+" >&2 ; return;
fi
## default to the current directory if no -o option was given
if [ -z "$outputDir" ]; then outputDir="."; fi
if [ ! -d "$outputDir" ]; then
echo "creating output directory '$outputDir'"
mkdir "$outputDir"
fi
for fastqFile in $fastqFiles ; do
echo "fastqcing $fastqFile"
if [ ! -f $fastqFile ]; then
continue;
fi
mysub "fastqc__$(basename $fastqFile)" "fastqc -j /sw/bin/java -o $outputDir -f fastq $fastqFile" -q medium | joblist .fastqc_jobs
done
wait4jobs .fastqc_jobs
rend.R ${NGS_TOOLS}/dge_workflow/fastqc_summary.R $outputDir
}
export -f dge_fastqc
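## tester (example only; output directory and fastq names are placeholders)
# dge_fastqc -o fastqc_reports *.fastq.gz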
export Ill_ADAPTERS=/projects/bioinfo/common/cutadapt_patterns.fasta
for fastqFile in $* ; do
# DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/treps_pooled/mouse_liver_polar_stage1_rep4.fastq.gz
caFastq=$(basename $fastqFile .fastq.gz)_ca.fastq.gz
echo "cutadapting $caFastq into $caFastq"
#todo use a more specific trimming model (trim just the correct part on each side without using reverse complements)
mysub "${project}__ca__$(basename $fastqFile .fastq.gz)" "cutadapt -b file:$Ill_ADAPTERS -m 20 -q 25 -o $caFastq $fastqFile" -q long | joblist .cajobs
spin.R ${NGS_TOOLS}/dge_workflow/cutadapt_summary.R .
## todo do a small report here about what has been trimmed away and why
#http://wiki.bash-hackers.org/howto/getopts_tutorial
dge_tophat_se(){
# http://stackoverflow.com/questions/18414054/bash-getopts-reading-optarg-for-optional-flags
local OPTIND=1 IGENOME="" ## reset getopts state for repeated calls
while getopts "i:" curopt; do
case $curopt in
i) IGENOME="$OPTARG";
esac
done
shift $(($OPTIND - 1))
local fastqFiles=$*
# IGENOME=/projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38
if [ -z "$IGENOME" ] || [ -z "$fastqFiles" ];
then echo "Usage: dge_tophat_se -i <path to igenome> [<fastq.gz file>]+" >&2 ; return;
fi
export bowtie_gindex="$IGENOME/Sequence/Bowtie2Index/genome"
export gtfFile="$IGENOME/Annotation/Genes/genes.gtf"
#head $gtfFile
if [ ! -f $gtfFile ]; then
>&2 echo "gtf '$gtfFile' does not exis"; return;
fi
if [ -z "$(which tophat)" ]; then
>&2 echo "no tomcat binary in PATH"; return;
fi
echo "running tophat using igenome '$IGENOME' for the following files"
ls -la $fastqFiles
#fastqFiles=$(ls $baseDir/treps_pooled/*fastq.gz)
for fastqFile in $fastqFiles ; do
echo "submitting tophat job for $fastqFile"
# DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/trimmed/mouse_big_cyst_rep4_ca.fastq.gz
fastqBaseName=$(basename ${fastqFile%%.fastq.gz})
outputdir=$fastqBaseName
## uniquely mapping reads only: http://seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
### tophat -p6 -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile
mysub "${project}__tophat__${fastqBaseName}" "
tophat -p6 -G $gtfFile -g1 -o $outputdir $bowtie_gindex $fastqFile
mv $outputdir/accepted_hits.bam $outputdir/$(basename $outputdir).bam
samtools index $outputdir/$(basename $outputdir).bam
" -n 5 -R span[hosts=1] -q medium | joblist .tophatjobs
## see https://github.com/fidelram/deepTools/wiki/QC#bamCorrelate
spin.R ${NGS_TOOLS}/dge_workflow/tophat_qc.R .
}
export -f dge_tophat_se
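## tester (example only; igenome path and fastq names are placeholders)
# dge_tophat_se -i /projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38 trimmed/*_ca.fastq.gz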
dge_bam_correlate(){
if [ $# -eq 0 ]; then
echo "Usage: dge_bam_correlate <bam_directory>" >&2 ;
echo "Usage: dge_bam_correlate <bam_files...>" >&2 ;
return;
fi
## single argument --> bam directory; otherwise treat the arguments as bam files (second usage form)
if [ $# -eq 1 ]; then
bamFiles=$(find $1 | grep ".bam$" | grep -v "unmapped" | sort)
else
bamFiles=$*
fi
local bamLabels=$(echo "$bamFiles" | xargs -n1 basename | sed 's!.*/!!' | sed 's/_mmf.bam//g' | sed 's/_ca.bam//g' | sed 's/.bam//g' | xargs echo);
echo "Used bam labels are: $bamLabels"
## see how well bam files correlate using untrimmed data
bcCmd="bamCorrelate bins --bamfiles $(echo $bamFiles | xargs echo) --labels $bamLabels --plotFile=bc.pdf --outFileCorMatrix=bc.txt --numberOfProcessors=4 --corMethod spearman --zMin 0.5 --zMax 1"
#echo "cmd is $bcCmd"
#mysub "${project}__bamcorrelate" "$bcCmd" -q long -n 4 -R span[hosts=1] | blockScript .bamcorrelate
jl submit --wait --jl .bamcorrelate -q long -t 4 -n "${project}__bamcorrelate" "$bcCmd"
}
export -f dge_bam_correlate
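## tester (example only): correlate all indexed bams below the current directory
# dge_bam_correlate .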
dge_bigwig(){
usage="Usage: dge_bigwig <genome_fai> [<bam_file>]+"
if [ $# -lt 2 ]; then
echo ${usage} >&2 ; return;
fi
#bamFiles=$(find . -name "*.bam" | grep -v unmapped | xargs echo)
genomeFai=$1
bamFiles="${@:2}"
if [ ! -f "${genomeFai}" ]; then
echo "could not find fai index $1! ${usage}" >&2 ; return;
fi
## add bigwig to PATH
export PATH=~/bin:${PATH}
if [ -z "$(which wigToBigWig 2>/dev/null)" ]; then
echo "could not find wigToBigWig in PATH! ${usage}" >&2 ; #return;
fi
## create big wig files
for bamFile in $bamFiles; do
sample=$(basename $bamFile .bam)
echo "converting $bamFile to bigwig format"
mysub "${project}__bw__${sample}" "genomeCoverageBed -split -bg -ibam $bamFile -g ${genomeFai} | wigToBigWig -clip stdin ${genomeFai} ${sample}.bw" -q short | joblist .bigwig
done
wait4jobs .bigwig
}
export -f dge_bigwig
## http://stackoverflow.com/questions/6916856/can-bash-show-a-functions-definition
#type dge_bigwig
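## tester (example only; fai index and bam names are placeholders)
# dge_bigwig $IGENOME/Sequence/WholeGenomeFasta/genome.fa.fai *.bam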
## Merge technical replicates
dge_merge_treps(){
usage="Usage: dge_merge_treps <bam_directory> <comma_sep_biosample_spec>"
if [ $# -ne 2 ]; then
echo $usage >&2 ; return;
fi
local bam_dir=$1; # bam_dir=$baseDir/mapped_trim/
local bio_samples=$2; # bio_samples="test,lala"
if [ ! -d "$bam_dir" ]; then
echo "bam file directory does not exist! $usage'" >&2 ; return;
fi
if [ $(echo "$bio_samples" | grep "," | wc -l ) -ne 1 ]; then
echo "Invalid biosample spec! $usage'" >&2 ; return;
fi
#for sample in aRG bRG N; do
for sample in $(echo $bio_samples | tr "," " "); do
# DEBUG sample=Insm1_1103
## todo use bsub instead here
(
sampleBams=$(find $bam_dir -name '*bam' | grep -v unmapped | grep $sample)
echo "merging $sample with $sampleBams"
if [ 1 -eq $(echo "$sampleBams" | wc -l) ]; then
cp $sampleBams $sample.bam
else
## samtools >= 1.3 sort syntax: use -o instead of the old output-prefix argument
samtools merge - $(echo $sampleBams | xargs echo) | samtools sort -o $sample.bam -
fi
samtools index $sample.bam
)&
done
}
export -f dge_merge_treps
## tester
#dge_merge_treps $baseDir/mapped_trim/ "Ctrl_0703,Ctrl_1029,Ctrl_1103,Insm1_0703,Insm1_1029,Insm1_1103"
#dge_cd(){
#( ## required to work around docopt sys.exit
#usage='
#Use cuffdiff2 to perform a differential expression analysis
#Usage: dge_cuffdiff [--exclude=<regex>] <gtfFile> <bamDir> <labels>
#
#Options:
#--exclude=<regex> Exclude patterns (grep regex matching bam file paths to be excluded)
#'
#echo "$usage" | ~/bin/docopts/docopts -h - : $*
#eval "$(echo "$usage" | ~/bin/docopts/docopts -h - : $*)"
#
#)
#}
#dge_cd $gtfFilePC $baseDir/mapped $labels
#dge_cd --help
dge_cuffdiff(){
local gtfFile=$1
local bamDir=$2
local labels=$3
if [ $# -ne 3 ]; then
echo "Usage: dge_cuffdiff <gtf_file> <bam_directory> <labels>" >&2 ; return;
fi
if [ -z "$(which cuffdiff)" ]; then
>&2 echo "no cuffdiff binary in PATH"; return;
fi
if [ ! -f $gtfFile ]; then
>&2 echo "gtf '$gtfFile' does not exis"; return;
fi
## collect all bam-files and group them
allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | sort)
if [ -n "$EXCLUDE_BAMS_PATTERN" ]; then
echo "excluding bams with $EXCLUDE_BAMS_PATTERN"
allBams=$(echo "$allBams" | grep -v "$EXCLUDE_BAMS_PATTERN")
fi
## todo use optional argument here --> default "unmapped" --> allow user to add others
#allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "1029" | sort)
#allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "Ctrl_1029" | sort)
bamsSplit=""
for label in $(echo $labels | tr ", " " "); do
# DEBUG label="liver"; label="cyst"
## grep the bam names excluding the path
echo "$allBams" | xargs -n1 basename | grep $label > ${label}.cuff_bamlist
labelBams=$(echo "$allBams" | grep -Ff ${label}.cuff_bamlist | xargs echo -n | tr ' ' ',')
# echo "$allBams" | grep -Ff ${label}.bamlist | xargs -n1 echo
bamsSplit+=$labelBams" "
done
#echo $bamsSplit
## make sure (again) that all files are there
echo $gtfFile $bamsSplit | tr "," " " | xargs ls -la
#mysub ${project}_cuffdiff "cuffdiff -L aRGm,aRGp,aRG,bRG,IPC,N -o . -p 8 mm10_ensGene_ccds.gtf $amBams $apBams $aBams $bBams $cBams $dBams 2>&1 | tee cuffdiff.log" -q long -n 8 -R span[hosts=1] | blockScript
cdCmd="cuffdiff -L $labels -o . -p 10 $gtfFile $bamsSplit"
#eval $cdCmd
mysub "${project}__cuffdiff" "$cdCmd" -q long -n 4 -R span[hosts=1] | blockScript
## create a cuffdiff db using cummerbund in a temporary directory to avoid locking problems
tmpDbDir=$(mktemp -d)
cp -r . $tmpDbDir
#genome=$(echo $gtfFile | cut -f8 -d'/' | tr '[:upper:]' '[:lower:]'); echo "genome is $genome"
## make sure to use temp-r to avoid file locking problems
#export R_LIBS=/tmp/r_index
genome=$(scala -e '
val gtfFile = args(0); //val gtfFile="mm10_igenomes_pc.gtf"
val pattern = "mm10|mm9|hg19|zv9".r
println(pattern.findFirstIn(gtfFile).getOrElse(""))
' $(readlink -f $gtfFile)
)
echo $genome
echo '
require(cummeRbund)
dbDir=commandArgs(T)[1]
gtfFile=commandArgs(T)[2]
genome=commandArgs(T)[3]
## note without providing the gtf the db is much smaller
readCufflinks(dir=dbDir, rebuild=T, gtf=gtfFile, genome=genome)
' | R -q --no-save --no-restore --args "$tmpDbDir" "$gtfFile" "$genome"
if [ ! -f $tmpDbDir/cuffData.db ]; then
>&2 echo "cummerbund failed to create cuffidff sqlite db"; return;
fi
cp $tmpDbDir/cuffData.db .
rm -rf $tmpDbDir ## because it's no longer needed
}
export -f dge_cuffdiff
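## tester (example only; gtf file, bam directory and label list are placeholders)
# dge_cuffdiff $IGENOME/Annotation/Genes/genes.gtf mapped "wt,mut"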
## Create a STAR index for a given igenome
dge_create_star_index(){
if [ $# -ne 1 ]; then
echo "Usage: dge_create_star_index <igenome>" >&2 ; return;
fi
igenome=$1
if [ ! -d "$igenome" ] | [ ! -d "${igenome}/Sequence" ]; then
echo "igenome directory '$igenome' does not exist" >&2 ; return;
fi
export star_index="${igenome}/Sequence/StarIndex"
## stop if index exists already
if [ -d "$star_index" ]; then
echo "Error: Index directory ${star_index} already exists." >&2 ; return;
fi
chmod +w $(dirname ${star_index})
mailme "${project}: creating STAR index in ${star_index}"
mkdir ${star_index}
cmd="STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/WholeGenomeFasta/genome.fa --runThreadN 10"
#eval $cmd
#STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/Chromosomes/*.fa --runThreadN 10
jl submit --wait -t 5 -q medium -n "${project}_star_index" "$cmd"
## prevent further modification
chmod -w $(dirname ${star_index})
mailme "created star index for $igenome"
}
export -f dge_create_star_index
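## tester (example only; igenome path is a placeholder)
# dge_create_star_index /projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38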
dge_get_pc_isoforms(){
# todo write more generic version that also filtered provided gtf and/or allow for ccds filtering as well
if [ $# -ne 1 ]; then
echo "Usage: dge_get_pc_isoforms <hsapiens/mmusculus/other_ensembl_species_identifier>" >&2 ; return;
fi
echo '
require(biomaRt)
require(dplyr)
require(ggplot2)
mart <- useDataset(paste0(commandArgs(T)[1], "_gene_ensembl"), mart = useMart("ensembl"))
#mart <- useDataset("hsapiens_gene_ensembl", mart = useMart("ensembl"))
#mart <- useDataset("mmusculus_gene_ensembl", mart = useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org"))
pcTx <- getBM(attributes=c("ensembl_gene_id", "ensembl_transcript_id", "gene_biotype", "transcript_biotype"), mart=mart) %>%
filter(transcript_biotype=="protein_coding")
#ggplot(pcTx, aes(gene_biotype)) + geom_bar() + coord_flip()
#ggplot(pcTx, aes(transcript_biotype)) + geom_bar() + coord_flip()
#write.table(with(pcTx, data.frame(ensembl_transcript_id)), col.names=F, file="mm10_pc_tx.txt",quote=F,row.names=F)
# just print results to stdout
write.table(with(pcTx, data.frame(ensembl_transcript_id)), col.names=F, file=stdout(),quote=F,row.names=F)
' | Rscript --vanilla - $1 2>/dev/null
}
export -f dge_get_pc_isoforms
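## tester (example only): dump protein-coding transcript ids for mouse into a file
# dge_get_pc_isoforms mmusculus > mm10_pc_tx.txt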
dge_star_counts2matrix(){
echo '
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/core_commons.R")
## STAR count file format is
#column 1: gene ID
#column 2: counts for unstranded RNA-seq
#column 3: counts for the 1st read strand aligned with RNA (htseq-count option -s yes)
exprCounts <- list.files(".", "ReadsPerGene.out.tab") %>% ldply(function(countFile){
read.delim(countFile, header=F) %>%
select(V1, V2) %>%
set_names("ensembl_gene_id", "num_alignments") %>%
filter(!str_detect(ensembl_gene_id, "^N_")) %>%
mutate(sample=trim_ext(countFile, ".ReadsPerGene.out.tab"))
}, .progress="text")
countMatrix <- spread(exprCounts, sample, num_alignments)
write.delim(countMatrix, "star_counts_matrix.txt")
' | R --vanilla -q
}
export -f dge_star_counts2matrix
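## tester (example only; directory name is a placeholder): run where the STAR *ReadsPerGene.out.tab files live
# (cd star_alignments && dge_star_counts2matrix) ## writes star_counts_matrix.txt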