## docs ## http://blog.joncairns.com/2013/08/what-you-need-to-know-about-bash-functions/ ## load lsf helpers #source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.20/bash/lsf_utils.sh 2>&1 2>/dev/null) ## enable snippet spinning source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.22/R/spinr/spin_utils.sh 2>&1 2>/dev/null) export PATH=/projects/bioinfo/holger/bin//spinr:$PATH source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.22/R/rendr/rendr_utils.sh 2>&1 2>/dev/null) export PATH=/projects/bioinfo/holger/bin//rendr:$PATH ## define common binaries ## todo tweak this to work on bioinfo as well export PATH=/projects/bioinfo/holger/bin/bowtie2-2.2.5:$PATH export PATH=/projects/bioinfo/holger/bin/tophat-2.0.13.Linux_x86_64:$PATH export PATH=/home/brandl/bin/cufflinks-2.2.1.Linux_x86_64:$PATH export PATH=/sw/apps/python/current/bin:$PATH export PATH=/projects/bioinfo/holger/bin/deepTools/bin:$PATH export PATH=/projects/bioinfo/holger/bin/FastQC_0.11.2:$PATH export PATH=/projects/bioinfo/holger/bin/bedtools2-2.25.0/bin/:$PATH export PATH=/projects/bioinfo/holger/bin/samtools-1.3:$PATH export PATH=/home/brandl/bin/subread-1.4.6-p3-Linux-x86_64/bin:$PATH #export PATH=/home/brandl/bin/STAR/STAR-STAR_2.4.1d/source:$PATH export PATH=/home/brandl/bin/STAR-2.5.0b/source:$PATH ## add cluster job manager export PATH=/projects/bioinfo/scripts/joblist_v0.5:$PATH ## backward compatibility wrappers for old shell cluster utils joblist(){ jl add $* } export -f joblist wait4jobs(){ jl wait $* } export -f wait4jobs ## also add spinr export PATH=$PATH:/projects/bioinfo/scripts/spinr # which tophat; which bowtie2; which cuffdiff mcdir(){ if [ ! -d "$1" ]; then mkdir "$1"; fi; cd "$1"; } export -f mcdir mailme(){ echo "Subject:"$1 "$2" | sendmail -v $(whoami)@mpi-cbg.de > /dev/null ; } export -f mailme ziprm(){ if [ $# -lt 2 ]; then echo "Usage: ziprm <tarbasename> [<file>]+"; return; fi tarName=$(date +'%y%m%d')_"$1".tar.gz; shift tar czf $tarName $@; rm $@; } export -f ziprm ## no longer needed because packages are no kept in home #export R_LIBS=/tmp/r_index ## export to make sure that packages are load from local repository, otherwise sqlite won't work ## create fastq report for all fastq and fastq.gz files in the current directory dge_fastqc(){ while getopts "o:" curopt; do case $curopt in o) outputDir=$OPTARG; esac done shift $(($OPTIND - 1)) local fastqFiles=$* #if [ -z "$fastqFiles" ]; then if [ $# -lt 1 ]; then echo "Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+" >&2 ; return; fi ## use current directory if not specified if [ -z "$outputDir" ]; then outputDir="fastqc" fi if [ ! -d "$outputDir" ]; then echo "creating output directory '$outputDir'" mkdir $outputDir fi for fastqFile in $fastqFiles ; do echo "fastqcing $fastqFile" if [ ! -f $fastqFile ]; then continue; fi jl submit -j .fastqc_jobs -q medium -n "fastqc__$(basename $fastqFile)" "fastqc -j /sw/bin/java -o $outputDir -f fastq $fastqFile" done jl wait --report .fastqc_jobs rend.R ${NGS_TOOLS}/dge_workflow/fastqc_summary.R $outputDir mailme "$project: fastqc done in $(pwd)" } export -f dge_fastqc dge_cutadapt(){ export Ill_ADAPTERS=/projects/bioinfo/common/cutadapt_patterns.fasta for fastqFile in $* ; do # DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/treps_pooled/mouse_liver_polar_stage1_rep4.fastq.gz caFastq=$(basename $fastqFile .fastq.gz)_ca.fastq.gz echo "cutadapting $caFastq into $caFastq" #todo use a more specific trimming model (trim just correct part on each side without using reverse complements jl submit -j .cajobs -q long -n "${project}__ca__$(basename $fastqFile .fastq.gz)" "cutadapt -b file:$Ill_ADAPTERS -m 20 -q 25 -o $caFastq $fastqFile" done jl wait --report --email .cajobs spin.R ${NGS_TOOLS}/dge_workflow/cutadapt_summary.R . ## todo do a small report here about what has been trimmed away and why } export -f dge_cutadapt #http://wiki.bash-hackers.org/howto/getopts_tutorial dge_tophat_se(){ # http://stackoverflow.com/questions/18414054/bash-getopts-reading-optarg-for-optional-flags #echo $* while getopts "i:" curopt; do case $curopt in i) IGENOME="$OPTARG"; esac done shift $(($OPTIND - 1)) local fastqFiles=$* # IGENOME=/projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38 if [ -z "$IGENOME" ] || [ -z "$fastqFiles" ]; then echo "Usage: dge_tophat_se -i <path to igenome> [<fastq.gz file>]+" >&2 ; return; fi export bowtie_gindex="$IGENOME/Sequence/Bowtie2Index/genome" export gtfFile="$IGENOME/Annotation/Genes/genes.gtf" #head $gtfFile if [ ! -f $gtfFile ]; then >&2 echo "gtf '$gtfFile' does not exis"; return; fi if [ -z "$(which tophat)" ]; then >&2 echo "no tomcat binary in PATH"; return; fi echo "running tophat using igenome '$IGENOME' for the following files" ll $fastqFiles #fastqFiles=$(ls $baseDir/treps_pooled/*fastq.gz) for fastqFile in $fastqFiles ; do echo "submitting tophat job for $fastqFile" # DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/trimmed/mouse_big_cyst_rep4_ca.fastq.gz fastqBaseName=$(basename ${fastqFile%%.fastq.gz}) outputdir=$fastqBaseName ## uniquely mapping reads only: http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464 ### tophat -p6 -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile tophatCmd=" tophat -p6 -G $gtfFile -g1 -o $outputdir $bowtie_gindex $fastqFile mv $outputdir/accepted_hits.bam $outputdir/$(basename $outputdir).bam samtools index $outputdir/$(basename $outputdir).bam " jl submit -j .tophatjobs -t 5 -q medium -n "${project}__tophat__${fastqBaseName}" "${tophatCmd}" done jl wait --report .tophatjobs ## see https://github.com/fidelram/deepTools/wiki/QC#bamCorrelate dge_bam_correlate . & ## create tophat mapping report spin.R ${NGS_TOOLS}/dge_workflow/tophat_qc.R . mailme "$project: tophat done in $(pwd)" } export -f dge_tophat_se dge_bam_correlate(){ if [ $# -eq 0 ]; then echo "Usage: dge_bam_correlate <bam_directory>" >&2 ; echo "Usage: dge_bam_correlate <bam_files...>" >&2 ; return; fi if [ $# -eq 1 ]; then bamFiles=$(find $1 | grep ".bam$" | grep -v "unmapped" | sort) else bamFiles=$* fi local bamLabels=$(echo "$bamFiles" | xargs -n1 basename | sed 's!.*/!!' | sed 's/_mmf.bam//g' | sed 's/_ca.bam//g' | sed 's/.bam//g' | xargs echo); echo "Used bam labels are: $bamLabels" ## see how well bam files correlate using untrimmed data bcCmd="bamCorrelate bins --bamfiles $(echo $bamFiles | xargs echo) --labels $bamLabels --plotFile=bc.pdf --outFileCorMatrix=bc.txt --numberOfProcessors=4 --corMethod spearman --zMin 0.5 --zMax 1" #echo "cmd is $bcCmd" #mysub "${project}__bamcorrelate" "$bcCmd" -q long -n 4 -R span[hosts=1] | blockScript .bamcorrelate jl submit --wait --jl .bamcorrelate -q long -t 4 -n "${project}__bamcorrelate" "$bcCmd" } export -f dge_bam_correlate dge_bigwig(){ usage="Usage: dge_bigwig <genome_fai> [<bam_file>]+" if [ $# -lt 2 ]; then echo ${usage} >&2 ; return; fi #bamFiles=$(find . -name "*.bam" | grep -v unmapped | xargs echo) genomeFai=$1 bamFiles="${@:2}" if [ ! -f "${genomeFai}" ]; then echo "could not find fai index $1! ${usage}" >&2 ; return; fi ## add bigwig to PATH export PATH=~/bin:${PATH} if [ -z "$(which wigToBigWig 2>/dev/null)" ]; then echo "could not find wigToBigWig in PATH! ${usage}" >&2 ; #return; fi ## create big wig files for bamFile in $bamFiles; do sample=$(basename $bamFile .bam) echo "converting $bamFile to bigwig format" jl submit -j .bigwig -q short -n "${project}__bw__${sample}" "genomeCoverageBed -split -bg -ibam $bamFile -g ${genomeFai} | wigToBigWig -clip stdin ${genomeFai} ${sample}.bw" done jl wait --report .bigwig } export -f dge_bigwig ## http://stackoverflow.com/questions/6916856/can-bash-show-a-functions-definition #type dge_bigwig ## Merge technical replicatews dge_merge_treps(){ usage="Usage: dge_merge_treps <bam_directory> <comma_sep_biosample_spec>" if [ $# -ne 2 ]; then echo $usage >&2 ; return; fi local bam_dir=$1; # bam_dir=$baseDir/mapped_trim/ local bio_reps=$2; # bio_samples="test,lala" if [ ! -d "$bam_dir" ]; then echo "bam file directory does not exist! $usage'" >&2 ; return; fi if [ $(echo "$bio_samples" | grep "," | wc -l ) -ne 1 ]; then echo "Invalid biosample spec! $usage'" >&2 ; return; fi #for sample in aRG bRG N; do for sample in $(echo $bio_samples | tr ",", " "); do # DEBUG sample=Insm1_1103 ## todo use bsub instead here ( sampleBams=$(find $bam_dir -name '*bam' | grep -v unmapped | grep $sample) echo "merging $sample with $sampleBams" ## todo add read-groups to bam files if [ 1 -eq $(echo "$sampleBams" | wc -l) ]; then cp $sampleBams $sample.bam else samtools merge - $(echo $sampleBams | xargs echo) | samtools sort - $sample fi samtools index $sample.bam )& done } export -f dge_merge_treps ## tester #dge_merge_treps $baseDir/mapped_trim/ "Ctrl_0703,Ctrl_1029,Ctrl_1103,Insm1_0703,Insm1_1029,Insm1_1103" #dge_cd(){ #( ## required to work around docopt sys.exit #usage=' #Use cuffdiff2 to performa a differntial expression analysis #Usage: dge_cuffdiff [--exclude=<regex>] <gtfFile> <bamDir> <labels> # #Options: #--exclude=<regex> Exclude patterns (grep regex matching bam file paths to be excluded) #' #echo "$usage" | ~/bin/docopts/docopts -h - : $* #eval "$(echo "$usage" | ~/bin/docopts/docopts -h - : $*)" # #) #} #dge_cd $gtfFilePC $baseDir/mapped $labels #dge_cd --help dge_cuffdiff(){ local gtfFile=$1 local bamDir=$2 local labels=$3 if [ $# -ne 3 ]; then echo "Usage: dge_cuffdiff <gtf_file> <bam_directory> <labels>" >&2 ; return; fi if [ -z "$(which cuffdiff)" ]; then >&2 echo "no cuffdiff binary in PATH"; return; fi if [ ! -f $gtfFile ]; then >&2 echo "gtf '$gtfFile' does not exis"; return; fi ## collect all bam-files and group them allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | sort) if [ -n "$EXCLUDE_BAMS_PATTERN" ]; then echo "excluding bams with $EXCLUDE_BAMS_PATTERN" allBams=$(echo "$allBams" | grep -v "$EXCLUDE_BAMS_PATTERN") fi ## todo use optional argument here --> default "unmapped" --> allow user to add others #allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "1029" | sort) #allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | grep -v "Ctrl_1029" | sort) bamsSplit="" for label in $(echo $labels | tr ", " " "); do # DEBUG label="liver"; label="cyst" echo $label ## grep the bam names excluding the path echo "$allBams" | xargs -n1 basename | grep $label > ${label}.cuff_bamlist labelBams=$(echo "$allBams" | grep -Ff ${label}.cuff_bamlist | xargs echo -n | tr ' ' ',') # echo "$allBams" | grep -Ff ${label}.bamlist | xargs -n1 echo bamsSplit+=$labelBams" " done #echo $bamsSplit ## make sure (again that all files are there echo $gtfFile $bamsSplit | tr "," " " | xargs ls -la #mysub ${project}_cuffdiff "cuffdiff -L aRGm,aRGp,aRG,bRG,IPC,N -o . -p 8 mm10_ensGene_ccds.gtf $amBams $apBams $aBams $bBams $cBams $dBams 2>&1 | tee cuffdiff.log" -q long -n 8 -R span[hosts=1] | blockScript cdCmd="cuffdiff -L $labels -o . -p 10 $gtfFile $bamsSplit" echo "cuffdiff cmd is: $cdCmd" #eval $cdCmd jl submit --wait -t 4 -q long -n "${project}__cuffdiff" "$cdCmd" ## create a cuffdiff db using cummerbund in a temporary directory to avoid locking problems tmpDbDir=$(mktemp -d) cp -r . $tmpDbDir #genome=$(echo $gtfFile | cut -f8 -d'/' | tr '[:upper:]' '[:lower:]'); echo "genome is $genome" ## make sure to use temp-r to avoid file locking problems #export R_LIBS=/tmp/r_index genome=$(scala -e ' val gtfFile = args(0); //val gtfFile="mm10_igenomes_pc.gtf" val pattern = "mm10|mm9|hg19|zv9".r println(pattern.findFirstIn(gtfFile).getOrElse("")) ' $(readlink -f $gtfFile) ) echo $genome echo ' require(cummeRbund) dbDir=commandArgs(T)[1] gtfFile=commandArgs(T)[2] genome=commandArgs(T)[3] ## note without providing the gtf the db is much smaller readCufflinks(dir=dbDir, rebuild=T, gtf=gtfFile, genome=genome) ' | R -q --no-save --no-restore --args "$tmpDbDir" "$gtfFile" "$genome" if [ ! -f $tmpDbDir/cuffData.db ]; then >&2 echo "cummerbund failed to create cuffidff sqlite db"; return; fi cp $tmpDbDir/cuffData.db . rm -rf $tmpDbDir ## because it's no longer needed mailme "$project: cuffdiff done in $(pwd)" } export -f dge_cuffdiff # Create a star index for a given igenome dge_create_star_index(){ if [ $# -ne 1 ]; then echo "Usage: dge_create_star_index <igenome>" >&2 ; return; fi igenome=$1 if [ ! -d "$igenome" ] | [ ! -d "${igenome}/Sequence" ]; then echo "igenome directory '$igenome' does not exist" >&2 ; return; fi export star_index="${igenome}/Sequence/StarIndex" ## stop if index exists already if [ -d "$star_index" ]; then echo "Error: Index directory ${star_index} already exists." >&2 ; return; fi # chmod +w $(dirname ${star_index}) mailme "${project}: creating STAR index in ${star_index}" mkdir ${star_index} cmd="STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/WholeGenomeFasta/genome.fa --runThreadN 10" #eval $cmd #STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/Chromosomes/*.fa --runThreadN 10 jl submit --wait -t 5 -q medium -n "${project}_star_index" "$cmd" ## prevent further modification # chmod -w $(dirname ${star_index}) mailme "created star index for $igenome" } export -f dge_create_star_index dge_get_pc_isoforms(){ # todo write more generic version that also filtered provided gtf and/or allow for ccds filtering as well if [ $# -ne 1 ]; then echo "Usage: dge_get_pc_isoforms <hsapiens/mmusculus/other_ensembl_species_identifier>" >&2 ; return; fi echo ' require(biomaRt) require(dplyr) require(ggplot2) mart <- useDataset(paste0(commandArgs(T)[1], "_gene_ensembl"), mart = useMart("ensembl")) #mart <- useDataset("hsapiens_gene_ensembl", mart = useMart("ensembl")) #mart <- useDataset("mmusculus_gene_ensembl", mart = useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org")) pcTx <- getBM(attributes=c("ensembl_gene_id", "ensembl_transcript_id", "gene_biotype", "transcript_biotype"), mart=mart) %>% filter(transcript_biotype=="protein_coding") #ggplot(pcTx, aes(gene_biotype)) + geom_bar() + coord_flip() #ggplot(pcTx, aes(transcript_biotype)) + geom_bar() + coord_flip()cd #write.table(with(pcTx, data.frame(ensembl_transcript_id)), col.names=F, file="mm10_pc_tx.txt",quote=F,row.names=F) # just print results to stdout write.table(with(pcTx, data.frame(ensembl_transcript_id)), col.names=F, file=stdout(),quote=F,row.names=F) ' | Rscript --vanilla - $1 2>/dev/null } export -f dge_get_pc_isoforms dge_star_counts2matrix(){ echo ' devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/core_commons.R") ## STAR count file format is #column 1: gene ID #column 2: counts for unstranded RNA-seq #column 3: counts for the 1st read strand aligned with RNA (htseq-count option -s yes) exprCounts <- list.files(".", "ReadsPerGene.out.tab") %>% ldply(function(countFile){ read.delim(countFile, header=F) %>% select(V1, V2) %>% set_names("ensembl_gene_id", "num_alignments") %>% filter(!str_detect(ensembl_gene_id, "^N_")) %>% mutate(sample=trim_ext(countFile, ".ReadsPerGene.out.tab")) }, .progress="text") countMatrix <- spread(exprCounts, sample, num_alignments) write.delim(countMatrix, "star_counts_matrix.txt") ' | R --vanilla -q } export -f dge_star_counts2matrix