Newer
Older
## docs
## http://blog.joncairns.com/2013/08/what-you-need-to-know-about-bash-functions/
if [ $(hostname) == "bioinformatics-srv1" ]; then
export BIO_BIN_BASE="/local/home/brandl/bin"
else
export BIO_BIN_BASE="/projects/bioinfo/holger/bin"
fi
export PATH=${BIO_BIN_BASE}/bowtie2-2.2.5:$PATH
export PATH=${BIO_BIN_BASE}/deepTools-2.2.2/bin:$PATH
export PATH=${BIO_BIN_BASE}/FastQC_0.11.2:$PATH
export PATH=${BIO_BIN_BASE}/bedtools2-2.25.0/bin/:$PATH
export PATH=${BIO_BIN_BASE}/samtools-1.3:$PATH
export PATH=${BIO_BIN_BASE}/STAR-2.5.1b/source:$PATH
export PATH=${BIO_BIN_BASE}/ucsc:$PATH # for wigToBigWig
#export PATH=/home/brandl/bin/subread-1.4.6-p3-Linux-x86_64/bin:$PATH
## add cluster job manager
export PATH=/projects/bioinfo/tools/joblist_v0.7:$PATH
## makd sure that rend.R is present
source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.23/R/rendr/rendr_utils.sh 2>&1 2>/dev/null)
## backward compatibility wrappers for old shell cluster utils
#joblist(){
# jl add $*
#}
#export -f joblist
#
#
#wait4jobs(){
# jl wait $*
#}
#export -f wait4jobs
mcdir(){
if [ ! -d "$1" ]; then
mkdir "$1";
fi;
cd "$1";
}
export -f mcdir
mailme(){
echo "Subject:"$1 "$2" | sendmail -v $(whoami)@mpi-cbg.de > /dev/null ;
}
export -f mailme
ziprm(){
if [ $# -lt 2 ]; then echo "Usage: ziprm <tarbasename> [<file>]+"; return; fi
tarName=$(date +'%y%m%d')_"$1".tar.gz; shift
tar czf $tarName $@; rm $@;
}
export -f ziprm
## no longer needed because packages are no kept in home
#export R_LIBS=/tmp/r_index ## export to make sure that packages are load from local repository, otherwise sqlite won't work
## create fastq report for all fastq and fastq.gz files in the current directory
dge_fastqc(){
while getopts "o:" curopt; do
case $curopt in
o) outputDir=$OPTARG;
esac
done
shift $(($OPTIND - 1))
local fastqFiles=$*
#if [ -z "$fastqFiles" ]; then
if [ $# -lt 1 ]; then
echo "Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+" >&2 ; return;
fi
fi
if [ ! -d "$outputDir" ]; then
echo "creating output directory '$outputDir'"
mkdir $outputDir
fi
for fastqFile in $fastqFiles ; do
echo "fastqcing $fastqFile"
if [ ! -f $fastqFile ]; then
continue;
fi
jl submit -j .fastqc_jobs -q medium -n "fastqc__$(basename $fastqFile)" "fastqc -j ${JAVA_HOME}/bin/java -o $outputDir -f fastq $fastqFile"
rend.R ${NGS_TOOLS}/misc/fastqc_summary.R $outputDir
if [ $# -eq 0 ]; then
echo "Usage: dge_bam_correlate <bam_directory>" >&2 ;
echo "Usage: dge_bam_correlate <bam_files...>" >&2 ;
return;
if [ $# -eq 1 ]; then
bamFiles=$(find $1 | grep ".bam$" | grep -v "unmapped" | sort)
local bamLabels=$(echo "$bamFiles" | xargs -n1 basename | sed 's!.*/!!' | sed 's/_mmf.bam//g' | sed 's/_ca.bam//g' | sed 's/.bam//g' | xargs echo);
echo "Used bam labels are: $bamLabels"
## see how well bam files correlate using untrimmed data

Holger Brandl
committed
# http://deeptools.readthedocs.org/en/latest/content/tools/multiBamSummary.html
# http://deeptools.readthedocs.org/en/latest/content/tools/plotCorrelation.html?highlight=plotfile
bcCmd="
## todo add python2 ~/.local/bin/ or fix setup

Holger Brandl
committed
multiBamSummary bins --bamfiles $(echo $bamFiles | xargs echo) --labels $bamLabels -out bam_cor_matrix.txt --numberOfProcessors 4
plotCorrelation --corData bam_cor_matrix.txt --plotFile bc.pdf --corMethod spearman --zMin 0.5 --zMax 1 -p heatmap
"
#mysub "${project}__bamcorrelate" "$bcCmd" -q long -n 4 -R span[hosts=1] | blockScript .bamcorrelate
## old lsf version
#jl submit --wait --jl .bamcorrelate -q long -t 4 -n "${project}__bamcorrelate" "$bcCmd"
jl reset .bamcorrelate
jl submit --jl .bamcorrelate -w 10:00 -t 4 -n "${project}__bamcorrelate" "$bcCmd"
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
dge_bigwig(){
usage="Usage: dge_bigwig <genome_fai> [<bam_file>]+"
if [ $# -lt 2 ]; then
echo ${usage} >&2 ; return;
fi
#bamFiles=$(find . -name "*.bam" | grep -v unmapped | xargs echo)
genomeFai=$1
bamFiles="${@:2}"
if [ ! -f "${genomeFai}" ]; then
echo "could not find fai index $1! ${usage}" >&2 ; return;
fi
## add bigwig to PATH
export PATH=~/bin:${PATH}
if [ -z "$(which wigToBigWig 2>/dev/null)" ]; then
echo "could not find wigToBigWig in PATH! ${usage}" >&2 ; #return;
fi
## create big wig files
for bamFile in $bamFiles; do
sample=$(basename $bamFile .bam)
echo "converting $bamFile to bigwig format"
jl submit -j .bigwig -w 10:00 -n "${project}__bw__${sample}" "genomeCoverageBed -split -bg -ibam $bamFile -g ${genomeFai} | wigToBigWig -clip stdin ${genomeFai} ${sample}.bw"
}
export -f dge_bigwig
## http://stackoverflow.com/questions/6916856/can-bash-show-a-functions-definition
#type dge_bigwig
## Merge technical replicatews
dge_merge_treps(){
usage="Usage: dge_merge_treps <bam_directory> <comma_sep_biosample_spec>"
if [ $# -ne 2 ]; then
echo $usage >&2 ; return;
fi
local bam_dir=$1; # bam_dir=$baseDir/mapped_trim/
local bio_reps=$2; # bio_samples="test,lala"
if [ ! -d "$bam_dir" ]; then
echo "bam file directory does not exist! $usage'" >&2 ; return;
fi
if [ $(echo "$bio_samples" | grep "," | wc -l ) -ne 1 ]; then
echo "Invalid biosample spec! $usage'" >&2 ; return;
fi
#for sample in aRG bRG N; do
for sample in $(echo $bio_samples | tr ",", " "); do
# DEBUG sample=Insm1_1103
(
sampleBams=$(find $bam_dir -name '*bam' | grep -v unmapped | grep $sample)
echo "merging $sample with $sampleBams"
if [ 1 -eq $(echo "$sampleBams" | wc -l) ]; then
cp $sampleBams $sample.bam
else
samtools merge - $(echo $sampleBams | xargs echo) | samtools sort - $sample
fi
samtools index $sample.bam
)&
done
}
export -f dge_merge_treps
# Create a star index for a given igenome
dge_create_star_index(){
if [ $# -ne 1 ]; then
echo "Usage: dge_create_star_index <igenome>" >&2 ; return;
fi
igenome=$1
if [ ! -d "$igenome" ] | [ ! -d "${igenome}/Sequence" ]; then
echo "igenome directory '$igenome' does not exist" >&2 ; return;
fi
export star_index="${igenome}/Sequence/StarIndex"
## stop if index exists already
if [ -d "$star_index" ]; then
echo "Error: Index directory ${star_index} already exists." >&2 ; return;
fi
# chmod +w $(dirname ${star_index})
mailme "${project}: creating STAR index in ${star_index}"
mkdir ${star_index}
cmd="STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/WholeGenomeFasta/genome.fa --runThreadN 10"
#eval $cmd
#STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/Chromosomes/*.fa --runThreadN 10
jl submit --wait -t 5 -w 10:00 -m 30g -n "${project}_star_index" "$cmd"
# chmod -w $(dirname ${star_index})
mailme "created star index for $igenome"
}
export -f dge_create_star_index
# todo write more generic version that also filtered provided gtf and/or allow for ccds filtering as well
if [ $# -ne 1 ]; then
echo "Usage: dge_get_pc_isoforms <hsapiens/mmusculus/other_ensembl_species_identifier>" >&2 ; return;
fi
echo '
require(biomaRt)
require(dplyr)
require(ggplot2)
mart <- useDataset(paste0(commandArgs(T)[1], "_gene_ensembl"), mart = useMart("ensembl"))
#mart <- useDataset("hsapiens_gene_ensembl", mart = useMart("ensembl"))
#mart <- useDataset("mmusculus_gene_ensembl", mart = useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org"))
pcTx <- getBM(attributes=c("ensembl_gene_id", "ensembl_transcript_id", "gene_biotype", "transcript_biotype"), mart=mart) %>%
filter(transcript_biotype=="protein_coding")
#ggplot(pcTx, aes(gene_biotype)) + geom_bar() + coord_flip()
#ggplot(pcTx, aes(transcript_biotype)) + geom_bar() + coord_flip()cd
#write.table(with(pcTx, data.frame(ensembl_transcript_id)), col.names=F, file="mm10_pc_tx.txt",quote=F,row.names=F)
# just print results to stdout
write.table(with(pcTx, data.frame(ensembl_transcript_id)), col.names=F, file=stdout(),quote=F,row.names=F)
' | Rscript --vanilla - $1 2>/dev/null
}
export -f dge_get_pc_isoforms
dge_star_counts2matrix(){
echo '
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/core_commons.R")
## STAR count file format is
#column 1: gene ID
#column 2: counts for unstranded RNA-seq
#column 3: counts for the 1st read strand aligned with RNA (htseq-count option -s yes)
exprCounts <- list.files(".", "ReadsPerGene.out.tab") %>% ldply(function(countFile){
read.delim(countFile, header=F) %>%
select(V1, V2) %>%
set_names("ensembl_gene_id", "num_alignments") %>%
filter(!str_detect(ensembl_gene_id, "^N_")) %>%
mutate(sample=trim_ext(countFile, ".ReadsPerGene.out.tab"))
}, .progress="text")
countMatrix <- spread(exprCounts, sample, num_alignments)
write.delim(countMatrix, "star_counts_matrix.txt")
' | R --vanilla -q
}
export -f dge_star_counts2matrix