## dge_utils.sh
## docs
## http://blog.joncairns.com/2013/08/what-you-need-to-know-about-bash-functions/

Holger Brandl's avatar
Holger Brandl committed
4
## define common binaries
5 6
# Resolve BIO_BIN_BASE (base directory of the tool installations used below) from the
# host name. On any other host the caller has to provide BIO_BIN_BASE; otherwise the
# current shell is terminated with status 1.
case "$(hostname)" in
    bioinformatics-srv1)
        export BIO_BIN_BASE="/local/home/brandl/bin"
        ;;
    *-mac*)
        export BIO_BIN_BASE="${HOME}/bin"
        ;;
    falcon1)
        export BIO_BIN_BASE="/projects/bioinfo/brandl/bin"
        ;;
    *)
        # quoted test: an unset/empty value is detected explicitly instead of by accident
        if [ -z "${BIO_BIN_BASE:-}" ]; then echo "BIO_BIN_BASE is not set" >&2; exit 1; fi
        ;;
esac

Holger Brandl's avatar
Holger Brandl committed
15

16 17 18 19 20
## Put an existing directory in front of PATH (despite the name, the entry is prepended).
## Arguments: $1 - directory to add
## Exits the current shell with status 1 if $1 is empty or not a directory.
append2path(){
    # quoted on purpose: an empty or unset argument has to fail this check instead of
    # silently adding an empty entry (= the current directory) to PATH
    if [ ! -d "${1:-}" ]; then
        echo "can not append non-existing path '${1:-}' to PATH" >&2
        exit 1
    fi

    export PATH="$1:$PATH"
}
export -f append2path
25

Holger Brandl's avatar
Holger Brandl committed
26 27
## register the tool installations below BIO_BIN_BASE
## (append2path puts each directory in front of PATH, so later entries take precedence)
append2path "${BIO_BIN_BASE}/bowtie-1.2.1.1"
append2path "${BIO_BIN_BASE}/bowtie2-2.3.3.1"
append2path "${BIO_BIN_BASE}/FastQC_0.11.2"
append2path "${BIO_BIN_BASE}/bedtools2-2.25.0/bin"
append2path "${BIO_BIN_BASE}/samtools-1.5"
append2path "${BIO_BIN_BASE}/STAR-2.5.2b/source"
append2path "${BIO_BIN_BASE}/kallisto-v0.43.1"
#append2path ${BIO_BIN_BASE}/appify
append2path "${BIO_BIN_BASE}/appify2"

# todo this is not an actual tagged release and should be appended differently (if at all)
append2path "${BIO_BIN_BASE}/ucsc"

# NOTE(review): hard-codes /home/<user>; confirm whether ${HOME} is meant on hosts where
# the home directory lives elsewhere
export PATH="/home/$(whoami)/local_bin/R-3.4.0/bin:$PATH"

## Fixme use BIO_BIN_BASE for deeptools
## pip3 install --user deeptools
#export PATH=${BIO_BIN_BASE}/deepTools-2.2.2/bin:$PATH
export PATH="/home/$(whoami)/.local/bin:$PATH"
#export PATH=/home/brandl/bin/subread-1.4.6-p3-Linux-x86_64/bin:$PATH

## add cluster job manager
#export PATH=/projects/bioinfo/tools/joblist_v0.7:$PATH
append2path "${BIO_BIN_BASE}/joblist_v0.7.1"
51

52
## make sure that rend.R is present
## NOTE(review): both lines execute shell code fetched over the network without any
## integrity check -- consider vendoring the scripts or verifying a checksum.
## curl -f: on HTTP errors emit nothing instead of the server's error page (which would
## otherwise be sourced as shell code); -s: no progress meter. Errors stay silenced, as
## before, so that sourcing this file offline remains quiet.
source <(curl -fs https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.50/tools/rendr/rendr_utils.sh 2>/dev/null)
source <(curl -fs https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v10/common/bash_utils.sh 2>/dev/null)
55

56

57 58 59 60 61 62 63 64 65
## Change into directory $1; the directory is created first when it does not exist yet.
## Returns the status of the final cd.
mcdir(){
    [ -d "$1" ] || mkdir "$1"

    cd "$1"
}
export -f mcdir

66

Holger Brandl's avatar
Holger Brandl committed
67 68 69 70 71 72
## remove empty and whitespace-only lines from stdin, writing the rest to stdout
## see http://stackoverflow.com/questions/16414410/delete-empty-lines-using-sed
trim(){
    # [[:space:]] is the POSIX spelling of GNU sed's \s, so this also works with
    # BSD/macOS sed; sed reads stdin directly, no extra cat needed
    sed '/^[[:space:]]*$/d'
}

Holger Brandl's avatar
Holger Brandl committed
73 74 75 76 77
## Send a notification mail to <current user>@mpi-cbg.de via sendmail.
## Arguments: $1 - subject text
##            $2 - optional text appended to the subject line after a blank
mailme(){
    # printf with quoted arguments passes the text on verbatim; the former unquoted $1
    # collapsed runs of whitespace and expanded glob characters such as '*'
    printf 'Subject:%s %s\n' "${1:-}" "${2:-}" | sendmail -v "$(whoami)@mpi-cbg.de" > /dev/null
}
export -f mailme

78

79 80 81 82 83 84 85 86 87
## Pack files into <yymmdd>_<tarbasename>.tar.gz and delete the originals afterwards.
## Arguments: $1  - base name of the archive
##            $2… - files to archive
## Returns: non-zero on usage errors or when tar fails; in the latter case nothing is
## deleted.
ziprm(){
    if [ $# -lt 2 ]; then echo "Usage: ziprm <tarbasename> [<file>]+" >&2; return 1; fi

    local tarName
    tarName="$(date +'%y%m%d')_$1.tar.gz"; shift

    # remove the inputs only after they have been archived successfully
    tar czf "$tarName" -- "$@" && rm -- "$@"
}
export -f ziprm


Holger Brandl's avatar
Holger Brandl committed
88

Holger Brandl's avatar
Holger Brandl committed
89 90
## no longer needed because packages are now kept in home
#export R_LIBS=/tmp/r_index ## export to make sure that packages are load from local repository, otherwise sqlite won't work
Holger Brandl's avatar
Holger Brandl committed
91

92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109

## Run FastQC on the given fastq/fastq.gz files (one jl job per file), wait for the jobs
## and render the summary report with rend.R.
## Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+
##   -o  directory receiving the FastQC results (default: "fastqc"); created if missing
## Arguments that are not regular files are skipped.
## Returns: 1 on usage errors, otherwise the status of the final rend.R call.
dge_fastqc(){
    # OPTIND has to be local and reset, otherwise getopts continues where the previous
    # invocation in the same shell stopped; outputDir is local so that an earlier -o
    # value does not leak into later calls
    local OPTIND=1 curopt outputDir="" fastqFile

    while getopts "o:" curopt; do
        case $curopt in
            o) outputDir=$OPTARG ;;
            *) echo "Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+" >&2 ; return 1 ;;
        esac
    done
    shift $((OPTIND - 1))

    if [ $# -lt 1 ]; then
        echo "Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+" >&2 ; return 1;
    fi

    ## use "fastqc" if no output directory was specified
    if [ -z "$outputDir" ]; then
        outputDir="fastqc"
    fi

    if [ ! -d "$outputDir" ]; then
        echo "creating output directory '$outputDir'"
        mkdir "$outputDir" || return
    fi

    for fastqFile in "$@"; do
        echo "fastqcing $fastqFile"

        if [ ! -f "$fastqFile" ]; then
            continue;
        fi

#        jl submit -j .fastqc_jobs -n "fastqc__$(basename $fastqFile)" "fastqc -j ${JAVA_HOME}/bin/java -o $outputDir -f fastq $fastqFile"
        jl submit -j .fastqc_jobs -n "fastqc__$(basename "$fastqFile")" "fastqc -o $outputDir -f fastq $fastqFile"
    done

    jl wait --email --report .fastqc_jobs

    rend.R -e "${NGS_TOOLS}/misc/fastqc_summary.R" "$outputDir"

    #mailme "$project: fastqc done in $(pwd)"
}
export -f dge_fastqc


Holger Brandl's avatar
Holger Brandl committed
141 142
## Correlate bam files with deepTools (multiBamSummary + plotCorrelation); both commands
## are submitted as a single jl job on the .bamcorrelate joblist.
## Usage: dge_bam_correlate <bam_directory>   (all *.bam below it, except "unmapped" ones)
##        dge_bam_correlate <bam_files...>
## Output files named in the submitted command: bin_data.npz, bc.pdf, bc_cor_matrix.tsv
## Note: paths containing whitespace are not supported.
dge_bam_correlate(){
    local bamFiles bamFile bamLabel bamLabels="" bcCmd

    if [ $# -eq 0 ]; then
        echo "Usage: dge_bam_correlate <bam_directory>" >&2 ;
        echo "Usage: dge_bam_correlate <bam_files...>" >&2 ;
        return 1;
    fi

    if [ $# -eq 1 ]; then
        # escaped dot: only names really ending in ".bam" qualify
        bamFiles=$(find "$1" | grep '\.bam$' | grep -v "unmapped" | sort)
    else
        bamFiles=$*
    fi

    if [ -z "$bamFiles" ]; then
        echo "no bam files found" >&2 ; return 1;
    fi

    # Labels are the file names without directory and without a trailing
    # _mmf.bam / _ca.bam / .bam. Only literal suffixes are stripped; the former
    # unanchored sed patterns also removed e.g. "_bam" from the middle of a name.
    for bamFile in $bamFiles; do    # word splitting of the file list is intended
        bamLabel=${bamFile##*/}
        bamLabel=${bamLabel%_mmf.bam}
        bamLabel=${bamLabel%_ca.bam}
        bamLabel=${bamLabel%.bam}
        bamLabels="${bamLabels:+$bamLabels }$bamLabel"
    done
    echo "Used bam labels are: $bamLabels"

    ## see how well bam files correlate using untrimmed data
    # http://deeptools.readthedocs.org/en/latest/content/tools/multiBamSummary.html
    # http://deeptools.readthedocs.org/en/latest/content/tools/plotCorrelation.html?highlight=plotfile
    # $(echo $bamFiles) flattens the (possibly newline separated) list onto one line
    bcCmd="
## todo add python2 ~/.local/bin/ or fix setup
multiBamSummary bins --bamfiles $(echo $bamFiles) --labels $bamLabels -out bin_data.npz --numberOfProcessors 4
plotCorrelation --corData bin_data.npz --plotFile bc.pdf --corMethod spearman --zMin 0.5 --zMax 1 -p heatmap --outFileCorMatrix bc_cor_matrix.tsv
"

    jl reset .bamcorrelate
    jl submit --jl .bamcorrelate --wait -w 10:00 -t 4 -n "${PRJ_NAME}__bamcorrelate" "$bcCmd"
}
export -f dge_bam_correlate

176 177 178 179
## see https://git.mpi-cbg.de/bioinfo/ngs_tools/issues/11
## Integrate tools to assess gc/3/5 bias, insert size, and capture efficiency (see "Multi-perspective quality control of Illumina RNA sequencing data analysis")
## e.g. using http://deeptools.readthedocs.io/en/latest/content/tools/computeGCBias.html?highlight=bias

Holger Brandl's avatar
Holger Brandl committed
180

Holger Brandl's avatar
Holger Brandl committed
181 182 183 184 185 186 187 188 189 190 191 192 193
## Convert bam files into bigwig coverage tracks with genomeCoverageBed | wigToBigWig,
## one jl job per bam file on the .bigwig joblist (which is reset first). Each track is
## written to <sample>.bw, where <sample> is the bam file name without directory and
## ".bam".
## Usage: dge_bigwig <genome_fai> [<bam_file>]+
## Returns: 1 on usage errors, otherwise the status of the final `jl wait`.
dge_bigwig(){
    local usage="Usage: dge_bigwig <genome_fai> [<bam_file>]+"
    local genomeFai bamFile sample

    if [ $# -lt 2 ]; then
        # quoted: the usage text contains the glob characters [ ]
        echo "${usage}" >&2 ; return 1;
    fi

    #bamFiles=$(find . -name "*.bam" | grep -v unmapped |  xargs echo)
    genomeFai=$1

    if [ ! -f "${genomeFai}" ] || [[ "${genomeFai}" != *.fai ]]; then
        echo "Could not find fai index $1!
    ${usage}" >&2 ; return 1;
    fi

    # warning only: the function continues and still submits the jobs
    if ! command -v wigToBigWig > /dev/null 2>&1; then
        echo "Could not find wigToBigWig in PATH!
    ${usage}" >&2 ; #return;
    fi

    jl reset .bigwig

    ## create big wig files
    for bamFile in "${@:2}"; do
        sample=$(basename "$bamFile" .bam)
        echo "converting $bamFile to bigwig format"

        jl submit -j .bigwig -w 10:00 -m 50g -n "${PRJ_NAME}__bw__${sample}" "genomeCoverageBed -split -bg -ibam $bamFile  -g ${genomeFai} |  wigToBigWig -clip stdin ${genomeFai} ${sample}.bw"
    done

    jl wait --report .bigwig
}
export -f dge_bigwig

220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241

## Create bigwig coverage tracks with deepTools bamCoverage (bin size 10), one jl job
## per bam file on the .bigwig joblist. Each track is written to <sample>.bw, where
## <sample> is the bam file name without directory and ".bam".
## Usage: dge_bigwigs [<bam_file>]+
## Returns: 1 on usage errors, otherwise the status of the final `jl wait`.
dge_bigwigs(){
    local usage="Usage: dge_bigwigs [<bam_file>]+"
    local bamFile sample

    if [ $# -lt 1 ]; then
        # quoted: the usage text contains the glob characters [ ]
        echo "${usage}" >&2 ; return 1;
    fi

    #bamFiles=$(find . -name "*.bam" | grep -v unmapped |  xargs echo)

    # warning only: the function continues and still submits the jobs
    if ! command -v bamCoverage > /dev/null 2>&1; then
        echo "could not find bamCoverage in PATH! ${usage}" >&2 ; #return;
    fi

    ## create big wig files
    for bamFile in "$@"; do
        sample=$(basename "$bamFile" .bam)
        echo "converting $bamFile to bigwig format"

        jl submit -j .bigwig -w 10:00 -n "${PRJ_NAME}__bw__${sample}" "bamCoverage --bam ${bamFile} --binSize 10 -p 1 -o ${sample}.bw"
    done

    jl wait --report .bigwig
}
export -f dge_bigwigs

Holger Brandl's avatar
Holger Brandl committed
250 251 252 253
## http://stackoverflow.com/questions/6916856/can-bash-show-a-functions-definition
#type dge_bigwig


254 255 256
## Merge technical replicates
## For every biosample of the comma separated spec, merge all bam files in
## <bam_directory> whose name starts with the biosample name into a sorted and indexed
## <biosample>.bam in the working directory (one jl job per biosample on the
## .merge_reps joblist).
## Usage: dge_merge_treps <bam_directory> <comma_sep_biosample_spec>
## Returns: 1 on usage errors or if a biosample has no bam files (the others are still
## merged), otherwise the status of `jl wait`.
dge_merge_treps(){
    local usage="Usage: dge_merge_treps <bam_directory> <comma_sep_biosample_spec>"

    if [ $# -ne 2 ]; then
        echo "$usage" >&2 ; return 1;
    fi

    local bam_dir=$1; # bam_dir=$baseDir/mapped_trim/
    local bio_samples=$2; # bio_samples="test,lala"
    local condition bamFile bamFiles status=0

    if [ ! -d "$bam_dir" ]; then
        echo "bam file directory does not exist! $usage" >&2 ; return 1;
    fi

    # the spec has to contain a comma, i.e. name at least two biosamples
    if [[ "${bio_samples}" != *,* ]]; then
        echo "Invalid biosample spec! $usage" >&2 ; return 1;
    fi

    for condition in ${bio_samples//,/ }; do    # word splitting intended
        echo "merging ${condition}"

        # collect the replicate bams with a glob instead of parsing ls output
        bamFiles=""
        for bamFile in "${bam_dir}/${condition}"*.bam; do
            [ -e "$bamFile" ] || continue    # an unmatched glob stays literal
            bamFiles="${bamFiles:+$bamFiles }$bamFile"
        done

        # do not submit a merge without inputs
        if [ -z "$bamFiles" ]; then
            echo "no bam files found for '${condition}' in '${bam_dir}'" >&2
            status=1
            continue
        fi

        jl submit --jl .merge_reps "
    samtools merge --threads 10 - ${bamFiles} | samtools sort -T ${condition} -o ${condition}.bam -
    samtools index ${condition}.bam
    "
    done

    jl wait .merge_reps || status=$?
    return "$status"
}
export -f dge_merge_treps

288 289 290 291 292 293 294
# Create a star index for a given igenome
## Usage: dge_create_star_index <igenome>
##   <igenome>  igenome directory; its Sequence/WholeGenomeFasta/genome.fa is indexed
## The index directory <igenome>/Sequence/StarIndex is created and exported as
## star_index; the STAR run is submitted with `jl submit --wait`. A mail is sent via
## mailme before and after the run.
## Returns: 1 on usage errors, if the igenome directory is incomplete or if the index
## directory already exists.
dge_create_star_index(){

    if [ $# -ne 1 ]; then
        echo "Usage: dge_create_star_index <igenome>" >&2 ; return 1;
    fi

    #igenome="Pan_troglodytes/Ensembl_81/CHIMP2.1.4"
    local igenome=$1
    local cmd

    # '||' instead of the former '|': a pipe only evaluates the status of its last command
    if [ ! -d "$igenome" ] || [ ! -d "${igenome}/Sequence" ]; then
        echo "igenome directory '$igenome' does not exist" >&2 ; return 1;
    fi

    export star_index="${igenome}/Sequence/StarIndex"

    ## stop if index exists already
    if [ -d "$star_index" ]; then
        echo "Error: Index directory ${star_index} already exists." >&2 ; return 1;
    fi

#    chmod +w $(dirname ${star_index})

    mailme "${PRJ_NAME}: creating STAR index in ${star_index}"
    mkdir "${star_index}" || return

    cmd="STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/WholeGenomeFasta/genome.fa --runThreadN 5"
    #STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/Chromosomes/*.fa --runThreadN 10
    jl submit --wait -t 5 -w 10:00 -m 50g -n "${PRJ_NAME}_star_index" "$cmd"

    ## prevent further modification
#    chmod -w $(dirname ${star_index})

    mailme "created star index for $igenome"
}
export -f dge_create_star_index
Holger Brandl's avatar
Holger Brandl committed
326 327


328
## Print the Ensembl transcript ids of all protein-coding transcripts of a species to
## stdout (one per line), queried with the R package biomaRt.
## Usage: dge_get_pc_isoforms <hsapiens/mmusculus/other_ensembl_species_identifier>
## Note: stderr of the R process is discarded.
# todo write more generic version that also filtered provided gtf and/or allow for ccds filtering as well
dge_get_pc_isoforms(){
    if [ $# -ne 1 ]; then
        echo "Usage: dge_get_pc_isoforms <hsapiens/mmusculus/other_ensembl_species_identifier>" >&2 ; return 1;
    fi

    # the R script is passed on stdin ("-"); it reads the species id via commandArgs(T)[1]
    Rscript --vanilla - "$1" 2>/dev/null <<'RSCRIPT'
require(biomaRt)
require(dplyr)
require(ggplot2)

mart <- useDataset(paste0(commandArgs(T)[1], "_gene_ensembl"), mart = useMart("ensembl"))
#mart <- useDataset("hsapiens_gene_ensembl", mart = useMart("ensembl"))
#mart <- useDataset("mmusculus_gene_ensembl", mart = useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org"))

pcTx <- getBM(attributes=c("ensembl_gene_id", "ensembl_transcript_id", "gene_biotype", "transcript_biotype"),  mart=mart) %>%
    filter(transcript_biotype=="protein_coding")

#ggplot(pcTx, aes(gene_biotype)) + geom_bar() + coord_flip()
#ggplot(pcTx, aes(transcript_biotype)) + geom_bar() + coord_flip()

#write.table(with(pcTx, data.frame(ensembl_transcript_id)), col.names=F, file="mm10_pc_tx.txt",quote=F,row.names=F)
# just print results to stdout
write.table(with(pcTx, data.frame(ensembl_transcript_id)), col.names=F, file=stdout(),quote=F,row.names=F)
RSCRIPT
}
export -f dge_get_pc_isoforms



Holger Brandl's avatar
Holger Brandl committed
359 360
## Combine all STAR "*ReadsPerGene.out.tab" files of the working directory into one
## gene x sample matrix of the unstranded counts (column 2) and write it to
## star_counts_matrix.txt. The work is done by an R script piped into `R --vanilla -q`;
## the script sources helper code from git.mpi-cbg.de at run time.
dge_star_counts2matrix(){
    R --vanilla -q <<'RSCRIPT'
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/core_commons.R")

## STAR count file format is
#column 1: gene ID
#column 2: counts for unstranded RNA-seq
#column 3: counts for the 1st read strand aligned with RNA (htseq-count option -s yes)
#column 4: counts for the 2nd read strand aligned with RNA (htseq-count option -s reverse)

## the pattern is a regular expression: dots are escaped and the match is anchored at the end
exprCounts <- list.files(".", "ReadsPerGene\\.out\\.tab$") %>% map_df(function(countFile){
    read.delim(countFile, header=F) %>%
        select(V1, V2) %>%
        set_names("ensembl_gene_id", "num_alignments") %>%
        filter(!str_detect(ensembl_gene_id, "^N_")) %>%
        mutate(sample=trim_ext(countFile, ".ReadsPerGene.out.tab"))
})

countMatrix = spread(exprCounts, sample, num_alignments)

write_tsv(countMatrix, "star_counts_matrix.txt")
RSCRIPT
}
export -f dge_star_counts2matrix
384 385


386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406
## Build the kallisto index "<fasta_file>.kallisto.idx" for a fasta file.
## Usage: build_kallisto_index <fasta_file>
##   <fasta_file>  existing file named *.fa, *.fasta or *.fna, optionally with .gz
## Returns: 1 on usage errors or an unsuitable file, otherwise the status of kallisto.
build_kallisto_index(){
    local usage="Usage: build_kallisto_index <fasta_file>"

    if [ $# -ne 1 ]; then
        echo "${usage}" >&2 ; return 1;
    fi

    # the former test `[ ! $1 != "*.fa" ]` compared against the literal string "*.fa"
    # and therefore accepted any argument; match the file name suffix instead
    case "$1" in
        *.fa|*.fasta|*.fna|*.fa.gz|*.fasta.gz|*.fna.gz) ;;
        *)
            echo "Please provide fasta file" >&2
            echo "${usage}" >&2 ; return 1
            ;;
    esac

    if [ ! -f "$1" ]; then
        echo "Please provide fasta file" >&2
        echo "${usage}" >&2 ; return 1;
    fi

    local input=$1
    local output="${input}.kallisto.idx"

    kallisto index -i "${output}" "${input}"
}
export -f build_kallisto_index

407 408 409

## Quantify fastq files with `kallisto quant --single` (one jl job per file on the
## .isoex joblist), then extract the sequence ids from the fasta headers and run
## collect_kallisto_data.R via rend.R.
## Usage: run_kallisto <ensembl cnda fasta> <fastq.gz files>
## Requires the index "<fasta>.kallisto.idx" to exist already.
## Creates one output directory per fastq file (its name without directory and
## ".fastq.gz") and ids.txt in the working directory.
## Returns: 1 on usage errors or a missing index, otherwise the status of rend.R.
run_kallisto(){
    local usage="Usage: run_kallisto <ensembl cnda fasta> <fastq.gz files>"

    if [ $# -lt 2 ]; then
        echo "${usage}" >&2 ; return 1;
    fi

    local ENS_CDNA=$1
    local KALLISTO_INDEX="${ENS_CDNA}.kallisto.idx"
    local fastqFile fastqBase

    if [[ ! -f ${KALLISTO_INDEX} ]]; then
        echo "Index file '$KALLISTO_INDEX' does not exist. Create it with: kallisto index -i ${KALLISTO_INDEX} ${ENS_CDNA}" >&2; return 1
    fi

    for fastqFile in "${@:2}"; do

        #create output directory
        fastqBase=$(basename "${fastqFile}" .fastq.gz)
        mkdir -p "${fastqBase}"

        #run kallisto
        jl submit --jl .isoex -n "kallisto_${fastqBase}" -t 5 "kallisto quant -t 5 -i ${KALLISTO_INDEX} -o ${fastqBase} --single -l 200 -s 20 -b 25 ${fastqFile} &> ${fastqBase}/kallisto.log"
    done

    # wait on the joblist the jobs were submitted to (the list name was missing before,
    # unlike in the other submit/wait pairs of this file)
    jl wait .isoex

    #extract IDs
    ## todo create more clean mapping table here; should we trim versions here?
    grep "^>" "$ENS_CDNA" | cut -d ' ' -f1,4 > ids.txt
    #grep "^>" $ENS_CDNA | cut -d ' ' -f1,4 | tr -d ">" | sed 's/gene://g' | tr ' ' ',' > ids.csv

    #analyse kallisto data:
    rend.R "${NGS_TOOLS}/dge_workflow/collect_kallisto_data.R" ids.txt "$1"
}
export -f run_kallisto
451 452 453 454 455 456 457



## Create the "expression_explorer" app with appify2 (icon and R script are taken from
## ${NGS_TOOLS}/dge_workflow/expression_explorer).
## Requires tpms_by_replicate.txt, fpkms_by_replicate.txt, de_results.txt and
## basic_design.txt in the current working directory.
## Returns: 1 if one of these files is missing, otherwise the status of appify2.
dge_create_explorer_app() {
    # check if required files exist in the current working directory
    local files='tpms_by_replicate.txt fpkms_by_replicate.txt de_results.txt basic_design.txt'
    local file

    for file in $files; do    # word splitting of the fixed list is intended
        if [ ! -e "$file" ]; then
            echo "Can not create app, because not all required data files ($files) exist in the current directory" 1>&2
            return 1
        fi
    done

    #appify ${NGS_TOOLS}/dge_workflow/expression_explorer/expression_explorer.R "expression_explorer"

    ## with icon using appify fork https://gist.github.com/oubiwann/453744744da1141ccc542ff75b47e0cf
    appify2 -n "expression_explorer" -i "${NGS_TOOLS}/dge_workflow/expression_explorer/scf_logo_2018_clip_ee.png.icns" -s "${NGS_TOOLS}/dge_workflow/expression_explorer/expression_explorer.R"
}
export -f dge_create_explorer_app


## add support for igv session generator and simplify launcher
alias make_igv_session="kscript https://git.io/vyLlj"
471 472 473 474

## when OSTYPE is "linux-gnu", define igv as a shortcut that forwards all arguments to
## the IGV 2.4.7 launcher script below BIO_BIN_BASE
case "$OSTYPE" in
    linux-gnu)
        igv(){
            ${BIO_BIN_BASE}/IGV_2.4.7/igv.sh "$@"
        }
        ;;
esac