dge_utils.sh 9.17 KB
Newer Older
Holger Brandl's avatar
Holger Brandl committed
1 2 3
## docs
## http://blog.joncairns.com/2013/08/what-you-need-to-know-about-bash-functions/

Holger Brandl's avatar
Holger Brandl committed
4
## define common binaries
Holger Brandl's avatar
Holger Brandl committed
5
## todo generify this to work on bioinfo and furiosa as well
6
export PATH=/projects/bioinfo/holger/bin/bowtie2-2.2.5:$PATH
7
export PATH=/projects/bioinfo/holger/bin/deepTools-2.2.2/bin:$PATH
8
export PATH=/projects/bioinfo/holger/bin/FastQC_0.11.2:$PATH
Holger Brandl's avatar
Holger Brandl committed
9
export PATH=/projects/bioinfo/holger/bin/bedtools2-2.25.0/bin/:$PATH
10
export PATH=/projects/bioinfo/holger/bin/samtools-1.3:$PATH
Holger Brandl's avatar
Holger Brandl committed
11
export PATH=/home/brandl/bin/STAR-2.5.1b/source:$PATH
Holger Brandl's avatar
Holger Brandl committed
12
#export PATH=/home/brandl/bin/subread-1.4.6-p3-Linux-x86_64/bin:$PATH
Holger Brandl's avatar
Holger Brandl committed
13

14 15

## add cluster job manager
Holger Brandl's avatar
Holger Brandl committed
16
export PATH=/projects/bioinfo/tools/joblist_v0.6:$PATH
17 18


Holger Brandl's avatar
Holger Brandl committed
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
## makd sure that rend.R is present
source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.23/R/rendr/rendr_utils.sh 2>&1 2>/dev/null)


## backward compatibility wrappers for old shell cluster utils
#joblist(){
#  jl add $*
#}
#export -f joblist
#
#
#wait4jobs(){
#  jl wait $*
#}
#export -f wait4jobs
34

35

Holger Brandl's avatar
Holger Brandl committed
36
# which tophat;  which bowtie2; which cuffdiff
Holger Brandl's avatar
Holger Brandl committed
37

Holger Brandl's avatar
Holger Brandl committed
38 39 40 41 42 43 44 45 46
mcdir(){
    if [ ! -d "$1" ]; then
        mkdir "$1";
    fi;

    cd "$1";
}
export -f mcdir

Holger Brandl's avatar
Holger Brandl committed
47

Holger Brandl's avatar
Holger Brandl committed
48 49 50 51 52
mailme(){
    echo "Subject:"$1 "$2" | sendmail -v $(whoami)@mpi-cbg.de > /dev/null ;
}
export -f mailme

Holger Brandl's avatar
Holger Brandl committed
53

54 55 56 57 58 59 60 61 62
ziprm(){
    if [ $# -lt 2 ]; then echo "Usage: ziprm <tarbasename> [<file>]+"; return; fi

    tarName=$(date +'%y%m%d')_"$1".tar.gz; shift
    tar czf $tarName $@; rm $@;
}
export -f ziprm


Holger Brandl's avatar
Holger Brandl committed
63

Holger Brandl's avatar
Holger Brandl committed
64 65
## no longer needed because packages are no kept in home
#export R_LIBS=/tmp/r_index ## export to make sure that packages are load from local repository, otherwise sqlite won't work
Holger Brandl's avatar
Holger Brandl committed
66

Holger Brandl's avatar
Holger Brandl committed
67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84

## create fastq report for all fastq and fastq.gz files in the current directory
dge_fastqc(){

while getopts "o:" curopt; do
    case $curopt in
    o) outputDir=$OPTARG;
    esac
done
shift $(($OPTIND - 1))

local fastqFiles=$*

#if [ -z "$fastqFiles" ]; then
if [ $# -lt 1 ]; then
     echo "Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+" >&2 ; return;
fi

Holger Brandl's avatar
Holger Brandl committed
85
## use current directory if not specified
Holger Brandl's avatar
Holger Brandl committed
86
if [ -z "$outputDir" ]; then
Holger Brandl's avatar
Holger Brandl committed
87
     outputDir="fastqc"
Holger Brandl's avatar
Holger Brandl committed
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
fi

if [ ! -d "$outputDir" ]; then
    echo "creating output directory '$outputDir'"
    mkdir $outputDir
fi


for fastqFile in $fastqFiles ; do
    echo "fastqcing $fastqFile"

    if [ ! -f $fastqFile ]; then
        continue;
    fi
    
Holger Brandl's avatar
Holger Brandl committed
103
    jl submit -j .fastqc_jobs -q medium -n "fastqc__$(basename $fastqFile)" "fastqc -j ${JAVA_HOME}/bin/java -o $outputDir -f fastq $fastqFile"
Holger Brandl's avatar
Holger Brandl committed
104 105
done

Holger Brandl's avatar
Holger Brandl committed
106
jl wait --report .fastqc_jobs
Holger Brandl's avatar
Holger Brandl committed
107

Holger Brandl's avatar
Holger Brandl committed
108
rend.R ${NGS_TOOLS}/misc/fastqc_summary.R $outputDir
Holger Brandl's avatar
Holger Brandl committed
109

Holger Brandl's avatar
Holger Brandl committed
110 111
mailme "$project: fastqc done in $(pwd)"

Holger Brandl's avatar
Holger Brandl committed
112 113 114 115
}
export -f dge_fastqc


Holger Brandl's avatar
Holger Brandl committed
116 117
dge_bam_correlate(){

118 119 120 121
if [ $# -eq 0 ]; then
    echo "Usage: dge_bam_correlate <bam_directory>" >&2 ;
    echo "Usage: dge_bam_correlate <bam_files...>" >&2 ;
    return;
Holger Brandl's avatar
Holger Brandl committed
122 123 124
fi


Holger Brandl's avatar
Holger Brandl committed
125 126
if [ $# -eq 1 ]; then
    bamFiles=$(find $1 | grep ".bam$" | grep -v "unmapped" | sort)
127 128 129 130
else
    bamFiles=$*
fi

Holger Brandl's avatar
Holger Brandl committed
131 132
local bamLabels=$(echo "$bamFiles" | xargs -n1 basename | sed 's!.*/!!' | sed 's/_mmf.bam//g' |  sed 's/_ca.bam//g' | sed 's/.bam//g' | xargs echo);
echo "Used bam labels are: $bamLabels"
133

Holger Brandl's avatar
Holger Brandl committed
134 135

## see how well bam files correlate using untrimmed data
136 137 138
# http://deeptools.readthedocs.org/en/latest/content/tools/multiBamSummary.html
# http://deeptools.readthedocs.org/en/latest/content/tools/plotCorrelation.html?highlight=plotfile
bcCmd="
139
## todo add python2 ~/.local/bin/ or fix setup
140 141 142
multiBamSummary bins --bamfiles $(echo $bamFiles | xargs echo) --labels $bamLabels -out bam_cor_matrix.txt --numberOfProcessors 4
plotCorrelation --corData bam_cor_matrix.txt --plotFile bc.pdf --corMethod spearman --zMin 0.5 --zMax 1 -p heatmap
"
Holger Brandl's avatar
Holger Brandl committed
143
#echo "cmd is $bcCmd"
144
#mysub "${project}__bamcorrelate" "$bcCmd"  -q long -n 4 -R span[hosts=1] | blockScript .bamcorrelate
Holger Brandl's avatar
Holger Brandl committed
145

146 147 148 149
## old lsf version
#jl submit --wait --jl .bamcorrelate -q long -t 4 -n "${project}__bamcorrelate" "$bcCmd"
jl reset .bamcorrelate
jl submit --jl .bamcorrelate -w 10:00 -t 4 -n "${project}__bamcorrelate" "$bcCmd"
Holger Brandl's avatar
Holger Brandl committed
150
}
151

Holger Brandl's avatar
Holger Brandl committed
152 153 154
export -f dge_bam_correlate


Holger Brandl's avatar
Holger Brandl committed
155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
dge_bigwig(){

usage="Usage: dge_bigwig <genome_fai> [<bam_file>]+"


if [ $# -lt 2 ]; then
    echo ${usage} >&2 ; return;
fi

#bamFiles=$(find . -name "*.bam" | grep -v unmapped |  xargs echo)
genomeFai=$1
bamFiles="${@:2}"

if [ ! -f "${genomeFai}" ]; then
    echo "could not find fai index $1! ${usage}" >&2 ; return;
fi

## add bigwig to PATH
export PATH=~/bin:${PATH}

if [ -z "$(which wigToBigWig 2>/dev/null)" ]; then
    echo "could not find wigToBigWig in PATH! ${usage}" >&2 ; #return;
fi

## create big wig files
for bamFile in $bamFiles; do
    sample=$(basename $bamFile .bam)
    echo "converting $bamFile to bigwig format"
Holger Brandl's avatar
Holger Brandl committed
183 184

    jl submit -j .bigwig -q short -n "${project}__bw__${sample}" "genomeCoverageBed -split -bg -ibam $bamFile  -g ${genomeFai} |  wigToBigWig -clip stdin ${genomeFai} ${sample}.bw"
Holger Brandl's avatar
Holger Brandl committed
185 186
done

Holger Brandl's avatar
Holger Brandl committed
187
jl wait --report .bigwig
Holger Brandl's avatar
Holger Brandl committed
188 189 190 191 192 193 194 195

}
export -f dge_bigwig

## http://stackoverflow.com/questions/6916856/can-bash-show-a-functions-definition
#type dge_bigwig


196 197 198
## Merge technical replicatews
dge_merge_treps(){

Holger Brandl's avatar
Holger Brandl committed
199
usage="Usage: dge_merge_treps <bam_directory> <comma_sep_biosample_spec>"
200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220

if [ $# -ne 2 ]; then
    echo $usage >&2 ; return;
fi


local bam_dir=$1; # bam_dir=$baseDir/mapped_trim/
local bio_reps=$2; # bio_samples="test,lala"

if [ ! -d "$bam_dir" ]; then
    echo "bam file directory does not exist! $usage'" >&2 ; return;
fi

if [ $(echo "$bio_samples" | grep "," | wc -l )  -ne 1 ]; then
    echo "Invalid biosample spec! $usage'" >&2 ; return;
fi

#for sample in aRG bRG N; do
for sample in $(echo $bio_samples | tr ",", " "); do
    # DEBUG sample=Insm1_1103

Holger Brandl's avatar
Holger Brandl committed
221
    ## todo use joblist instead here
222 223 224 225
    (
    sampleBams=$(find $bam_dir -name '*bam'  | grep -v unmapped | grep $sample)
    echo "merging $sample with $sampleBams"

Holger Brandl's avatar
Holger Brandl committed
226
    ## todo add read-groups to bam files
227 228 229 230 231 232 233 234 235 236 237 238 239
    if [ 1 -eq $(echo "$sampleBams" | wc -l) ]; then
        cp $sampleBams $sample.bam
    else
        samtools merge - $(echo $sampleBams | xargs echo) | samtools sort - $sample
    fi

    samtools index $sample.bam
    )&
done

}
export -f dge_merge_treps

Holger Brandl's avatar
Holger Brandl committed
240 241 242 243 244 245 246 247 248 249 250 251 252 253
# Create a star index for a given igenome
dge_create_star_index(){

    if [ $# -ne 1 ]; then
        echo "Usage: dge_create_star_index <igenome>" >&2 ; return;
    fi

    igenome=$1

    if [ ! -d "$igenome" ] | [ ! -d "${igenome}/Sequence" ]; then
        echo "igenome directory '$igenome' does not exist" >&2 ; return;
    fi

    export star_index="${igenome}/Sequence/StarIndex"
254 255 256 257 258 259

    ## stop if index exists already
    if [ -d "$star_index" ]; then
        echo "Error: Index directory ${star_index} already exists." >&2 ; return;
    fi

260
#    chmod +w $(dirname ${star_index})
Holger Brandl's avatar
Holger Brandl committed
261 262 263 264 265 266 267

    mailme "${project}: creating STAR index in ${star_index}"
    mkdir ${star_index}

    cmd="STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/WholeGenomeFasta/genome.fa --runThreadN 10"
    #eval $cmd
    #STAR --runMode genomeGenerate --genomeDir ${star_index} --genomeFastaFiles ${igenome}/Sequence/Chromosomes/*.fa --runThreadN 10
268
    jl submit --wait -t 5 -q medium -n "${project}_star_index" "$cmd"
Holger Brandl's avatar
Holger Brandl committed
269 270

    ## prevent further modification
271
#    chmod -w $(dirname ${star_index})
Holger Brandl's avatar
Holger Brandl committed
272 273 274 275

    mailme "created star index for $igenome"
}
export -f dge_create_star_index
Holger Brandl's avatar
Holger Brandl committed
276 277


278
dge_get_pc_isoforms(){
Holger Brandl's avatar
Holger Brandl committed
279
# todo write more generic version that also filtered provided gtf and/or allow for ccds filtering as well
280

Holger Brandl's avatar
Holger Brandl committed
281 282 283
if [ $# -ne 1 ]; then
    echo "Usage: dge_get_pc_isoforms <hsapiens/mmusculus/other_ensembl_species_identifier>" >&2 ; return;
fi
284

Holger Brandl's avatar
Holger Brandl committed
285 286 287 288
echo '
require(biomaRt)
require(dplyr)
require(ggplot2)
289

Holger Brandl's avatar
Holger Brandl committed
290 291 292
mart <- useDataset(paste0(commandArgs(T)[1], "_gene_ensembl"), mart = useMart("ensembl"))
#mart <- useDataset("hsapiens_gene_ensembl", mart = useMart("ensembl"))
#mart <- useDataset("mmusculus_gene_ensembl", mart = useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org"))
293

Holger Brandl's avatar
Holger Brandl committed
294 295
pcTx <- getBM(attributes=c("ensembl_gene_id", "ensembl_transcript_id", "gene_biotype", "transcript_biotype"),  mart=mart) %>%
    filter(transcript_biotype=="protein_coding")
296

Holger Brandl's avatar
Holger Brandl committed
297 298
#ggplot(pcTx, aes(gene_biotype)) + geom_bar() + coord_flip()
#ggplot(pcTx, aes(transcript_biotype)) + geom_bar() + coord_flip()cd
299

Holger Brandl's avatar
Holger Brandl committed
300 301 302 303
#write.table(with(pcTx, data.frame(ensembl_transcript_id)), col.names=F, file="mm10_pc_tx.txt",quote=F,row.names=F)
# just print results to stdout
write.table(with(pcTx, data.frame(ensembl_transcript_id)), col.names=F, file=stdout(),quote=F,row.names=F)
' | Rscript --vanilla - $1 2>/dev/null
304 305 306 307 308
}
export -f dge_get_pc_isoforms



Holger Brandl's avatar
Holger Brandl committed
309 310 311 312 313 314 315 316 317 318 319 320
dge_star_counts2matrix(){
echo '
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.13/R/core_commons.R")

## STAR count file format is
#column 1: gene ID
#column 2: counts for unstranded RNA-seq
#column 3: counts for the 1st read strand aligned with RNA (htseq-count option -s yes)

exprCounts <- list.files(".", "ReadsPerGene.out.tab") %>% ldply(function(countFile){
    read.delim(countFile, header=F) %>%
        select(V1, V2) %>%
321 322
        set_names("ensembl_gene_id", "num_alignments") %>%
        filter(!str_detect(ensembl_gene_id, "^N_")) %>%
Holger Brandl's avatar
Holger Brandl committed
323 324 325 326 327
        mutate(sample=trim_ext(countFile, ".ReadsPerGene.out.tab"))
}, .progress="text")

countMatrix <- spread(exprCounts, sample, num_alignments)

328
write.delim(countMatrix, "star_counts_matrix.txt")
Holger Brandl's avatar
Holger Brandl committed
329 330 331 332
' | R --vanilla -q

}
export -f dge_star_counts2matrix