chipseq_utils.sh 6.91 KB
Newer Older
1 2
#source <(curl https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.50/tools/rendr/rendr_utils.sh 2>&1 2>/dev/null)
source <(curl https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.50/tools/rendr/rendr_utils.sh 2>&1 2>/dev/null)
Holger Brandl's avatar
Holger Brandl committed
3
source <(curl https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v10/common/bash_utils.sh 2>&1 2>/dev/null)
Holger Brandl's avatar
Holger Brandl committed
4 5 6 7 8


## configure paths
#python setup_w_cython.py install --prefix /home/brandl/bin/macs2

9
## todo use append2path as for dge_utils
Holger Brandl's avatar
Holger Brandl committed
10 11
export PATH=/projects/bioinfo/holger/bin/bedtools2-2.25.0/bin:${PATH}
export PATH=/home/brandl/bin/macs2/bin:${PATH}
Holger Brandl's avatar
Holger Brandl committed
12 13 14 15 16 17 18 19
export PYTHONPATH=/home/brandl/bin/macs2/lib:$PYTHONPATH
export PYTHONPATH=/home/brandl/bin/macs2/lib/python2.7/site-packages:$PYTHONPATH

#/home/brandl/bin/macs2/bin/macs2 -h


cs_bowtie_qc(){

20
rendr_snippet  "bowtie2_qc" <<"EOF"
21 22
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.8/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.8/R/ggplot_commons.R")
Holger Brandl's avatar
Holger Brandl committed
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51

########################################################################################################################
#> # Bowtie2 Mapping Summary

#> Directory: `r getwd()`

logSuffix=".bowtie.log"
parseAlgnSummary <- function(alignSummary){
    #alignSummary="/lustre/projects/bioinfo/holger/projects/khan_chipseq_h1/alignments/H1L_Shield.bowtie.log"
    algnData <- readLines(alignSummary)

    data.frame(
        condition=trimEnd(basename(alignSummary), logSuffix),
        num_reads=as.numeric(str_split_fixed(algnData[1], " ", 2)[1]),
        mapping_efficiency=as.numeric(str_replace(str_split_fixed(algnData[6], " ", 2)[1], "%", "")),

        unique_mappers=as.numeric(str_split_fixed(str_trim(algnData[4]), " ", 2)[1]),
        unique_mapper_prop=str_match(algnData[4], "[(]([0-9.]*)[%)]*")  %>% subset(select=2) %>% as.numeric(),

        multi_mappers=as.numeric(str_split_fixed(str_trim(algnData[5]), " ", 2)[1]),
        multi_mappers_prop=str_match(algnData[5], "[(]([0-9.]*)[%)]*")  %>% subset(select=2) %>% as.numeric()
    )
}

algnSummary <- list.files(".", logSuffix, full.names=TRUE, recursive=T) %>% ldply(parseAlgnSummary)
write.delim(algnSummary, file="bowtie2_qc.txt")

scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }

52 53
## make sure that mapping efficency is a ratio between 0 and 1
algnSummary %<>% mutate(mapping_efficiency=mapping_efficiency/100.)
Holger Brandl's avatar
Holger Brandl committed
54 55 56 57

#+ fig.height=nrow(algnSummary)
ggplot(algnSummary, aes(condition, mapping_efficiency)) +
    geom_bar(stat="identity") +
Holger Brandl's avatar
Holger Brandl committed
58
    coord_flip() +
59
    scale_y_continuous(labels=percent, limits=c(0,1))
Holger Brandl's avatar
Holger Brandl committed
60 61
    ggtitle("mapping efficiency")

62
ggplot(algnSummary, aes(condition, num_reads*mapping_efficiency)) +
63 64 65 66 67 68
    geom_bar(stat="identity") +
    xlab("# alignments") +
    coord_flip() +
    scale_y_continuous(labels=comma)
    ggtitle("alignment counts")

Holger Brandl's avatar
Holger Brandl committed
69 70 71
ggplot(algnSummary, aes(condition, num_reads)) +
    geom_bar(stat="identity") +
    coord_flip() +
Holger Brandl's avatar
Holger Brandl committed
72 73
    ggtitle("read counts") +
    scale_y_continuous(labels=comma)
Holger Brandl's avatar
Holger Brandl committed
74 75

ggplot(algnSummary, aes(condition, unique_mapper_prop)) +
Holger Brandl's avatar
Holger Brandl committed
76 77
    geom_bar(stat="identity") +
    coord_flip() +
Holger Brandl's avatar
Holger Brandl committed
78
    ggtitle("unique-mapper proportions") +
Holger Brandl's avatar
Holger Brandl committed
79 80 81
    scale_y_continuous(labels=percent) +
    ylim(0,100)

Holger Brandl's avatar
Holger Brandl committed
82 83

ggplot(algnSummary, aes(condition, multi_mappers_prop)) +
Holger Brandl's avatar
Holger Brandl committed
84 85
    geom_bar(stat="identity") +
    coord_flip() +
Holger Brandl's avatar
Holger Brandl committed
86
    ggtitle("multi-mapper proportions") +
Holger Brandl's avatar
Holger Brandl committed
87 88 89
    scale_y_continuous(labels=percent) +
    ylim(0,100)

Holger Brandl's avatar
Holger Brandl committed
90 91 92 93 94 95 96 97 98

#> ## Bam Correlation

#> Using simple genome binning, we can performa a simple clustering/corrlation analysis

## http://stackoverflow.com/questions/291813/recommended-way-to-embed-pdf-in-html
#+ results="asis"
cat("<embed src=\"./bc.pdf\" width=\"900px\" height=\"700px\">")

Holger Brandl's avatar
Holger Brandl committed
99
EOF
Holger Brandl's avatar
Holger Brandl committed
100

Holger Brandl's avatar
Holger Brandl committed
101
}
102 103 104 105 106
export -f cs_bowtie_qc



cs_lib_size(){
107
rm -f algn_counts.txt
Holger Brandl's avatar
Holger Brandl committed
108
for bamFile in $(ls *.bam); do
109 110
(
    echo "processing $bamFile"
111
    samtools flagstat $bamFile | head -n1 | cut -f1 -d' ' | sed -e 's/$/\t'$(basename $bamFile .bam)'/' >> algn_counts.txt
112 113 114 115 116
) &
done
wait

echo '
117 118
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.26/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.26/R/ggplot_commons.R")
119

120
libSizes <- read_tsv("algn_counts.txt", col_names=F) %>% set_names(c("library_size", "sample"))
121 122
libSizes %>% ggplot(aes(sample, library_size)) +
    geom_bar(stat="identity") +
Holger Brandl's avatar
Holger Brandl committed
123 124
    scale_y_continuous(labels=comma) +
    ggtitle("# alignments") +
125 126 127 128 129 130 131
    coord_flip()

ggsave2()
' | R -q --vanilla
}
export -f cs_lib_size

132 133 134 135 136


## create sorted genome file for dba counting
cs_sync_order_chrominfo2bam(){

Holger Brandl's avatar
Holger Brandl committed
137
if [ $# -ne 2 ]; then
138
    echo "Usage: cs_sync_order_chrominfo2bam <bam_file>  <chrom_info>" >&2 ; return;
Holger Brandl's avatar
Holger Brandl committed
139 140
fi

141
echo '
142
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.38/R/core_commons.R")
143 144 145 146

bamFile=commandArgs(T)[1]
chromInfoFile=commandArgs(T)[2]

147 148
#bamFile="/projects/bioinfo/holger/projects/krause_chipseq/alignments_mmfilt/H3HA_Dome_mmf.bam";
#chromInfoFile="danRer7.chromInfo"
149

150 151 152
#bamFile="/projects/bioinfo/holger/projects/krause_chipseq/reanalysis/../alignments_mmfilt/H3HA_Sphere_mmf.bam";
#chromInfoFile="danRer7.chromInfo"

153
chrOrderBam = readLines(pipe(paste("samtools view -H ", bamFile, " | cut -f2 | grep -F SN"))) %>% str_replace(fixed("SN:"), "")
154

155
#chromInfo = read_tsv(chromInfoFile, col_names=F) %>% subset(select=1:2) %>% set_names("chromosome", "chr_length") #%>% mutate(chrom=str_replace(chromosome, "chr", ""))
156 157 158 159 160 161 162

#data.frame(chrom=chrOrderBam) %>%
#    left_join(chromInfo) %>%
#    ## make sure to always keep all chromsomes of the bam no matter if they are present in the chromInfo.
#    mutate(size=ifelse(is.na(size), -1, size)) %>%
#    write.delim(stdout(), header=F, quote=F)

163 164
read_tsv(chromInfoFile, col_names=F) %>%
    set_names("chromosome", "chr_length") %>%
165 166 167 168
    subset(select=1:2) %>%
#    mutate(chromosome=str_replace(chromosome, "M", "MT") %>% str_replace("chr", "")) %>%
#    mutate(chromosome=str_replace(chromosome, "chrM", "M") %>% str_replace("chr", "")) %>%
    filter(chromosome %in% chrOrderBam) %>%
169 170
    mutate(chromosome=factor(as.character(chromosome), levels=chrOrderBam)) %>% arrange(chromosome) %>%
    filter(!is.na(chromosome)) %>%
171
    write.delim(stdout(), header=F, quote=F)
172

173 174
' | Rscript --vanilla - $1 $2  2> /dev/null
}
175
export -f cs_sync_order_chrominfo2bam
176

177 178

cs_sync_chrominfo_order_to_bed(){
Holger Brandl's avatar
Holger Brandl committed
179 180

if [ $# -ne 2 ]; then
181
    echo "Usage: cs_sync_chrominfo_order_to_bed <sorted_chrom_info>  <bed_file>" >&2 ; return;
Holger Brandl's avatar
Holger Brandl committed
182 183
fi

184
echo '
185
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.38/R/core_commons.R")
186

187
chromInfo=commandArgs(T)[1]
188 189 190 191
bedFile=commandArgs(T)[2]

options(warn = -1)

192 193 194
chrOrderBam = read_tsv(chromInfo, col_names=F) %>% set_names("chromosome", "length") %$% chromosome

bedData = read.delim(bedFile, h=F)
195

196 197
bedData %<>% filter(V1 %in% chrOrderBam) %>% mutate(V1=factor(as.character(V1), levels=chrOrderBam)) %>% arrange(V1, V2)
bedData %>% write.delim(stdout(), header=F, quote=F)
198 199
' | Rscript --vanilla - $1 $2 2> /dev/null
}
200
export -f cs_sync_chrominfo_order_to_bed
201