cont. chipseq analysis

e827b0df · Holger Brandl · 820b69ac · e827b0df · e827b0df
Commit e827b0df authored 10 years ago by Holger Brandl
--- a/chipseq_workflow/cs_snippets.sh
+++ b/chipseq_workflow/cs_snippets.sh
+########################################################################################################################
+### Picard ResortSam
+
+sort -k1V $bowtieIndex.fa.dict_tmp > $bowtieIndex.dict
+
+samtools view -h $bamFile | removeMultiMappers > ${bamBaseName}_mmf.tmp.bam
+samtools index ${bamBaseName}_mmf.tmp.bam # because ReorderSam runs substantially faster if the input is an indexed BAM file.
+picard ReorderSam I=${bamBaseName}_mmf.tmp.bam O=${bamBaseName}_mmf.bam REFERENCE=$bowtieIndex.fa
+##    samtools view -h $bamFile | removeMultiMappers | picard ReorderSam I=/dev/stdin O=${bamBaseName}_mmf.bam REFERENCE=$bowtieIndex.fa
+samtools index ${bamBaseName}_mmf.bam
+
+
+

 ## extract chromosome from bam file
 samtools view -bo test.bam /home/brandl/mnt/chip-seq_study/ChIPSeq_February_2014/alignments_trimmed_nomulti_pooled/H2Az.bam 1

--- a/chipseq_workflow/peak_report.R
+++ b/chipseq_workflow/peak_report.R
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/core_commons.R")
+devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.4/R/ggplot_commons.R")
+
+
+require(knitr)
+require.auto(tidyr)
+require(tidyr)
+if (!require("DT")) devtools::install_github("rstudio/DT") ## see http://rstudio.github.io/DT/
+
+## setwd("/projects/bioinfo/holger/projects/krause_chipseq/macs2__OLD_SORTING")
+
+########################################################################################################################
+#' # Called Peaks Overview
+
+
+## https://github.com/taoliu/MACS/
+
+readMacsPeaks <- function(peakFile){
+    read.delim(peakFile, header=F) %>% mutate(file=basename(peakFile))
+}
+
+logSuffix="*.narrowPeak.*"
+
+peaks <- list.files(".", logSuffix, full.names=TRUE, recursive=T) %>% ldply(readMacsPeaks)
+
+#5th column: -10*log10pvalue of peak region" (see http://seqanswers.com/forums/showthread.php?t=18674)
+## for spec see https://github.com/taoliu/MACS/#output-files
+peaks %<>% set_names(c("chromosome_name", "start_position", "end_position", "peak_id", "score", "strand", "fold_change", "pvalue", "qvalue", "sum_rel_to_start", "file"))
+
+
+## todo remove this
+peaks %<>% mutate(file=str_replace(file, "_narrow",".narrow"))
+
+peaks %<>% separate(file, c("sample", "peak_type", "ext"), sep="[.]") %>% select(-ext)
+
+splitProtTime <- . %>% separate(sample, c("protein", "timpoint"), sep="_", remove=F)
+#peaks %>% splitProtTime %>% head
+
+peaks %<>% mutate(peak_width=end_position-start_position)
+
+peaks %>%
+#    sample_frac(0.3) %>%
+    ggplot(aes(end_position-start_position)) +
+        geom_histogram() +
+        facet_grid(sample ~ peak_type)  +
+        # + geom_vline(yintercept=0, color="blue")
+        ggtitle("peak width distribution") +
+        xlim(0,1000)
+
+peaks %>%
+#    sample_frac(0.3) %>%
+    ggplot(aes(sample, peak_width)) +
+        geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) +
+        geom_violin(alpha=0.8) +
+        facet_wrap(~ peak_type)  +
+        # + geom_vline(yintercept=0, color="blue")
+        ggtitle("peak width distribution") +
+        ylim(0,1000)
+
+#peaks %>% group_by(sample, type) %>% summarize(num_peaks=n(), mean_width=median(end_position-start_position))
+peaks %>% ggplot(aes(sample, fill=peak_type)) + geom_bar() + ggtitle("peak counts")
\ No newline at end of file