Commit 32cc19ec authored by Lena Hersemann's avatar Lena Hersemann

Merge remote-tracking branch 'origin/master'

parents 6f4c9180 48e5f59c
......@@ -16,7 +16,7 @@ export PYTHONPATH=/home/brandl/bin/macs2/lib/python2.7/site-packages:$PYTHONPATH
cs_bowtie_qc(){
rendr_snippet "bowtie2_qc" <<EOF
rendr_snippet "bowtie2_qc" <<"EOF"
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.8/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.8/R/ggplot_commons.R")
......
#!/usr/bin/env Rscript
#+ echo=FALSE
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.22/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.22/R/ggplot_commons.R")
#+ include=FALSE
suppressMessages(require(docopt))
doc = '
Annotate peaks and create summary report
Usage: peak_report.R [options] <anno_db>
Options:
--subsample <name_prefix> Name to prefix all generated result files [default: ]
'
opts = docopt(doc, commandArgs(TRUE))
# devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.50/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/s/rwnpjpfz0aa9zg0/core_commons.R?dl=0")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.50/R/ggplot_commons.R")
##todo port back into core_commons
# set_names <- purrr::set_names
load_pack(knitr)
require_auto(knitr)
require_auto(tidyr)
#if (!require("DT")) devtools::install_github("rstudio/DT") ## see http://rstudio.github.io/DT/
# devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v6/dge_workflow/diffex_commons.R")
## process input arguments
anno_db = opts$anno_db
assert(anno_db %in% c("TSS.zebrafish.Zv9", "TSS.mouse.GRCm38"), "invalid annotation descriptor, see https://www.bioconductor.org/packages/devel/bioc/manuals/ChIPpeakAnno/man/ChIPpeakAnno.pdf")
results_prefix = "peak_report"
## setwd("/projects/bioinfo/holger/projects/krause_chipseq/macs2")
########################################################################################################################
#' # Called Peaks Overview
......@@ -18,145 +45,185 @@ require_auto(tidyr)
## https://github.com/taoliu/MACS/
readMacsPeaks <- function(peakFile){
data.table::fread(peakFile, header=F) %>% as.df() %>% mutate(file=basename(peakFile))
data.table::fread(peakFile, header = F) %>%
as_df() %>%
mutate(file = basename(peakFile))
}
#5th column: -10*log10pvalue of peak region" (see http://seqanswers.com/forums/showthread.php?t=18674)
## for spec see https://github.com/taoliu/MACS/#output-files
n_peaks <- list.files(".", "*.narrowPeak.*", full.names=TRUE, recursive=T) %>%
ldply(readMacsPeaks) %>%
set_names(c("chromosome_name", "start_position", "end_position", "peak_id", "score", "strand", "fold_change", "pvalue", "qvalue", "sum_rel_to_start", "file")) %>%
select(-sum_rel_to_start)
# list.files(".", "*.narrow.*", full.names=TRUE, recursive=T) %>% map(~ file.info(.)$size )
n_peaks <- list.files(".", "*.narrow.*", full.names = TRUE, recursive = T) %>%
purrr::discard(~ file.info(.)$size == 0) %>%
map_df(readMacsPeaks) %>%
set_names("chromosome_name", "start_position", "end_position", "peak_id", "score", "strand", "fold_change", "pvalue", "qvalue", "sum_rel_to_start", "file") %>%
select(- sum_rel_to_start)
peaks <- n_peaks
broadPeakFiles <- list.files(".", "*.broadPeak.*", full.names=TRUE, recursive=T)
if(length(broadPeakFiles)>0){
b_peaks <- list.files(".", "*.broadPeak.*", full.names=TRUE, recursive=T) %>%
ldply(readMacsPeaks) %>%
set_names(c("chromosome_name", "start_position", "end_position", "peak_id", "score", "strand", "fold_change", "pvalue", "qvalue", "file"))
peaks <- bind_rows(peaks, b_peaks)
}
# broadPeakFiles <- list.files(".", "*.broadPeak.*", full.names=TRUE, recursive=T)
# if(length(broadPeakFiles)>0){
# b_peaks <- list.files(".", "*.broadPeak.*", full.names=TRUE, recursive=T) %>%
# ldply(readMacsPeaks) %>%
# set_names(c("chromosome_name", "start_position", "end_position", "peak_id", "score", "strand", "fold_change", "pvalue", "qvalue", "file"))
#
# peaks <- bind_rows(peaks, b_peaks)
# }
## todo remove this later
#peaks <- mutate(peaks, file=file %>%str_replace( "_narrow", ".narrow") %>% str_replace("_broad", ".broad"))
peaks %>% count(file) %>% kable()
peaks %<>% separate(file, c("sample", "peak_type", "ext"), sep="[.]") %>% select(-ext)
peaks %<>% separate(file, c("sample", "peak_type"), sep = "[.]") %>% mutate_inplace(sample, str_replace_all("_peaks", ""))
#splitProtTime <- . %>% separate(sample, c("protein", "timpoint"), sep="_", remove=F)
#peaks %>% splitProtTime %>% head
peaks %<>% mutate(peak_width=end_position-start_position)
peaks %<>% mutate(peak_width = end_position - start_position)
save(peaks, file="peaks.RData")
save(peaks, file = "peaks.RData")
# peaks <- local(get(load("peaks.RData")))
peaks %>%
# sample_frac(0.3) %>%
ggplot(aes(peak_width)) +
geom_histogram() +
facet_grid(sample ~ peak_type) +
# + geom_vline(xintercept=0, color="blue")
ggtitle("peak width distribution") +
xlim(0,1000)
ggplot(aes(peak_width)) +
geom_histogram() +
facet_grid(sample ~ peak_type) +
# + geom_vline(xintercept=0, color="blue")
ggtitle("peak width distribution") +
xlim(0, 1000)
peaks %>%
# sample_frac(0.3) %>%
ggplot(aes(sample, peak_width)) +
geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) +
geom_violin(alpha=0.8) +
facet_wrap(~ peak_type) +
# + geom_vline(xintercept=0, color="blue")
ggtitle("peak width distribution") +
ylim(0,1000)
ggplot(aes(sample, peak_width)) +
geom_jitter(alpha = 0.05, data = sample_frac(peaks, 0.1)) +
geom_violin(alpha = 0.8) +
facet_wrap(~ peak_type) +
# + geom_vline(xintercept=0, color="blue")
ggtitle("peak width distribution") +
ylim(0, 1000)
peaks %>%
# sample_frac(0.3) %>%
ggplot(aes(sample, score)) +
geom_jitter(alpha=0.05, data=sample_frac(peaks, 0.1)) +
geom_violin(alpha=0.8) +
facet_wrap(~ peak_type) +
# + geom_vline(xintercept=0, color="blue")
ggtitle("peak score distribution") +
ylim(0,100) +
rot_x_lab()
ggplot(aes(sample, score)) +
geom_jitter(alpha = 0.4, data = sample_frac(peaks, 0.5)) +
geom_violin(alpha = 0.8) +
facet_wrap(~ peak_type) +
# + geom_vline(xintercept=0, color="blue")
ggtitle("peak score distribution") +
ylim(0, 100) +
rot_x_lab()
# note: max: score ~ 10*qvalue
#peaks %>% group_by(sample, type) %>% summarize(num_peaks=n(), mean_width=median(end_position-start_position))
peaks %>% ggplot(aes(sample, fill=peak_type)) + geom_bar(position="dodge") + ggtitle("peak counts") + rot_x_lab()
peaks %>% ggplot(aes(sample, fill = peak_type)) +
geom_bar(position = "dodge") +
ggtitle("peak counts") +
rot_x_lab()
peaks %>% sample_frac(0.1) %>%
peaks %>%
sample_frac(0.1) %>%
group_by(sample) %>%
arrange(-score) %>%
mutate(x=row_number()/n()) %>%
arrange(- score) %>%
mutate(x = row_number() / n()) %>%
# sample_frac(0.3) %>%
ggplot(aes(x, qvalue, color=sample)) +
geom_line() +
facet_wrap(~ peak_type) +
xlim(0, 0.2) + ylim(0,50) +
rot_x_lab()
ggplot(aes(x, qvalue, color = sample)) +
geom_line() +
facet_wrap(~ peak_type) +
xlim(0, 0.2) +
ylim(0, 50) +
rot_x_lab()
########################################################################################################################
#' # ChippeakAnno
# see [docs](http://www.bioconductor.org/packages/release/bioc/vignettes/ChIPpeakAnno/inst/doc/ChIPpeakAnno.pdf)
require_auto(ChIPpeakAnno)
data(TSS.zebrafish.Zv9)
## too many conflicts
unloadNamespace("conflicted")
#+ eval=FALSE, echo=FALSE
## DEBUG start
if(F){
#http://www.bioconductor.org/packages/release/bioc/vignettes/ChIPpeakAnno/inst/doc/ChIPpeakAnno.pdf
#require_auto(ChIPpeakAnno)
#data(TSS.zebrafish.Zv9)
# load_pack(ChIPpeakAnno)
require(ChIPpeakAnno)
## https://www.biostars.org/p/115101/
## http://master.bioconductor.org/help/course-materials/2009/SeattleNov09/IRanges/IRangesOverview.R
## todo
# data(TSS.zebrafish.Zv9)
# data(TSS.mouse.GRCm38)
peakSet <- peaks %>% sample_n(10000) %>% filter(sample=="H3HA_Dome", peak_type=="narrow_peaks")
peakRanges <-with(peakSet, RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name))
#+ eval=FALSE, echo=FALSE
## DEBUG start
if (F) {
# https://www.bioconductor.org/packages/devel/bioc/manuals/ChIPpeakAnno/man/ChIPpeakAnno.pdf
#load_pack(ChIPpeakAnno)
#data(TSS.zebrafish.Zv9)
## https://www.biostars.org/p/115101/
## http://master.bioconductor.org/help/course-materials/2009/SeattleNov09/IRanges/IRangesOverview.R
peakSet <- peaks %>%
sample_n(10000) %>%
filter(sample == "H3HA_Dome", peak_type == "narrow_peaks")
peakRanges <- with(peakSet, RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name))
### other feature types
#Anno_exon = getAnnotation(mart, featureType=c("Exon"))
#Anno_TSS = getAnnotation(mart, featureType=c("TSS"))
#Anno_miRNA = getAnnotation(mart, featureType=c("miRNA"))
#annotatedPeak_exon = annotatePeakInBatch(pr, AnnotationData=Anno_exon)
annotatedPeak = annotatePeakInBatch (peakRanges, AnnotationData = TSS.zebrafish.Zv9)
annotatedPeak %>%
as.df() %>%
ggplot(aes(insideFeature)) + geom_bar()
}
## DEBUG end
session::save.session(".tmp_peak_report.dat");
# session::restore.session(".tmp_peak_report.dat");
### other feature types
#Anno_exon = getAnnotation(mart, featureType=c("Exon"))
#Anno_TSS = getAnnotation(mart, featureType=c("TSS"))
#Anno_miRNA = getAnnotation(mart, featureType=c("miRNA"))
#annotatedPeak_exon = annotatePeakInBatch(pr, AnnotationData=Anno_exon)
annotatedPeak = annotatePeakInBatch (peakRanges, AnnotationData = TSS.zebrafish.Zv9)
annotatedPeak %>% as.df()%>% ggplot(aes(insideFeature)) + geom_bar()
#+
# require(IRanges)
# nrow = base::nrow
# nrow = base::with
# slice = dplyr::slice
}
## DEBUG end
## toto make generic
# assert(eval(substitute(require(pkg), list(pkg = as.name(genome)))), glue::glue("could not load {genome}"))
eval(substitute(data(pkg), list(pkg = as.name(anno_db))))
# slotNames(annotData)
#+
require(ChIPpeakAnno)
data(TSS.zebrafish.Zv9)
# data(TSS.mouse.GRCm38)
# slotNames(TSS.mouse.GRCm38)
# annotData = TSS.mouse.GRCm38
# annotData = TSS.zebrafish.Zv9
# annotData = eval(substitute(pkg, list(pkg = as.name(genome))))
## apply csa to all sets
annotatePeaks <- function(somePeaks, subsample=min(nrow(peaks), 10000)){
annotatePeaks <- function(somePeaks, subsample=min(nrow(somePeaks), 10000)){
peakRanges <- somePeaks %>%
sample_n(subsample) %$%
RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name)
sample_n(subsample) %$%
RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name)
annotatePeakInBatch (peakRanges, AnnotationData = TSS.zebrafish.Zv9) %>% as.df()
## todo how to abstract from species here
# annotatePeakInBatch (peakRanges, AnnotationData = substitute(pkg, list(pkg = as.name(anno_db)))) %>% as_df()
# annotatePeakInBatch (peakRanges, AnnotationData = as.name(anno_db)) %>% as_df()
annotatePeakInBatch (peakRanges, AnnotationData = TSS.mouse.GRCm38) %>% as_df()
}
# somePeaks =peaks %>% group_by(sample, peak_type) %>% first_group()
annoPeaks <- peaks %>%
group_by(sample, peak_type) %>%
do(annotatePeaks(.))
annoPeaks <- peaks %>% group_by(sample, peak_type) %>% do(annotatePeaks(.))
save(annoPeaks, file="annoPeaks.RData")
save(annoPeaks, file = "annoPeaks.RData")
# annoPeaks <- local(get(load("annoPeaks.RData")))
## todo merge
......@@ -165,12 +232,26 @@ save(annoPeaks, file="annoPeaks.RData")
#' CSA adds ovelap category, nearest gene id and distance to TSS
#+ results='asis'
annoPeaks %>% ungroup() %>% sample_n(10) %>% kable()
annoPeaks %>%
ungroup() %>%
sample_n(10) %>%
kable()
#+
#with(peaks, as.data.frame(table(sample, peak_type)))
#peaks %$% as.data.frame(table(sample, peak_type))
annoPeaks %>%
# as.data.frame(table(sample, peak_type, insideFeature)) %>%
ggplot(aes(sample, fill=insideFeature)) + geom_bar() + facet_wrap(~peak_type) + rot_x_lab()
annoPeaks %>% ggplot(aes(sample, fill = insideFeature)) +
geom_bar() +
facet_wrap(~ peak_type) +
rot_x_lab()
peaks %<>% tbl_df
write_tsv(annoPeaks, path=add_prefix("annoPeaks.txt"))
# annoPeaks <- read_tsv(add_prefix("annoPeaks.txt"))
#' [annoPeaks](`r add_prefix("annoPeaks.txt")`)
session::save.session(".peak_report.dat");
# session::restore.session(".peak_report.dat");
......@@ -45,8 +45,7 @@ gene_info_file = opts$gene_info
assert(is.null(gene_info_file) || file.exists(gene_info_file), "invalid gene_info_file")
designFormula = opts$design
assert(str_detect(designFormula, ".*condition$"))
assert(str_detect(designFormula, "^condition.*")) ## make sure that the condition comes before all batch factors
results_prefix = if (str_length(opts$out) > 0) opts$out else "" # used by add_prefix
......@@ -136,7 +135,21 @@ group_labels = data_frame(replicate = colnames(expMatrix)) %>%
names(group_labels) = colnames(expMatrix)
makePcaPlot(t(expMatrix), color_by = group_labels, title = "PCA of quantifiable proteins in all conditions")
#' Also do a scatter plot matrix of the PCs
mydata.pca = prcomp(t(expMatrix), retx = TRUE, center = TRUE, scale. = FALSE)
# screeplot(mydata.pca)
# devtools::install_github("vqv/ggbiplot")
# require(ggbiplot)
ggbiplot::ggscreeplot(mydata.pca) + geom_col()
# load_pack(GGally)
pcs = mydata.pca$x %>% as_df %>% rownames_to_column("sample")
pcs %>% GGally::ggpairs(columns=2:6, mapping=ggplot2::aes(color=sample), upper="blank", legend=c(3,3)) + theme(legend.position = "bottom")
#' Also analyze spearman correlation
correlation = cor(expMatrix, method = "spearman")
library(lattice)
levelplot(correlation, scales = list(x = list(rot = 90)), pretty = TRUE, main = "Spearman correlation between conditions after Normalization", xlab = "Conditions", ylab = "Conditions")
......@@ -180,11 +193,15 @@ orderMatcheExpDesign = data_frame(replicate = colnames(expMatrix)) %>%
right_join(expDesign, by = "replicate") %>%
arrange(col_index)
## build design matrix
#A key strength of limma’s linear modelling approach, is the ability accommodate arbitrary experimental complexity. Simple designs, such as the one in this workflow, with cell type and batch, through to more complicated factorial designs and models with interaction terms can be handled relatively easily
#' Build design matrix
#' > A key strength of limma’s linear modelling approach, is the ability accommodate arbitrary experimental complexity. Simple designs, such as the one in this workflow, with cell type and batch, through to more complicated factorial designs and models with interaction terms can be handled relatively easily
#'
#' Make sure that non of the batch-factors is confounded with treatment (condition). See https://support.bioconductor.org/p/39385/ for a discussion
#' References
#' * https://f1000research.com/articles/5-1408/v1
# design <- orderMatcheExpDesign %$% model.matrix(~ 0 + condition)
design <- orderMatcheExpDesign %$% model.matrix(formula(as.formula(paste("~0+", designFormula))))
# design <- orderMatcheExpDesign %$% model.matrix(~ 0 + condition + prep_day)
design = orderMatcheExpDesign %$% model.matrix(formula(as.formula(paste("~0+", designFormula))))
rownames(design) <- orderMatcheExpDesign$replicate
#design <- model.matrix(~0+group+lane)
......@@ -319,7 +336,8 @@ deResults %<>% left_join(sampleMeans)
#' apply the hit criterion
deResults %>% ggplot(aes(adj_p_val)) + geom_histogram()
deResults %>% ggplot(aes(p_value)) + geom_histogram(binwidth=.01)
deResults %>% ggplot(aes(adj_p_val)) + geom_histogram(binwidth=.01)
# report hit criterion
#+ results='asis'
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment