peak_report.R 7.74 KB
Newer Older
Holger Brandl's avatar
Holger Brandl committed
1 2
#!/usr/bin/env Rscript
#+ echo=FALSE
Holger Brandl's avatar
Holger Brandl committed
3

4 5 6 7 8 9 10 11 12 13 14 15 16 17
#+ include=FALSE
suppressMessages(require(docopt))


doc = '
Annotate peaks and create summary report
Usage: peak_report.R [options] <anno_db>

Options:
--subsample <name_prefix>       Name to prefix all generated result files [default: ]
'

opts = docopt(doc, commandArgs(TRUE))

18
# devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.50/R/core_commons.R")
19 20
devtools::source_url("https://dl.dropboxusercontent.com/s/rwnpjpfz0aa9zg0/core_commons.R?dl=0")

21
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.50/R/ggplot_commons.R")
22 23 24 25 26 27 28 29 30 31 32 33 34

##todo port back into core_commons
# set_names <- purrr::set_names

load_pack(knitr)


# devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v6/dge_workflow/diffex_commons.R")

## process input arguments
anno_db = opts$anno_db

assert(anno_db %in% c("TSS.zebrafish.Zv9", "TSS.mouse.GRCm38"), "invalid annotation descriptor, see  https://www.bioconductor.org/packages/devel/bioc/manuals/ChIPpeakAnno/man/ChIPpeakAnno.pdf")
Holger Brandl's avatar
Holger Brandl committed
35 36


Holger Brandl's avatar
Holger Brandl committed
37 38 39
results_prefix = "peak_report"


Holger Brandl's avatar
Holger Brandl committed
40 41 42 43 44 45 46 47

########################################################################################################################
#' # Called Peaks Overview


## https://github.com/taoliu/MACS/

readMacsPeaks <- function(peakFile){
48 49 50
    data.table::fread(peakFile, header = F) %>%
        as_df() %>%
        mutate(file = basename(peakFile))
Holger Brandl's avatar
Holger Brandl committed
51 52 53 54
}

#5th column: -10*log10pvalue of peak region" (see http://seqanswers.com/forums/showthread.php?t=18674)
## for spec see https://github.com/taoliu/MACS/#output-files
55 56 57 58 59 60
# list.files(".", "*.narrow.*", full.names=TRUE, recursive=T) %>% map(~ file.info(.)$size )
n_peaks <- list.files(".", "*.narrow.*", full.names = TRUE, recursive = T) %>%
    purrr::discard(~ file.info(.)$size == 0) %>%
    map_df(readMacsPeaks) %>%
    set_names("chromosome_name", "start_position", "end_position", "peak_id", "score", "strand", "fold_change", "pvalue", "qvalue", "sum_rel_to_start", "file") %>%
    select(- sum_rel_to_start)
Holger Brandl's avatar
Holger Brandl committed
61

Holger Brandl's avatar
Holger Brandl committed
62
peaks <- n_peaks
Holger Brandl's avatar
Holger Brandl committed
63

64 65 66 67 68 69 70 71
# broadPeakFiles <- list.files(".", "*.broadPeak.*", full.names=TRUE, recursive=T)
# if(length(broadPeakFiles)>0){
# b_peaks <- list.files(".", "*.broadPeak.*", full.names=TRUE, recursive=T) %>%
#     ldply(readMacsPeaks) %>%
#     set_names(c("chromosome_name", "start_position", "end_position", "peak_id", "score", "strand", "fold_change", "pvalue", "qvalue", "file"))
#
# peaks <- bind_rows(peaks, b_peaks)
# }
Holger Brandl's avatar
Holger Brandl committed
72

Holger Brandl's avatar
Holger Brandl committed
73
## todo remove this later
Holger Brandl's avatar
Holger Brandl committed
74
#peaks <- mutate(peaks, file=file %>%str_replace( "_narrow", ".narrow") %>% str_replace("_broad", ".broad"))
Holger Brandl's avatar
Holger Brandl committed
75

76 77
peaks %>% count(file) %>% kable()

78
peaks %<>% separate(file, c("sample", "peak_type"), sep = "[.]") %>% mutate_inplace(sample, str_replace_all("_peaks", ""))
Holger Brandl's avatar
Holger Brandl committed
79

Holger Brandl's avatar
Holger Brandl committed
80

Holger Brandl's avatar
Holger Brandl committed
81
#splitProtTime <- . %>% separate(sample, c("protein", "timpoint"), sep="_", remove=F)
Holger Brandl's avatar
Holger Brandl committed
82 83
#peaks %>% splitProtTime %>% head

84
peaks %<>% mutate(peak_width = end_position - start_position)
Holger Brandl's avatar
Holger Brandl committed
85

86
save(peaks, file = "peaks.RData")
Holger Brandl's avatar
Holger Brandl committed
87 88 89
# peaks <- local(get(load("peaks.RData")))


Holger Brandl's avatar
Holger Brandl committed
90 91
peaks %>%
#    sample_frac(0.3) %>%
92 93 94 95 96 97
ggplot(aes(peak_width)) +
    geom_histogram() +
    facet_grid(sample ~ peak_type) +
# + geom_vline(xintercept=0, color="blue")
    ggtitle("peak width distribution") +
    xlim(0, 1000)
Holger Brandl's avatar
Holger Brandl committed
98

Holger Brandl's avatar
Holger Brandl committed
99 100
peaks %>%
#    sample_frac(0.3) %>%
101 102 103 104 105 106 107
ggplot(aes(sample, peak_width)) +
    geom_jitter(alpha = 0.05, data = sample_frac(peaks, 0.1)) +
    geom_violin(alpha = 0.8) +
    facet_wrap(~ peak_type) +
# + geom_vline(xintercept=0, color="blue")
    ggtitle("peak width distribution") +
    ylim(0, 1000)
Holger Brandl's avatar
Holger Brandl committed
108 109 110 111


peaks %>%
#    sample_frac(0.3) %>%
112 113 114 115 116 117 118 119
ggplot(aes(sample, score)) +
    geom_jitter(alpha = 0.4, data = sample_frac(peaks, 0.5)) +
    geom_violin(alpha = 0.8) +
    facet_wrap(~ peak_type) +
# + geom_vline(xintercept=0, color="blue")
    ggtitle("peak score distribution") +
    ylim(0, 100) +
    rot_x_lab()
Holger Brandl's avatar
Holger Brandl committed
120

Holger Brandl's avatar
Holger Brandl committed
121 122 123

# note: max: score ~ 10*qvalue

Holger Brandl's avatar
Holger Brandl committed
124 125

#peaks %>% group_by(sample, type) %>% summarize(num_peaks=n(), mean_width=median(end_position-start_position))
126 127 128 129
peaks %>% ggplot(aes(sample, fill = peak_type)) +
    geom_bar(position = "dodge") +
    ggtitle("peak counts") +
    rot_x_lab()
Holger Brandl's avatar
Holger Brandl committed
130 131


132 133
peaks %>%
    sample_frac(0.1) %>%
Holger Brandl's avatar
Holger Brandl committed
134
    group_by(sample) %>%
135 136
    arrange(- score) %>%
    mutate(x = row_number() / n()) %>%
Holger Brandl's avatar
Holger Brandl committed
137
#    sample_frac(0.3) %>%
138 139 140 141 142 143
    ggplot(aes(x, qvalue, color = sample)) +
    geom_line() +
    facet_wrap(~ peak_type) +
    xlim(0, 0.2) +
    ylim(0, 50) +
    rot_x_lab()
Holger Brandl's avatar
Holger Brandl committed
144 145


Holger Brandl's avatar
Holger Brandl committed
146 147 148
########################################################################################################################
#' # ChippeakAnno

149 150
## too many conflicts
unloadNamespace("conflicted")
Holger Brandl's avatar
Holger Brandl committed
151

152 153
# load_pack(ChIPpeakAnno)
require(ChIPpeakAnno)
Holger Brandl's avatar
Holger Brandl committed
154

155 156 157
## todo
# data(TSS.zebrafish.Zv9)
# data(TSS.mouse.GRCm38)
Holger Brandl's avatar
Holger Brandl committed
158

159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
#+ eval=FALSE, echo=FALSE
## DEBUG start
if (F) {
    # https://www.bioconductor.org/packages/devel/bioc/manuals/ChIPpeakAnno/man/ChIPpeakAnno.pdf
    #load_pack(ChIPpeakAnno)
    #data(TSS.zebrafish.Zv9)

    ## https://www.biostars.org/p/115101/
    ## http://master.bioconductor.org/help/course-materials/2009/SeattleNov09/IRanges/IRangesOverview.R

    peakSet <- peaks %>%
        sample_n(10000) %>%
        filter(sample == "H3HA_Dome", peak_type == "narrow_peaks")
    peakRanges <- with(peakSet, RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name))

    ### other feature types
    #Anno_exon = getAnnotation(mart, featureType=c("Exon"))
    #Anno_TSS = getAnnotation(mart, featureType=c("TSS"))
    #Anno_miRNA = getAnnotation(mart, featureType=c("miRNA"))
    #annotatedPeak_exon = annotatePeakInBatch(pr, AnnotationData=Anno_exon)

    annotatedPeak = annotatePeakInBatch (peakRanges, AnnotationData = TSS.zebrafish.Zv9)

    annotatedPeak %>%
        as.df() %>%
        ggplot(aes(insideFeature)) + geom_bar()
}
## DEBUG end
session::save.session(".tmp_peak_report.dat");
# session::restore.session(".tmp_peak_report.dat");
Holger Brandl's avatar
Holger Brandl committed
189 190


Holger Brandl's avatar
Holger Brandl committed
191

192 193 194 195 196
#+
# require(IRanges)
# nrow = base::nrow
# nrow = base::with
# slice = dplyr::slice
Holger Brandl's avatar
Holger Brandl committed
197

198 199 200 201
## toto make generic
# assert(eval(substitute(require(pkg), list(pkg = as.name(genome)))), glue::glue("could not load {genome}"))
eval(substitute(data(pkg), list(pkg = as.name(anno_db))))
# slotNames(annotData)
Holger Brandl's avatar
Holger Brandl committed
202

203 204 205
# data(TSS.mouse.GRCm38)
# slotNames(TSS.mouse.GRCm38)
# annotData = TSS.mouse.GRCm38
Holger Brandl's avatar
Holger Brandl committed
206

207 208
# annotData = TSS.zebrafish.Zv9
# annotData = eval(substitute(pkg, list(pkg = as.name(genome))))
Holger Brandl's avatar
Holger Brandl committed
209 210

## apply csa to all sets
211
annotatePeaks <- function(somePeaks, subsample=min(nrow(somePeaks), 10000)){
Holger Brandl's avatar
Holger Brandl committed
212
    peakRanges <- somePeaks %>%
213 214
    sample_n(subsample) %$%
    RangedData(ranges = IRanges(start_position, end_position), strand = ".", space = chromosome_name)
Holger Brandl's avatar
Holger Brandl committed
215

216 217 218 219
    ## todo how to abstract from species here
    # annotatePeakInBatch (peakRanges, AnnotationData = substitute(pkg, list(pkg = as.name(anno_db)))) %>% as_df()
    # annotatePeakInBatch (peakRanges, AnnotationData = as.name(anno_db)) %>% as_df()
    annotatePeakInBatch (peakRanges, AnnotationData = TSS.mouse.GRCm38) %>% as_df()
Holger Brandl's avatar
Holger Brandl committed
220
}
221 222 223 224
# somePeaks =peaks %>% group_by(sample, peak_type) %>% first_group()
annoPeaks <- peaks %>%
    group_by(sample, peak_type) %>%
    do(annotatePeaks(.))
Holger Brandl's avatar
Holger Brandl committed
225

226
save(annoPeaks, file = "annoPeaks.RData")
Holger Brandl's avatar
Holger Brandl committed
227 228
# annoPeaks <- local(get(load("annoPeaks.RData")))

Holger Brandl's avatar
Holger Brandl committed
229 230 231
## todo merge
#peaks, annoPeaks

Holger Brandl's avatar
Holger Brandl committed
232

Holger Brandl's avatar
Holger Brandl committed
233 234
#' CSA adds ovelap category, nearest gene id and distance to TSS
#+ results='asis'
235 236 237 238
annoPeaks %>%
    ungroup() %>%
    sample_n(10) %>%
    kable()
Holger Brandl's avatar
Holger Brandl committed
239 240 241 242 243

#+
#with(peaks, as.data.frame(table(sample, peak_type)))
#peaks %$% as.data.frame(table(sample, peak_type))

244 245 246 247 248
annoPeaks %>% ggplot(aes(sample, fill = insideFeature)) +
    geom_bar() +
    facet_wrap(~ peak_type) +
    rot_x_lab()

Holger Brandl's avatar
Holger Brandl committed
249 250 251 252 253
peaks %<>% tbl_df

write_tsv(annoPeaks, path=add_prefix("annoPeaks.txt"))
# annoPeaks <- read_tsv(add_prefix("annoPeaks.txt"))
#'  [annoPeaks](`r add_prefix("annoPeaks.txt")`)
254 255 256 257

session::save.session(".peak_report.dat");
# session::restore.session(".peak_report.dat");