Commit fc0cca22 authored by Lena Hersemann's avatar Lena Hersemann

Merge remote-tracking branch 'origin/master'

parents d3877b01 5af5af73
......@@ -51,8 +51,39 @@ jl wait --report .dupanlysis || { echo "duplicate analysis failed" >&2; exit 1;
## todo expose gtf as script parameter
# Gene model driving dupRadar's feature counting; resolved from the active iGenome.
gtfFile=${IGENOME}/Annotation/Genes/genes.gtf
# R --args $gtfFile ${outputDir}/*.markdup.bam
# R --args $gtfFile ${outputDir}/*.markdup.bam
# ll $gtfFile ${outputDir}/*.markdup.bam
# Render the inline R snippet below. The heredoc tag is quoted ("EOF") so the
# R code is passed through without shell expansion; the words after the
# redirection ($gtfFile and the *.markdup.bam globs) remain arguments to
# rendr_snippet — presumably forwarded as commandArgs() to the snippet, which
# matches the gtfFile/bamFiles parsing inside. TODO confirm rendr_snippet contract.
# NOTE(review): "duplicate_anlysis" keeps the "anlysis" typo — the write_tsv
# filenames inside the snippet use the same spelling, so fix both or neither.
rendr_snippet "duplicate_anlysis" <<"EOF" $gtfFile ${outputDir}/*.markdup.bam
#' # Assess PCR duplicates in RNA-Seq
#'
#' ## Introduction
#'
#' From [dupRadar docs](http://bioconductor.org/packages/release/bioc/vignettes/dupRadar/inst/doc/dupRadar.html#plotting-and-interpretation)
#'
#' Current NGS workflows usually involve PCR steps at some point, which involves the danger of PCR artefacts and over-amplification of reads.
#' In RNA-Seq however, apart from the technical PCR duplicates, there is a strong source for biological read duplicates.
#'
#' `dupRadar` relates the duplication rate and length-normalized read counts of every gene to model the dependency between these two variables.
#'
#' Note: RNA-Seq protocols have matured so that for bulk RNA protocols data rarely suffer from technical duplicates.
#'
#' ## How to read the data
#'
#' To summarize the relationship between duplication rates and gene expression, we propose fitting a logistic regression curve onto the data. With the coefficients of the fitted model, one can get an idea of the initial duplication rate at low read counts (Intercept) and the progression of the duplication rate along with the progression of the read counts (Slope).
#'
#' ![](https://snag.gy/p7ZnKz.jpg)
#'
#' With the experience from the ENCODE datasets, we expect from single read experiments little duplication at low RPKM (low intercept) rapidly rising because of natural duplication (high slope). In contrast, paired-end experiments have a more mild rising of the natural duplication (low slope) due to having higher diversity of reads pairs since pairs with same start may still have different end.
#' The common denominator for both designs is the importance of having a low intercept, suggesting that duplication rate at lowly expressed genes may serve as a quality measure.
#'
#' For details see http://bioconductor.org/packages/release/bioc/vignettes/dupRadar/inst/doc/dupRadar.html
#' For interpretation of the slope in logit models see https://stats.stackexchange.com/questions/92903/intercept-term-in-logistic-regression
#' ## Results
#' Working directory: `r getwd()`
# Pull shared helpers (load_pack, cache_it, vec_as_df, table_browser, ...) from
# the datautils repo, pinned to the v1.40 tag for reproducibility.
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.40/R/core_commons.R")
# dupRadar provides analyzeDuprates/duprateExpFit/getDupMatStats used below;
# load_pack presumably installs the package if missing — TODO confirm against datautils.
load_pack(dupRadar)
......@@ -63,16 +94,23 @@ load_pack(dupRadar)
# Positional command-line arguments: <gtf> <bam> [<bam> ...]
cli_args <- commandArgs(TRUE)
gtfFile <- cli_args[1]
bamFiles <- cli_args[-1]
# bamFiles %<>% head(2)
#' gtfFile was `r gtfFile`
#' alignment files were `r bamFiles`
# Run the dupRadar duplication-rate analysis once per alignment file;
# one duplication matrix per bam, collected into a list.
dupResults <- lapply(bamFiles, function(bamFile) {
    # DEBUG bamFile=bamFiles[1]
    # DEBUG bamFiles %<>% head(2)
    # bamFile <- "../alignments/marked_duplicates_10_AP.bam"
    # gtfFile <- "../../../igenomes/Homo_sapiens/Ensembl_v88_custom/GRCh38/Annotation/Genes/genes.gtf"
    # Equivalent direct featureCounts call kept for reference:
    # Rsubread::featureCounts(files = bamFile, annot.ext = gtfFile,
    #     isGTFAnnotationFile = TRUE, GTF.featureType = "exon",
    #     GTF.attrType = "gene_id", nthreads = 1, isPairedEnd = FALSE,
    #     strandSpecific = 0, ignoreDup = TRUE, countMultiMappingReads = TRUE)
    analyzeDuprates(bamFile, gtfFile, stranded = 0, paired = FALSE, threads = 10)
})
# Cached variant of the duplication analysis: the expensive per-bam counting is
# wrapped in quote() and handed to cache_it() (from datautils core_commons),
# which presumably evaluates it once and persists the result under the
# "dup_radar" key so report re-renders skip recomputation — TODO confirm
# cache_it semantics against datautils.
dupResults = quote({
# one dupRadar duplication matrix per alignment file
bamFiles %>% map(function(bamFile){
# DEBUG bamFile=bamFiles[1]
analyzeDuprates(bamFile, gtfFile, stranded = 0, paired = FALSE, threads = 10)
})
}) %>% cache_it("dup_radar")
# "session" package installed for later use — not referenced in the visible
# snippet; presumably needed for session persistence further down. TODO confirm.
install_package("session")
......@@ -89,14 +127,34 @@ walk2(bamFiles, dupResults, ~ {
tRPKMval = 0.5,
tFit = TRUE,
addLegend = TRUE,
main=paste0("duplicate results for '", bam_to_sample(.x), "'")) %>% print
main = paste0("duplicate results for '", bam_to_sample(.x), "'")
) %>% print
})
# getDupMatStats(DupMat=dupResults[[1]]) %>% vec_as_df %>% spread(name, value)
# Per-sample duplication summary: flatten getDupMatStats() output into one
# wide row per bam and tag it with the sample name.
sumStats <- map2_df(bamFiles, dupResults, function(bamFile, dupMat) {
    getDupMatStats(DupMat = dupMat) %>%
        vec_as_df %>%
        spread(name, value) %>%
        mutate(sample = bam_to_sample(bamFile))
})
# interactive table in the rendered report
sumStats %>% table_browser()
# NOTE(review): filename spelling ("anlysis") kept byte-identical to the original
write_tsv(sumStats, path="duplicate_anlysis.summary_stats.tsv")
# Fit the dupRadar duplication model per sample and collect the logit
# coefficients (intercept = duplication at low RPKM, slope = duprate growth).
modelStats <- map2_df(bamFiles, dupResults, function(bamFile, dupMat) {
    fit <- duprateExpFit(DupMat = dupMat)
    data_frame(sample = bam_to_sample(bamFile), intercept = fit$intercept, slope = fit$slope)
})
load_pack(ggrepel)
# Scatter of model parameters; repelled labels keep sample names readable.
modelStats %>% ggplot(aes(intercept, slope, label = sample)) +
    geom_point() +
    # geom_text(vjust = 1.5) +
    geom_text_repel(vjust = 1.5) +
    scale_x_continuous(expand = c(.2, .2)) +
    ggtitle("dupRadar model parameters") +
    xlab("Percent of duplication at low RPKM (fitted)") +
    ylab("slope (log of duprate growth for 1 RPKM)")
#load_pack(plotly)
#{ modelStats %>% ggplot(aes(intercept, slope, label = sample)) + geom_point()} %>% ggplotly()
# Per-sample duplication summary: one wide row of getDupMatStats() per bam.
sumStats <- map2_df(bamFiles, dupResults, function(bamFile, dupMat) {
    getDupMatStats(DupMat = dupMat) %>%
        vec_as_df %>%
        spread(name, value) %>%
        mutate(sample = bam_to_sample(bamFile))
})
# sumStats %>% ggplot(aes())
# Combine fitted model coefficients with the summary stats; inner_join picks
# the shared column(s) automatically — presumably "sample". TODO confirm no
# other column names collide.
dupStats <- modelStats %>% inner_join(sumStats)
dupStats %>% table_browser()
write_tsv(dupStats, path = "duplicate_anlysis.summary_stats.tsv")
EOF
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment