dge_analyze_duplicates.sh 5.96 KB
Newer Older
1 2 3 4 5 6 7 8 9
#!/usr/bin/env bash

while getopts "o:" curopt; do
    case $curopt in
    o) outputDir=$OPTARG;
    esac
done
shift $(($OPTIND - 1))

Lena Hersemann's avatar
Lena Hersemann committed
10 11 12
bamFiles=$*
# bamFiles=$(ls *bam | head -n3)
# bamFiles=$(ls *bam)
13 14

if [ $# -lt 1 ]; then
Lena Hersemann's avatar
Lena Hersemann committed
15
     echo "Usage: dge_analyze_duplicates.sh [-o <output_directory>] [<bam file>]+" >&2 ; exit 1;
16 17 18 19
fi

## use current directory if not specified
if [ -z "$outputDir" ]; then
Lena Hersemann's avatar
Lena Hersemann committed
20
     outputDir="duplicate_analysis"
21 22 23 24 25 26 27 28
fi

if [ ! -d "$outputDir" ]; then
    echo "creating output directory '$outputDir'"
    mkdir $outputDir
fi


Lena Hersemann's avatar
Lena Hersemann committed
29
picardJar=${BIO_BIN_BASE}/picard_tools/picard_v2.10.2.jar
30

Lena Hersemann's avatar
Lena Hersemann committed
31 32 33 34 35
if [[ ! -f $picardJar  ]]; then
    echo "picard tools do not exist" >&2
    exit 1
fi

36
#export JL_FORCE_LOCAL=true
Lena Hersemann's avatar
Lena Hersemann committed
37 38 39 40 41 42

for bamFile in $bamFiles ; do
    # DEBUG   bamFile=$(ls $bamFiles | head -n1)
    echo "running duplicate analysis for $bamFile"

    bamBaseName=$(basename $bamFile .bam)
43

44
    jl submit -j .dupanlysis -t 5 -n "dupanalysis__${bamBaseName}" "java -jar -Xmx5g $picardJar MarkDuplicates I=${bamFile} O=${outputDir}/${bamBaseName}.markdup.bam M=${outputDir}/${bamBaseName}.markdup.txt"
45 46
done

47
jl wait --report .dupanlysis  || { echo "duplicate analysis failed" >&2; exit 1; }
Lena Hersemann's avatar
Lena Hersemann committed
48

49
## todo make sure data is single-end or provide pe options to dupRadar (see https://www.biostars.org/p/178730/)
Lena Hersemann's avatar
Lena Hersemann committed
50

51 52
## todo expose gtf as script parameter
gtfFile=${IGENOME}/Annotation/Genes/genes.gtf
Lena Hersemann's avatar
Lena Hersemann committed
53

54 55 56 57 58 59

if [[ ! -f $gtfFile  ]]; then
    echo "gtf file '$gtfFile' do not exist" >&2
    exit 1
fi

60 61
# R --args $gtfFile ${outputDir}/*.markdup.bam
# ll $gtfFile ${outputDir}/*.markdup.bam
62
rendr_snippet "duplicate_analysis" <<"EOF" $gtfFile ${outputDir}/*.markdup.bam
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
#' # Assess PCR duplicates in RNA-Seq
#'
#' ## Introduction
#'
#' From [dupRadar docs](http://bioconductor.org/packages/release/bioc/vignettes/dupRadar/inst/doc/dupRadar.html#plotting-and-interpretation)
#'
#' Current NGS workflows usually involve PCR steps at some point, which involves the danger of PCR artefacts and over-amplification of reads.
#' In RNA-Seq however, apart from the technical PCR duplicates, there is a strong source for biological read duplicates.
#'
#' `dupRadar` relates the duplication rate and length normalized read counts of every gene to model the dependency of this two variables.
#'
#' Note: RNA-Seq protocols have matured so that for bulk RNA protocols data rarely suffer from technical duplicates.
#'
#' ## How to read the data
#'
#' To summarize the relationship between duplication rates and gene expression, we propose fitting a logistic regression curve onto the data. With the coefficients of the fitted model, one can get an idea of the initial duplication rate at low read counts (Intercept) and the progression of the duplication rate along with the progression of the read counts (Slope).
#'
#' ![](https://snag.gy/p7ZnKz.jpg)
#'
#' With the experience from the ENCODE datasets, we expect from single read experiments little duplication at low RPKM (low intercept) rapidly rising because of natural duplication (high slope). In contrast, paired-end experiments have a more mild rising of the natural duplication (low slope) due to having higher diversity of reads pairs since pairs with same start may still have different end.
#' The common denominator for both designs is the importance of having a low intercept, suggesting that duplication rate at lowly expressed genes may serve as a quality measure.
#'
#' For details see http://bioconductor.org/packages/release/bioc/vignettes/dupRadar/inst/doc/dupRadar.html
#' For interpretion of slope in logit models see https://stats.stackexchange.com/questions/92903/intercept-term-in-logistic-regression


#' ## Results

#' Working directory: `r getwd()`

93
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/core_commons.R")
94

95 96 97 98 99 100 101 102
load_pack(dupRadar)
# source("https://bioconductor.org/biocLite.R")
# biocLite("dupRadar")


gtfFile = commandArgs(TRUE)[1]
bamFiles = commandArgs(TRUE)[-1]

103 104
#' gtfFile was `r gtfFile`
#' alignment files were `r bamFiles`
105

106
# DEBUG bamFiles %<>% head(2)
107

108 109 110 111 112
# Rsubread::featureCounts(files = bamFile, annot.ext = gtfFile,
#         isGTFAnnotationFile = TRUE, GTF.featureType = "exon",
#         GTF.attrType = "gene_id", nthreads = 1, isPairedEnd = FALSE,
#         strandSpecific = 0, ignoreDup = TRUE, countMultiMappingReads = TRUE,
#         )
113

114 115 116 117 118 119
dupResults = quote({
    bamFiles %>% map(function(bamFile){
        # DEBUG bamFile=bamFiles[1]
        analyzeDuprates(bamFile, gtfFile, stranded = 0, paired = FALSE, threads = 10)
    })
}) %>% cache_it("dup_radar")
120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135


install_package("session")
session::save.session(".test_bam.dat");
# session::restore.session(".test_bam.dat");

bam_to_sample = function(x)  basename(x) %>% trim_ext(".markdup.bam")

walk2(bamFiles, dupResults, ~ {
    duprateExpDensPlot(.y,
        pal = c("cyan", "blue", "green", "yellow", "red"),
        tNoAlternative = TRUE,
        tRPKM = TRUE,
        tRPKMval = 0.5,
        tFit = TRUE,
        addLegend = TRUE,
136 137
        main = paste0("duplicate results for '", bam_to_sample(.x), "'")
    ) %>% print
138 139 140
})


141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
modelStats = map2_df(bamFiles, dupResults, ~ duprateExpFit(DupMat = .y) %>% { data_frame(sample = bam_to_sample(.x), intercept = .$intercept, slope = .$slope)})


load_pack(ggrepel)
modelStats %>% ggplot(aes(intercept, slope, label = sample)) +
    geom_point() +
    # geom_text(vjust = 1.5) +
    geom_text_repel(vjust = 1.5) +
    scale_x_continuous(expand = c(.2, .2)) +
    ggtitle("dupRadar model parameters") +
    xlab("Percent of duplication at low RPKM (fitted)") +
    ylab("slope (log of duprate growth for 1 RPKM)")

#load_pack(plotly)
#{ modelStats %>% ggplot(aes(intercept, slope, label = sample)) + geom_point()} %>% ggplotly()


sumStats = map2_df(bamFiles, dupResults, ~ getDupMatStats(DupMat = .y) %>%
    vec_as_df %>%
    spread(name, value) %>%
    mutate(sample = bam_to_sample(.x)))
162

163 164
dupStats = modelStats %>% inner_join(sumStats)
dupStats %>% table_browser()
165
write_tsv(dupStats, path = "duplicate_analysis.summary_stats.tsv")
166
writeLines(capture.output(devtools::session_info()), ".sessionInfo.txt")
Lena Hersemann's avatar
Lena Hersemann committed
167
EOF