CheckLibraryComplexity.R 2.55 KB
Newer Older
1 2 3 4
#! /usr/bin/Rscript
## madmax /sw/bin/Rscript


5
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.38/R/core_commons.R")
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71

#library(RCurl, quietly=T, warn.conflicts=F)
#library(devtools) ## http?://stackoverflow.com/questions/7715723/sourcing-r-script-over-https


library(foreach)
library(doMC)
registerDoMC(cores=5)

argv = commandArgs(TRUE)

if(length(argv) == 0){
    stop("Usage: CheckLibraryComplexity.R <bamFile...>")
}


bamFiles=argv
#bamFiles=c("../L2805_Track-4510_R1/L2805_Track-4510_R1.bam", "../KA_14_5/KA_14_5.bam", "../AT_11_5/AT_11_5.bam", "../KA_18_5/KA_18_5.bam", "../KA_20_5/KA_20_5.bam", "../L2804_Track-4509_R1/L2804_Track-4509_R1.bam", "../KA_12_5/KA_12_5.bam", "../KA_16_5/KA_16_5.bam", "../KA_11_5/KA_11_5.bam")
# DEBUG bamFiles=list.files(pattern=".bam$")

libCplx <- ldply(bamFiles, function(bamFile){
# DEBUG setwd("~/mnt/hd-project/DeepSeq/mapping/libcomplexity/"); bamFile="WT_old_6.bam"

## read the data
#zipfile<-"test2.zip"
#algnPos <- read.delim(pipe(paste("zcat", zipfile)), header=F, stringsAsFactors=F)
#names(algnPos) <- c("chr", "position")

#"samtools view WT_old_6.bam | cut -f 3,4 | gzip > test2.zip"
#algnPos <- read.csv(pipe(paste("zcat", zipfile)), header=F, stringsAsFactors=F)
algnPos <- read.csv(pipe(paste("samtools view ",bamFile , " | cut -f 3,4")), header=F, stringsAsFactors=F)

# save(algnPos, file="algnPos.RData")
# algnPos <- local(get(load("algnPos.RData")))


# shuffle in steps of a million
algnPos <- shuffle(algnPos)
#algnPos <- algnPos[sample(nrow(algnPos))]

#algnPos <- transform(algnPos, chromPosition=id(paste(chr, ":", position)))

# subsample in steps of a million
sampleResolution<-1e6
#sampleResolution<-1000
sampleSizes <- seq(sampleResolution, round_any(length(algnPos), sampleResolution,  floor), sampleResolution)
unPosCounts <- ldply(sampleSizes, function(numAlgns) {
    # DEBUG numAlgns <- 10000000
    c(unique_algn_start_pos=length(unique(algnPos[1:numAlgns])))
})

# save(unPosCounts, file=paste0(bamFile, ".libcplx.RData"))
# unPosCounts <- local(get(load(paste0(bamFile, ".libcplx.RData"))))

transform(unPosCounts, num_alignments=sampleSizes, condition=basename(trimEnd(bamFile, ".bam")))
}, .parallel=T)

save(libCplx, file="libCplx.RData")
# libCplx <- local(get(load("libCplx.RData")))


### plot the library complexity
library(scales)
ggplot(libCplx, aes(num_alignments, unique_algn_start_pos, color=condition)) + geom_line() + ggtitle("library complexity") + scale_x_continuous(labels=comma)+ scale_y_continuous(labels=comma) + geom_abline(intercept = 0, slope = 1, linetype=2) #+coord_equal()
ggsave2()