Commit f7692962 authored by Holger Brandl's avatar Holger Brandl

misc fixed in rna-seq pipeline

parent 22e76841
......@@ -66,11 +66,14 @@ require(ggplot2)
ggplot(sampleSheet, aes(bio_sample)) + geom_bar() + coord_flip()
sampleSheet %>% group_by(bio_sample) %>% summarise(
zcat=paste("zcat", paste(paste0("../originals/", File), collapse=" "), "| gzip -c >", paste0(bio_sample[1], ".fastq.gz"))
) %>% with(zcat) %>% write.delim(header=F, file="lane_merge.cmd", quote=F)
zcat=paste("zcat", paste(paste0("../originals/", File), collapse=" "), "| gzip -c >", paste0(bio_sample[1], ".fastq.gz"))
) %$%
zcat %>%
write_lines("lane_merge.cmd")
' | R --vanilla -q
cat lane_merge.cmd | while read line; do
# eval ${line}
jl submit -j .repmerge "$line"
done
......
......@@ -13,7 +13,7 @@ export PATH=${BIO_BIN_BASE}/deepTools-2.2.2/bin:$PATH
export PATH=${BIO_BIN_BASE}/FastQC_0.11.2:$PATH
export PATH=${BIO_BIN_BASE}/bedtools2-2.25.0/bin/:$PATH
export PATH=${BIO_BIN_BASE}/samtools-1.3:$PATH
export PATH=${BIO_BIN_BASE}/STAR-2.5.1b/source:$PATH
export PATH=${BIO_BIN_BASE}/STAR-2.5.2b/source:$PATH
export PATH=${BIO_BIN_BASE}/ucsc:$PATH # for wigToBigWig
#export PATH=/home/brandl/bin/subread-1.4.6-p3-Linux-x86_64/bin:$PATH
......@@ -105,15 +105,15 @@ for fastqFile in $fastqFiles ; do
continue;
fi
jl submit -j .fastqc_jobs -q medium -n "fastqc__$(basename $fastqFile)" "fastqc -j ${JAVA_HOME}/bin/java -o $outputDir -f fastq $fastqFile"
# jl submit -j .fastqc_jobs -n "fastqc__$(basename $fastqFile)" "fastqc -j ${JAVA_HOME}/bin/java -o $outputDir -f fastq $fastqFile"
jl submit -j .fastqc_jobs -n "fastqc__$(basename $fastqFile)" "fastqc -o $outputDir -f fastq $fastqFile"
done
jl wait --report .fastqc_jobs
jl wait --email --report .fastqc_jobs
rend.R ${NGS_TOOLS}/misc/fastqc_summary.R $outputDir
mailme "$project: fastqc done in $(pwd)"
#mailme "$project: fastqc done in $(pwd)"
}
export -f dge_fastqc
......
#!/usr/bin/env kscript
//DEPS de.mpicbg.scicomp:joblist:0.7 de.mpicbg.scicomp:kutils:0.3
//DEPS de.mpicbg.scicomp:joblist:0.7 de.mpicbg.scicomp:kutils:0.7
// add docopts to local m2 index
......@@ -13,6 +13,7 @@ import org.docopt.Docopt
import java.io.File
//val args = listOf("/projects/bioinfo/igenomes/Drosophila_melanogaster/Ensembl/BDGP6", "test.fastq.gz")
//val args = listOf("--disable-pe-pairing", "/projects/bioinfo/igenomes/Drosophila_melanogaster/Ensembl/BDGP6", "test.fastq.gz")
// basic usage tutorial
......@@ -24,8 +25,9 @@ Use star to align fastq files against a genome
Usage: star_align.kts [options] <igenome> <fastq_files>...
Options:
--gtf <gtfFile> Custom gtf file instead of igenome bundled copy
--pc-only Use protein coding genes only for mapping and quantification
--gtf <gtfFile> Custom gtf file instead of igenome bundled copy
--pc-only Use protein coding genes only for mapping and quantification
--disable-pe-pairing Disable auto-detection and pairing of paired end tracks
"""
......@@ -33,15 +35,16 @@ val doArgs = Docopt(usage).parse(args.toList()).map {
it.key.removePrefix("--").replace("[<>]".toRegex(), "") to it.value?.toString()
}.toMap()
//println(doArgs.keys.joinToString())
//println(doArgs.values.joinToString())
//doArgs["fastq_files"]
// extract all configuration parameters
val fastqFiles = (doArgs["fastq_files"] as String).substring(1).substringBefore("]").split(", ").map { File(it) }
val igenome = File(doArgs["igenome"])
val star_index = File(igenome, "Sequence/StarIndex")
val gtfFile = if (doArgs["gtfFile"] != null) File(doArgs["gtfFile"]) else File(igenome, "Annotation/Genes/genes.gtf")
val pePairingEnabled = !doArgs["disable-pe-pairing"]!!.toBoolean()
println("validating system requirements...")
......@@ -73,7 +76,7 @@ fastqFiles.filter { !it.isFile }.let {
require(it.isEmpty()) { "Some fastq files do not exist ${it.map { it.absoluteFile }.joinToString(", ")}" }
}
require(fastqFiles.filter { it.name.contains("_2.fastq") }.isEmpty()){
require(fastqFiles.filter { it.name.contains("_2.fastq") }.isEmpty()) {
"""Mapping 2nd mate alone is unlikely to have meaningful semantics.
Just provide _1 and star_align.kts will pick up corresponding _2 for PE alignment if present"""
}
......@@ -82,9 +85,25 @@ println("Running STAR using igenome '${igenome}' for the following files:\n ${fa
val jl = JobList(".starjobs").apply { reset() }
fun detectReverseReads(fastqFile: File): File? {
if (!fastqFile.name.contains("_R1.fastq")) return null
val revReadsCandidate = fastqFile
.absoluteFile
.parentFile
.resolve(fastqFile.name.replace("_R1.fastq", "_R2.fastq"))
// return revReadsCandidate.takeIf { it.exists() }
return if(revReadsCandidate.exists())revReadsCandidate else null
}
for (fastqFile in fastqFiles) {
println("submitting STAR job for $fastqFile")
var fastqBaseName = fastqFile.name.removeSuffix(".gz").removeSuffix(".fastq")
// uniquely mapping reads only: http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
//# tophat -p6 -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile
......@@ -94,18 +113,16 @@ for (fastqFile in fastqFiles) {
// --quantMode GeneCounts see https://groups.google.com/forum/#!searchin/rna-star/GeneCounts/rna-star/gZRJx3ElRNo/p5FjBYKuY00J
// --outSJfilterCountUniqueMin see https://groups.google.com/forum/#!topic/rna-star/_1BeAlGUmpA
val isPE = with(fastqFile.name) { endsWith("_1.fastq.gz") || endsWith("_1.fastq") }
// detect if pe-pairing is enabled and reverse reads are supplied
val revReads = if (pePairingEnabled) detectReverseReads(fastqFile)?.path ?: "" else ""
if (revReads.isNotBlank()) fastqBaseName = fastqBaseName.removeSuffix("_R1")
var fastqBaseName = fastqFile.name.removeSuffix(".gz").removeSuffix(".fastq")
if(isPE) fastqBaseName = fastqBaseName.removeSuffix("_1")
val optionalZcat = if (fastqFile.name.endsWith("gz")) "--readFilesCommand zcat" else ""
// todo consider to use --outTmpDir which defaults to outFileNamePrefix STARtmp and which is deleted automatically
// or use process substiutation in case of zipped reads (see https://github.com/alexdobin/STAR/issues/143#issuecomment-216597465)
// detect if paired end reads are supplied
val revReads = if (isPE) fastqFile.absoluteFile.parentFile.resolve(fastqFile.name.replace("_1.fastq", "_2.fastq")).path else ""
val cmd = """
STAR --genomeDir ${star_index} --readFilesIn ${fastqFile} ${revReads} --runThreadN 6 ${optionalZcat} --outFileNamePrefix ${fastqBaseName}. --outSAMtype BAM SortedByCoordinate --outSAMstrandField intronMotif --sjdbGTFfile ${gtfFile} --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout --quantMode GeneCounts --outFilterMultimapNmax 1 --outSJfilterCountUniqueMin 8 3 3 3
......@@ -163,3 +180,4 @@ dge_bam_correlate `ls *.bam`
#dge_bam_correlate . &
""")
......@@ -4,16 +4,17 @@
## Note This script is supposed to be knitr::spin'ed
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.14/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.14/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.36/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.36/R/ggplot_commons.R")
## can we access variables from the parent spin.R process?
#echo("rscript is ", r_script)
argv = commandArgs(TRUE)
# argv = c("fastqc")
#echo("argv is ", argv)
#if(str_detect(argv[1], "fastqc_summary")) argv <- argv[-1]
#if(str_detect(argv[1], "fastqc_summary")) argv = argv[-1]
if(length(argv) != 1){
stop("Usage: fastqc_summmary.R <directory with fastqc results>")
......@@ -32,24 +33,24 @@ if(is.na(file.info(baseDir)$isdir)){
fastqDataFiles <- list.files(path=baseDir, pattern="*fastqc.zip", full.names=TRUE, recursive=T)
fastqDataFiles = list.files(path=baseDir, pattern="*fastqc.zip", full.names=TRUE, recursive=T)
#echo("files are", fastqDataFiles)
get_zip_pipe <- function(zipFile, fileName){
get_zip_pipe = function(zipFile, fileName){
paste0("unzip -p ",zipFile, " ", trim_ext(basename(zipFile), ".zip"), "/", fileName)
}
#' ## Number of reads per run
#statsFile=fastqDataFiles[1]
readCount <- function(statsFile){
readCount = function(statsFile){
data.frame(
run=trim_ext(basename(statsFile), c("_fastqc.zip")),
num_reads=as.numeric(readLines(pipe(paste(get_zip_pipe(statsFile, "fastqc_data.txt"), "| grep -F 'Total Sequences' | cut -f2 -d'\t'"))))
)
}
readCounts <- fastqDataFiles %>% ldply(readCount) # %>% print_head()
readCounts = map_df(fastqDataFiles, readCount)
#+ fig.width=12, fig.height=round(nrow(readCounts)/2)
......@@ -61,18 +62,18 @@ ggplot(readCounts, aes(run, num_reads)) + geom_bar(stat="identity") + coord_flip
### Create a faill/pass matrix
readSummary <- function(statsFile){
readSummary = function(statsFile){
# echo("reading", statsFile)
# statsFile="./fastqc/mouse_big_cyst_rep1_fastqc/summary.txt"
fileMapStats <- read.delim(pipe(get_zip_pipe(statsFile, "summary.txt")), header=F) %>%
fileMapStats = read.delim(pipe(get_zip_pipe(statsFile, "summary.txt")), header=F) %>%
set_names(c("flag", "score", "run")) %>%
mutate(run=trim_ext(run, c(".fastq.gz", "fastq")))
fileMapStats
}
qcSummary <- fastqDataFiles %>% ldply(readSummary)
qcSummary = map_df(fastqDataFiles, readSummary)
#' # Base Quality Distribution Summary
#+ fig.height=2+round(nrow(readCounts)/2), fig.width=12
......@@ -86,7 +87,7 @@ qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
#' # Read duplication & Library Complexity
dupLevels <- fastqDataFiles %>% ldply(function(statsFile){
dupLevels = map_df(fastqDataFiles, function(statsFile){
data.frame(
run=trim_ext(basename(statsFile), c("_fastqc.zip")),
dedup_proportion=as.numeric(readLines(pipe(paste(get_zip_pipe(statsFile, "fastqc_data.txt"), "| grep -F 'Total Deduplicated Percentage' | cut -f2 -d'\t'"))))/100
......@@ -94,7 +95,11 @@ dupLevels <- fastqDataFiles %>% ldply(function(statsFile){
})
#+ fig.width=12, fig.height=round(nrow(dupLevels)/2)
ggplot(dupLevels, aes(run, dedup_proportion)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=percent) + ggtitle("unique_reads/total_reads") + ylim(0,1)
ggplot(dupLevels, aes(run, dedup_proportion)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_y_continuous(labels = percent, limits=c(0,1)) +
ggtitle("unique_reads/total_reads")
......@@ -103,13 +108,13 @@ ggplot(dupLevels, aes(run, dedup_proportion)) + geom_bar(stat="identity") + coor
# Allows to compare positional patterns and differences between the runs
readBaseQualDist <- function(statsFile){
readBaseQualDist = function(statsFile){
# statsFile="./fastqc/mouse_big_cyst_rep2_fastqc/fastqc_data.txt"
# statsFile="./fastqc/mouse_liver_polar_stage3_rep2_fastqc/fastqc_data.txt"
# grep -A30 -F '>>Per base sequence quality' /Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc/mouse_big_cyst_rep1_fastqc/fastqc_data.txt | grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '
# echo("reading", statsFile)
baseStats <- read.delim(pipe(
baseStats = read.delim(pipe(
#http://stackoverflow.com/questions/1946363/how-do-i-display-data-from-the-beginning-of-a-file-until-the-first-occurence-of/1947950#1947950
paste(get_zip_pipe(statsFile, "fastqc_data.txt"), " | grep -A200 -F '>>Per base sequence quality' | perl -pe 'last if />>END_MODULE/' | head -n-2 | tail -n+2 | tr '#' ' '")
)) %>% mutate(
......@@ -119,40 +124,50 @@ readBaseQualDist <- function(statsFile){
baseStats %>% mutate(base_order=1:n())
}
baseQualities <- fastqDataFiles %>% ldply(readBaseQualDist)
baseQualities = map_df(fastqDataFiles, readBaseQualDist) %>% tbl_df
#statsFile="fastqc/L8038_Track-21511_R1.trim_fastqc.zip"
#with(baseQualities, as.data.frame(table(run)))
baseQualities %<>% mutate(first_base=map_int(Base, ~ as.integer(str_split(.x, fixed("-"))[[1]][1])))
#+ fig.widthh=20
seqQualPlot <- baseQualities %>% ggplot(aes(reorder(Base, base_order), Mean, group=run, color=run)) + geom_line() + scale_y_continuous(limits=c(2, 40))
seqQualPlot = baseQualities %>% ggplot(aes(first_base, Mean, group = run, color = run)) +
geom_line() +
scale_y_continuous(limits = c(2, 40)) +
xlab("base position") +
ylab("y mean base quality") +
guides(color=F) +
ggtitle("Track Base Qualities")
## just show color legend if not too many samples
if(unlen(baseQualities$run) > 20) seqQualPlot <- seqQualPlot + guides(color=F)
if(unlen(baseQualities$run) > 20) seqQualPlot = seqQualPlot + guides(color=F)
seqQualPlot
#' # Qualities per run including variance
runs <- with(baseQualities, as.data.frame(table(run)))
runs = with(baseQualities, as.data.frame(table(run)))
#+ warning=FALSE, fig.width=15, fig.height=3*ceiling(nrow(runs)/2)
## http://stackoverflow.com/questions/12518387/can-i-create-an-empty-ggplot2-plot-in-r
summary(baseQualities)
baseQualities %>%
# head(30) %>%
ggplot() +
geom_blank(mapping=aes(reorder(Base, base_order))) +
geom_blank(mapping=aes(first_base)) +
xlab("base position") +
ylab("y mean base quality") +
geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=20), data=runs, alpha=0.05, fill="red") +
geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=20, ymax=28), data=runs, alpha=0.05, fill=colors()[654]) +
geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=28, ymax=Inf), data=runs, alpha=0.05, fill="green") +
geom_boxplot(
mapping=aes(x=reorder(Base, base_order), ymin = X10th.Percentile, lower = Lower.Quartile , middle = Median, upper = Upper.Quartile , ymax = X90th.Percentile),
mapping=aes(x=first_base, ymin = X10th.Percentile, lower = Lower.Quartile , middle = Median, upper = Upper.Quartile , ymax = X90th.Percentile),
stat = "identity"
) +
facet_wrap(~run, ncol=3) +
theme(axis.text.x=element_blank()) +
scale_y_continuous(limits=c(2, 40))
scale_y_continuous(limits=c(2, 42))
......@@ -148,3 +148,12 @@ tar xvf ncbi-blast-2.5.0+-src.tar.gz
cd ncbi-blast-2.5.0+-src/c++
./configure
make
## STAR aligner
cd ~/bin
wget https://github.com/alexdobin/STAR/archive/2.5.2b.tar.gz
tar xvf 2.5.2b.tar.gz
cd STAR-2.5.2b/sources
make
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment