Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
bioinfo
ngs_tools
Commits
f7692962
Commit
f7692962
authored
Mar 13, 2017
by
Holger Brandl
Browse files
misc fixed in rna-seq pipeline
parent
22e76841
Changes
5
Hide whitespace changes
Inline
Side-by-side
dge_workflow/dge_star_template.sh
View file @
f7692962
...
...
@@ -66,11 +66,14 @@ require(ggplot2)
ggplot(sampleSheet, aes(bio_sample)) + geom_bar() + coord_flip()
sampleSheet %>% group_by(bio_sample) %>% summarise(
zcat=paste("zcat", paste(paste0("../originals/", File), collapse=" "), "| gzip -c >", paste0(bio_sample[1], ".fastq.gz"))
) %>% with(zcat) %>% write.delim(header=F, file="lane_merge.cmd", quote=F)
zcat=paste("zcat", paste(paste0("../originals/", File), collapse=" "), "| gzip -c >", paste0(bio_sample[1], ".fastq.gz"))
) %$%
zcat %>%
write_lines("lane_merge.cmd")
'
| R
--vanilla
-q
cat
lane_merge.cmd |
while
read
line
;
do
# eval ${line}
jl submit
-j
.repmerge
"
$line
"
done
...
...
dge_workflow/dge_utils.sh
View file @
f7692962
...
...
@@ -13,7 +13,7 @@ export PATH=${BIO_BIN_BASE}/deepTools-2.2.2/bin:$PATH
export
PATH
=
${
BIO_BIN_BASE
}
/FastQC_0.11.2:
$PATH
export
PATH
=
${
BIO_BIN_BASE
}
/bedtools2-2.25.0/bin/:
$PATH
export
PATH
=
${
BIO_BIN_BASE
}
/samtools-1.3:
$PATH
export
PATH
=
${
BIO_BIN_BASE
}
/STAR-2.5.
1
b/source:
$PATH
export
PATH
=
${
BIO_BIN_BASE
}
/STAR-2.5.
2
b/source:
$PATH
export
PATH
=
${
BIO_BIN_BASE
}
/ucsc:
$PATH
# for wigToBigWig
#export PATH=/home/brandl/bin/subread-1.4.6-p3-Linux-x86_64/bin:$PATH
...
...
@@ -105,15 +105,15 @@ for fastqFile in $fastqFiles ; do
continue
;
fi
jl submit
-j
.fastqc_jobs
-q
medium
-n
"fastqc__
$(
basename
$fastqFile
)
"
"fastqc -j
${
JAVA_HOME
}
/bin/java -o
$outputDir
-f fastq
$fastqFile
"
# jl submit -j .fastqc_jobs -n "fastqc__$(basename $fastqFile)" "fastqc -j ${JAVA_HOME}/bin/java -o $outputDir -f fastq $fastqFile"
jl submit
-j
.fastqc_jobs
-n
"fastqc__
$(
basename
$fastqFile
)
"
"fastqc -o
$outputDir
-f fastq
$fastqFile
"
done
jl
wait
--report
.fastqc_jobs
jl
wait
--email
--report
.fastqc_jobs
rend.R
${
NGS_TOOLS
}
/misc/fastqc_summary.R
$outputDir
mailme
"
$project
: fastqc done in
$(
pwd
)
"
#mailme "$project: fastqc done in $(pwd)"
}
export
-f
dge_fastqc
...
...
dge_workflow/star_align.kts
View file @
f7692962
#
!/
usr
/
bin
/
env
kscript
//DEPS de.mpicbg.scicomp:joblist:0.7 de.mpicbg.scicomp:kutils:0.
3
//DEPS de.mpicbg.scicomp:joblist:0.7 de.mpicbg.scicomp:kutils:0.
7
// add docopts to local m2 index
...
...
@@ -13,6 +13,7 @@ import org.docopt.Docopt
import
java.io.File
//val args = listOf("/projects/bioinfo/igenomes/Drosophila_melanogaster/Ensembl/BDGP6", "test.fastq.gz")
//val args = listOf("--disable-pe-pairing", "/projects/bioinfo/igenomes/Drosophila_melanogaster/Ensembl/BDGP6", "test.fastq.gz")
// basic usage tutorial
...
...
@@ -24,8 +25,9 @@ Use star to align fastq files against a genome
Usage: star_align.kts [options] <igenome> <fastq_files>...
Options:
--gtf <gtfFile> Custom gtf file instead of igenome bundled copy
--pc-only Use protein coding genes only for mapping and quantification
--gtf <gtfFile> Custom gtf file instead of igenome bundled copy
--pc-only Use protein coding genes only for mapping and quantification
--disable-pe-pairing Disable auto-detection and pairing of paired end tracks
"""
...
...
@@ -33,15 +35,16 @@ val doArgs = Docopt(usage).parse(args.toList()).map {
it
.
key
.
removePrefix
(
"--"
).
replace
(
"[<>]"
.
toRegex
(),
""
)
to
it
.
value
?.
toString
()
}.
toMap
()
//println(doArgs.keys.joinToString())
//println(doArgs.values.joinToString())
//doArgs["fastq_files"]
// extract all configuration parameters
val
fastqFiles
=
(
doArgs
[
"fastq_files"
]
as
String
).
substring
(
1
).
substringBefore
(
"]"
).
split
(
", "
).
map
{
File
(
it
)
}
val
igenome
=
File
(
doArgs
[
"igenome"
])
val
star_index
=
File
(
igenome
,
"Sequence/StarIndex"
)
val
gtfFile
=
if
(
doArgs
[
"gtfFile"
]
!=
null
)
File
(
doArgs
[
"gtfFile"
])
else
File
(
igenome
,
"Annotation/Genes/genes.gtf"
)
val
pePairingEnabled
=
!
doArgs
[
"disable-pe-pairing"
]
!!
.
toBoolean
()
println
(
"validating system requirements..."
)
...
...
@@ -73,7 +76,7 @@ fastqFiles.filter { !it.isFile }.let {
require
(
it
.
isEmpty
())
{
"Some fastq files do not exist ${it.map { it.absoluteFile }.joinToString("
,
")}"
}
}
require
(
fastqFiles
.
filter
{
it
.
name
.
contains
(
"_2.fastq"
)
}.
isEmpty
()){
require
(
fastqFiles
.
filter
{
it
.
name
.
contains
(
"_2.fastq"
)
}.
isEmpty
())
{
"""Mapping 2nd mate alone is unlikely to have meaningful semantics.
Just provide _1 and star_align.kts will pick up corresponding _2 for PE alignment if present"""
}
...
...
@@ -82,9 +85,25 @@ println("Running STAR using igenome '${igenome}' for the following files:\n ${fa
val
jl
=
JobList
(
".starjobs"
).
apply
{
reset
()
}
fun
detectReverseReads
(
fastqFile
:
File
):
File
?
{
if
(!
fastqFile
.
name
.
contains
(
"_R1.fastq"
))
return
null
val
revReadsCandidate
=
fastqFile
.
absoluteFile
.
parentFile
.
resolve
(
fastqFile
.
name
.
replace
(
"_R1.fastq"
,
"_R2.fastq"
))
// return revReadsCandidate.takeIf { it.exists() }
return
if
(
revReadsCandidate
.
exists
())
revReadsCandidate
else
null
}
for
(
fastqFile
in
fastqFiles
)
{
println
(
"submitting STAR job for $fastqFile"
)
var
fastqBaseName
=
fastqFile
.
name
.
removeSuffix
(
".gz"
).
removeSuffix
(
".fastq"
)
// uniquely mapping reads only: http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
//# tophat -p6 -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile
...
...
@@ -94,18 +113,16 @@ for (fastqFile in fastqFiles) {
// --quantMode GeneCounts see https://groups.google.com/forum/#!searchin/rna-star/GeneCounts/rna-star/gZRJx3ElRNo/p5FjBYKuY00J
// --outSJfilterCountUniqueMin see https://groups.google.com/forum/#!topic/rna-star/_1BeAlGUmpA
val
isPE
=
with
(
fastqFile
.
name
)
{
endsWith
(
"_1.fastq.gz"
)
||
endsWith
(
"_1.fastq"
)
}
// detect if pe-pairing is enabled and reverse reads are supplied
val
revReads
=
if
(
pePairingEnabled
)
detectReverseReads
(
fastqFile
)
?.
path
?:
""
else
""
if
(
revReads
.
isNotBlank
())
fastqBaseName
=
fastqBaseName
.
removeSuffix
(
"_R1"
)
var
fastqBaseName
=
fastqFile
.
name
.
removeSuffix
(
".gz"
).
removeSuffix
(
".fastq"
)
if
(
isPE
)
fastqBaseName
=
fastqBaseName
.
removeSuffix
(
"_1"
)
val
optionalZcat
=
if
(
fastqFile
.
name
.
endsWith
(
"gz"
))
"--readFilesCommand zcat"
else
""
// todo consider to use --outTmpDir which defaults to outFileNamePrefix STARtmp and which is deleted automatically
// or use process substiutation in case of zipped reads (see https://github.com/alexdobin/STAR/issues/143#issuecomment-216597465)
// detect if paired end reads are supplied
val
revReads
=
if
(
isPE
)
fastqFile
.
absoluteFile
.
parentFile
.
resolve
(
fastqFile
.
name
.
replace
(
"_1.fastq"
,
"_2.fastq"
)).
path
else
""
val
cmd
=
"""
STAR --genomeDir ${star_index} --readFilesIn ${fastqFile} ${revReads} --runThreadN 6 ${optionalZcat} --outFileNamePrefix ${fastqBaseName}. --outSAMtype BAM SortedByCoordinate --outSAMstrandField intronMotif --sjdbGTFfile ${gtfFile} --outFilterIntronMotifs RemoveNoncanonicalUnannotated --outFilterType BySJout --quantMode GeneCounts --outFilterMultimapNmax 1 --outSJfilterCountUniqueMin 8 3 3 3
...
...
@@ -163,3 +180,4 @@ dge_bam_correlate `ls *.bam`
#dge_bam_correlate . &
"""
)
misc/fastqc_summary.R
View file @
f7692962
...
...
@@ -4,16 +4,17 @@
## Note This script is supposed to be knitr::spin'ed
devtools
::
source_url
(
"https://raw.githubusercontent.com/holgerbrandl/datautils/v1.
14
/R/core_commons.R"
)
devtools
::
source_url
(
"https://raw.githubusercontent.com/holgerbrandl/datautils/v1.
14
/R/ggplot_commons.R"
)
devtools
::
source_url
(
"https://raw.githubusercontent.com/holgerbrandl/datautils/v1.
36
/R/core_commons.R"
)
devtools
::
source_url
(
"https://raw.githubusercontent.com/holgerbrandl/datautils/v1.
36
/R/ggplot_commons.R"
)
## can we access variables from the parent spin.R process?
#echo("rscript is ", r_script)
argv
=
commandArgs
(
TRUE
)
# argv = c("fastqc")
#echo("argv is ", argv)
#if(str_detect(argv[1], "fastqc_summary")) argv
<-
argv[-1]
#if(str_detect(argv[1], "fastqc_summary")) argv
=
argv[-1]
if
(
length
(
argv
)
!=
1
){
stop
(
"Usage: fastqc_summmary.R <directory with fastqc results>"
)
...
...
@@ -32,24 +33,24 @@ if(is.na(file.info(baseDir)$isdir)){
fastqDataFiles
<-
list.files
(
path
=
baseDir
,
pattern
=
"*fastqc.zip"
,
full.names
=
TRUE
,
recursive
=
T
)
fastqDataFiles
=
list.files
(
path
=
baseDir
,
pattern
=
"*fastqc.zip"
,
full.names
=
TRUE
,
recursive
=
T
)
#echo("files are", fastqDataFiles)
get_zip_pipe
<-
function
(
zipFile
,
fileName
){
get_zip_pipe
=
function
(
zipFile
,
fileName
){
paste0
(
"unzip -p "
,
zipFile
,
" "
,
trim_ext
(
basename
(
zipFile
),
".zip"
),
"/"
,
fileName
)
}
#' ## Number of reads per run
#statsFile=fastqDataFiles[1]
readCount
<-
function
(
statsFile
){
readCount
=
function
(
statsFile
){
data.frame
(
run
=
trim_ext
(
basename
(
statsFile
),
c
(
"_fastqc.zip"
)),
num_reads
=
as.numeric
(
readLines
(
pipe
(
paste
(
get_zip_pipe
(
statsFile
,
"fastqc_data.txt"
),
"| grep -F 'Total Sequences' | cut -f2 -d'\t'"
))))
)
}
readCounts
<-
fastqDataFiles
%>%
ldply
(
readCount
)
# %>% print_head(
)
readCounts
=
map_df
(
fastqDataFiles
,
readCount
)
#+ fig.width=12, fig.height=round(nrow(readCounts)/2)
...
...
@@ -61,18 +62,18 @@ ggplot(readCounts, aes(run, num_reads)) + geom_bar(stat="identity") + coord_flip
### Create a faill/pass matrix
readSummary
<-
function
(
statsFile
){
readSummary
=
function
(
statsFile
){
# echo("reading", statsFile)
# statsFile="./fastqc/mouse_big_cyst_rep1_fastqc/summary.txt"
fileMapStats
<-
read.delim
(
pipe
(
get_zip_pipe
(
statsFile
,
"summary.txt"
)),
header
=
F
)
%>%
fileMapStats
=
read.delim
(
pipe
(
get_zip_pipe
(
statsFile
,
"summary.txt"
)),
header
=
F
)
%>%
set_names
(
c
(
"flag"
,
"score"
,
"run"
))
%>%
mutate
(
run
=
trim_ext
(
run
,
c
(
".fastq.gz"
,
"fastq"
)))
fileMapStats
}
qcSummary
<-
fastqDataFiles
%>%
ldply
(
readSummary
)
qcSummary
=
map_df
(
fastqDataFiles
,
readSummary
)
#' # Base Quality Distribution Summary
#+ fig.height=2+round(nrow(readCounts)/2), fig.width=12
...
...
@@ -86,7 +87,7 @@ qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
#' # Read duplication & Library Complexity
dupLevels
<-
fastqDataFiles
%>%
ldply
(
function
(
statsFile
){
dupLevels
=
map_df
(
fastqDataFiles
,
function
(
statsFile
){
data.frame
(
run
=
trim_ext
(
basename
(
statsFile
),
c
(
"_fastqc.zip"
)),
dedup_proportion
=
as.numeric
(
readLines
(
pipe
(
paste
(
get_zip_pipe
(
statsFile
,
"fastqc_data.txt"
),
"| grep -F 'Total Deduplicated Percentage' | cut -f2 -d'\t'"
))))
/
100
...
...
@@ -94,7 +95,11 @@ dupLevels <- fastqDataFiles %>% ldply(function(statsFile){
})
#+ fig.width=12, fig.height=round(nrow(dupLevels)/2)
ggplot
(
dupLevels
,
aes
(
run
,
dedup_proportion
))
+
geom_bar
(
stat
=
"identity"
)
+
coord_flip
()
+
scale_y_continuous
(
labels
=
percent
)
+
ggtitle
(
"unique_reads/total_reads"
)
+
ylim
(
0
,
1
)
ggplot
(
dupLevels
,
aes
(
run
,
dedup_proportion
))
+
geom_bar
(
stat
=
"identity"
)
+
coord_flip
()
+
scale_y_continuous
(
labels
=
percent
,
limits
=
c
(
0
,
1
))
+
ggtitle
(
"unique_reads/total_reads"
)
...
...
@@ -103,13 +108,13 @@ ggplot(dupLevels, aes(run, dedup_proportion)) + geom_bar(stat="identity") + coor
# Allows to compare positional patterns and differences between the runs
readBaseQualDist
<-
function
(
statsFile
){
readBaseQualDist
=
function
(
statsFile
){
# statsFile="./fastqc/mouse_big_cyst_rep2_fastqc/fastqc_data.txt"
# statsFile="./fastqc/mouse_liver_polar_stage3_rep2_fastqc/fastqc_data.txt"
# grep -A30 -F '>>Per base sequence quality' /Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc/mouse_big_cyst_rep1_fastqc/fastqc_data.txt | grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '
# echo("reading", statsFile)
baseStats
<-
read.delim
(
pipe
(
baseStats
=
read.delim
(
pipe
(
#http://stackoverflow.com/questions/1946363/how-do-i-display-data-from-the-beginning-of-a-file-until-the-first-occurence-of/1947950#1947950
paste
(
get_zip_pipe
(
statsFile
,
"fastqc_data.txt"
),
" | grep -A200 -F '>>Per base sequence quality' | perl -pe 'last if />>END_MODULE/' | head -n-2 | tail -n+2 | tr '#' ' '"
)
))
%>%
mutate
(
...
...
@@ -119,40 +124,50 @@ readBaseQualDist <- function(statsFile){
baseStats
%>%
mutate
(
base_order
=
1
:
n
())
}
baseQualities
<-
fastqDataFiles
%>%
ldply
(
readBaseQualDist
)
baseQualities
=
map_df
(
fastqDataFiles
,
readBaseQualDist
)
%>%
tbl_df
#statsFile="fastqc/L8038_Track-21511_R1.trim_fastqc.zip"
#with(baseQualities, as.data.frame(table(run)))
baseQualities
%<>%
mutate
(
first_base
=
map_int
(
Base
,
~
as.integer
(
str_split
(
.x
,
fixed
(
"-"
))[[
1
]][
1
])))
#+ fig.widthh=20
seqQualPlot
<-
baseQualities
%>%
ggplot
(
aes
(
reorder
(
Base
,
base_order
),
Mean
,
group
=
run
,
color
=
run
))
+
geom_line
()
+
scale_y_continuous
(
limits
=
c
(
2
,
40
))
seqQualPlot
=
baseQualities
%>%
ggplot
(
aes
(
first_base
,
Mean
,
group
=
run
,
color
=
run
))
+
geom_line
()
+
scale_y_continuous
(
limits
=
c
(
2
,
40
))
+
xlab
(
"base position"
)
+
ylab
(
"y mean base quality"
)
+
guides
(
color
=
F
)
+
ggtitle
(
"Track Base Qualities"
)
## just show color legend if not too many samples
if
(
unlen
(
baseQualities
$
run
)
>
20
)
seqQualPlot
<-
seqQualPlot
+
guides
(
color
=
F
)
if
(
unlen
(
baseQualities
$
run
)
>
20
)
seqQualPlot
=
seqQualPlot
+
guides
(
color
=
F
)
seqQualPlot
#' # Qualities per run including variance
runs
<-
with
(
baseQualities
,
as.data.frame
(
table
(
run
)))
runs
=
with
(
baseQualities
,
as.data.frame
(
table
(
run
)))
#+ warning=FALSE, fig.width=15, fig.height=3*ceiling(nrow(runs)/2)
## http://stackoverflow.com/questions/12518387/can-i-create-an-empty-ggplot2-plot-in-r
summary
(
baseQualities
)
baseQualities
%>%
# head(30) %>%
ggplot
()
+
geom_blank
(
mapping
=
aes
(
reorder
(
Base
,
base_order
)
))
+
geom_blank
(
mapping
=
aes
(
first_base
))
+
xlab
(
"base position"
)
+
ylab
(
"y mean base quality"
)
+
geom_rect
(
aes
(
xmin
=-
Inf
,
xmax
=
Inf
,
ymin
=-
Inf
,
ymax
=
20
),
data
=
runs
,
alpha
=
0.05
,
fill
=
"red"
)
+
geom_rect
(
aes
(
xmin
=-
Inf
,
xmax
=
Inf
,
ymin
=
20
,
ymax
=
28
),
data
=
runs
,
alpha
=
0.05
,
fill
=
colors
()[
654
])
+
geom_rect
(
aes
(
xmin
=-
Inf
,
xmax
=
Inf
,
ymin
=
28
,
ymax
=
Inf
),
data
=
runs
,
alpha
=
0.05
,
fill
=
"green"
)
+
geom_boxplot
(
mapping
=
aes
(
x
=
reorder
(
Base
,
base_order
)
,
ymin
=
X10th.Percentile
,
lower
=
Lower.Quartile
,
middle
=
Median
,
upper
=
Upper.Quartile
,
ymax
=
X90th.Percentile
),
mapping
=
aes
(
x
=
first_base
,
ymin
=
X10th.Percentile
,
lower
=
Lower.Quartile
,
middle
=
Median
,
upper
=
Upper.Quartile
,
ymax
=
X90th.Percentile
),
stat
=
"identity"
)
+
facet_wrap
(
~
run
,
ncol
=
3
)
+
theme
(
axis.text.x
=
element_blank
())
+
scale_y_continuous
(
limits
=
c
(
2
,
4
0
))
scale_y_continuous
(
limits
=
c
(
2
,
4
2
))
tool_setup.sh
View file @
f7692962
...
...
@@ -148,3 +148,12 @@ tar xvf ncbi-blast-2.5.0+-src.tar.gz
cd
ncbi-blast-2.5.0+-src/c++
./configure
make
## STAR aligner
cd
~/bin
wget https://github.com/alexdobin/STAR/archive/2.5.2b.tar.gz
tar
xvf 2.5.2b.tar.gz
cd
STAR-2.5.2b/sources
make
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment