Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
N
ngs_tools
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Locked Files
Issues
15
Issues
15
List
Boards
Labels
Service Desk
Milestones
Requirements
Requirements
List
Security & Compliance
Security & Compliance
Dependency List
License Compliance
Analytics
Analytics
Insights
Issue
Repository
Value Stream
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
bioinfo
ngs_tools
Commits
3b6bc737
Commit
3b6bc737
authored
Jan 28, 2015
by
Holger Brandl
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
integrated cutadapt summary report
parent
9dac4f98
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
69 additions
and
32 deletions
+69
-32
dge_workflow/cutadapt_summary.R
dge_workflow/cutadapt_summary.R
+63
-24
dge_workflow/dge_utils.sh
dge_workflow/dge_utils.sh
+2
-1
dge_workflow/fastqc_summary.R
dge_workflow/fastqc_summary.R
+4
-7
No files found.
dge_workflow/cutadapt_summary.R
View file @
3b6bc737
...
...
@@ -31,7 +31,7 @@ if(is.na(file.info(baseDir)$isdir)){
# baseDir="/home/mel/MPI-Bioinf/Project1_reads/141126_cutadapt_logs"
logDataFiles
<-
list.files
(
path
=
baseDir
,
pattern
=
"
ca.fastq.gz.ca
.log"
,
full.names
=
TRUE
,
recursive
=
T
)
logDataFiles
<-
list.files
(
path
=
baseDir
,
pattern
=
"
__ca__.*.out
.log"
,
full.names
=
TRUE
,
recursive
=
T
)
#echo("files are", logDataFiles)
...
...
@@ -45,7 +45,8 @@ echo(info3)
#' ## Cutadapt Parameters:
parameters
=
readLines
(
pipe
(
paste
(
"grep -F 'Command line parameters' "
,
logDataFiles
[
1
])
))
echo
(
parameters
)
# #' `r parameters`
#' #### Some explanations:
if
(
grepl
(
"-a"
,
parameters
)
==
TRUE
)
...
...
@@ -53,16 +54,16 @@ if (grepl("-a", parameters) ==TRUE)
if
(
grepl
(
"-g"
,
parameters
)
==
TRUE
)
echo
(
"-g indicates that the following is a 5' end adapter."
)
if
(
grepl
(
"-b"
,
parameters
)
==
TRUE
)
echo
(
"-b indicates that the adapter is located at the 3' or 5' end (both possible)."
)
echo
(
"-b
:
indicates that the adapter is located at the 3' or 5' end (both possible)."
)
if
(
grepl
(
"-m"
,
parameters
)
==
TRUE
)
echo
(
"Reads shorter than -m bases are thrown away."
)
if
(
grepl
(
"-q"
,
parameters
)
==
TRUE
)
if
(
grepl
(
"-q
:
"
,
parameters
)
==
TRUE
)
echo
(
"Quality trimming is done with a threshold specified after -q."
)
if
(
grepl
(
"-p"
,
parameters
)
==
TRUE
)
if
(
grepl
(
"-p
:
"
,
parameters
)
==
TRUE
)
echo
(
"option 'paired output' is used."
)
if
(
grepl
(
"-e"
,
parameters
)
==
TRUE
)
if
(
grepl
(
"-e
:
"
,
parameters
)
==
TRUE
)
echo
(
"-e changes the error tolerance. (The default maximum error rate is 0.1)"
)
if
(
grepl
(
"-O"
,
parameters
)
==
TRUE
)
if
(
grepl
(
"-O
:
"
,
parameters
)
==
TRUE
)
echo
(
"The minimum overlap length is changed using -O."
)
if
(
grepl
(
"-N"
,
parameters
)
==
TRUE
)
echo
(
"Wildcard characters in the adapter are enabled by -N."
)
...
...
@@ -70,10 +71,12 @@ if (grepl("-N", parameters) ==TRUE)
#' #### For more detailed information on cutadapt go to https://cutadapt.readthedocs.org/en/latest/index.html
tidySamples
<-
function
(
sample
)
str_replace
(
sample
,
".*__ca__"
,
""
)
#' ## Trimming Overview
genTable1
<-
function
(
logFile
){
readSummaryStats
<-
function
(
logFile
){
data.frame
(
Run
=
sub
(
"^([^.]*).*"
,
"\\1"
,
basename
(
logFile
)),
Run
=
sub
(
"^([^.]*).*"
,
"\\1"
,
basename
(
logFile
))
%>%
tidySamples
,
No_of_processed_Reads
=
(
paste
(
"grep -F 'Processed reads' "
,
logFile
)
%>%
pipe
()
%>%
readLines
()
%>%
strsplit
(
"[^0-9]+"
)
%>%
unlist
()
%>%
as.numeric
())[
2
],
No_of_processed_Bases
=
(
paste
(
"grep -F 'Processed bases' "
,
logFile
)
%>%
pipe
()
%>%
readLines
()
%>%
strsplit
(
"[^0-9]+"
)
%>%
unlist
()
%>%
as.numeric
())[
2
],
Trimmed_Reads
=
(
paste
(
"grep -F 'Trimmed reads' "
,
logFile
)
%>%
pipe
()
%>%
readLines
()
%>%
strsplit
(
"[^0-9\\.]+"
)
%>%
unlist
()
%>%
as.numeric
())[
3
],
...
...
@@ -82,7 +85,7 @@ genTable1 <- function(logFile){
Too_short_Reads
=
(
paste
(
"grep -F 'Too short reads' "
,
logFile
)
%>%
pipe
()
%>%
readLines
()
%>%
strsplit
(
"[^0-9\\.]+"
)
%>%
unlist
()
%>%
as.numeric
())[
3
]
)
}
trimmingStats
<-
logDataFiles
%>%
ldply
(
genTable1
)
trimmingStats
<-
logDataFiles
%>%
ldply
(
readSummaryStats
)
#+ results = 'asis'
trimmingStats
%>%
head
()
%>%
kable
()
...
...
@@ -90,18 +93,46 @@ trimmingStats %>% head() %>% kable()
write.delim
(
trimmingStats
,
file
=
"cutadapt_summary.trimmingStats.txt"
)
#' [Trimming Statistics](cutadapt_summary.trimmingStats.txt)
myfun
<-
function
(
x
)
x
/
100
trimmingStats
<-
mutate_each
(
trimmingStats
,
funs
(
myfun
),
Trimmed_Reads
,
Quality_trimmed
,
Trimmed_Bases
,
Too_short_Reads
)
#+ fig.height=nrow(trimmingStats)/3, fig.width=12
require
(
scales
)
ggplot
(
trimmingStats
,
aes
(
Run
,
No_of_processed_Reads
))
+
geom_bar
(
stat
=
'identity'
)
+
coord_flip
()
+
scale_y_continuous
(
labels
=
comma
)
#ggplot(trimmingStats, aes(Run, No_of_processed_Bases)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=comma)
ggplot
(
trimmingStats
,
aes
(
Run
,
Trimmed_Reads
))
+
geom_bar
(
stat
=
'identity'
)
+
coord_flip
()
+
scale_y_continuous
(
labels
=
percent
)
+
## other layer with quality trimmed ones
geom_bar
(
aes
(
Run
,
Quality_trimmed
),
stat
=
'identity'
,
color
=
'red'
,
alpha
=
0.4
)
+
ylab
(
"Trimmed reads proportion (quality-related colored in in red)"
)
#ggplot(trimmingStats, aes(Run, Quality_trimmed)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=percent)
#ggplot(trimmingStats, aes(Run, Trimmed_Bases)) + geom_bar(stat='identity') + coord_flip() + scale_y_continuous(labels=percent)
ggplot
(
trimmingStats
,
aes
(
Run
,
Too_short_Reads
))
+
geom_bar
(
stat
=
'identity'
)
+
coord_flip
()
+
scale_y_continuous
(
labels
=
percent
)
+
ggtitle
(
"Discarded Reads Proportion"
)
#' ## Adapter trimming statistics
genTable2
<-
function
(
logFile
){
readAdapterStats
<-
function
(
logFile
){
#browser()
data.frame
(
Run
=
sub
(
"^([^.]*).*"
,
"\\1"
,
basename
(
logFile
)),
Run
=
sub
(
"^([^.]*).*"
,
"\\1"
,
basename
(
logFile
))
%>%
tidySamples
,
Adapter
=
(
paste
(
"grep -F '=== Adapter ' "
,
logFile
)
%>%
pipe
()
%>%
readLines
()
%>%
str_split_fixed
(
"'"
,
3
))[,
2
],
Trimmed
=
(
paste
(
"grep -F '; Trimmed: ' "
,
logFile
)
%>%
pipe
()
%>%
readLines
()
%>%
str_split_fixed
(
"[^0-9]+"
,
6
))[,
5
]
%>%
as.numeric
(),
Overlapped_at_5prime
=
(
paste
(
"grep -F 'overlapped the 5' "
,
logFile
)
%>%
pipe
()
%>%
readLines
()
%>%
str_split_fixed
(
"[^0-9]+"
,
2
)
)[,
1
]
%>%
as.numeric
(),
Overlapped_at_3prime
=
(
paste
(
"grep -F 'overlapped the 3' "
,
logFile
)
%>%
pipe
()
%>%
readLines
()
%>%
str_split_fixed
(
"[^0-9]+"
,
2
)
)[,
1
]
%>%
as.numeric
()
)
}
adapterTrimmingStats
<-
logDataFiles
%>%
ldply
(
genTable2
)
adapterTrimmingStats
<-
logDataFiles
%>%
ldply
(
readAdapterStats
)
#+ results = 'asis'
adapterTrimmingStats
%>%
head
()
%>%
kable
()
...
...
@@ -110,14 +141,22 @@ adapterTrimmingStats %>% head() %>% kable()
write.delim
(
adapterTrimmingStats
,
file
=
"cutadapt_summary.adapterTrimmingStats.txt"
)
#' [Adapter Statistics](cutadapt_summary.adapterTrimmingStats.txt)
#+ fig.height=10, fig.width=10
ggplot
(
trimmingStats
,
aes
(
Run
,
No_of_processed_Reads
))
+
geom_bar
(
stat
=
'identity'
)
+
coord_flip
()
ggplot
(
trimmingStats
,
aes
(
Run
,
No_of_processed_Bases
))
+
geom_bar
(
stat
=
'identity'
)
+
coord_flip
()
ggplot
(
trimmingStats
,
aes
(
Run
,
Trimmed_Reads
))
+
geom_bar
(
stat
=
'identity'
)
+
coord_flip
()
ggplot
(
trimmingStats
,
aes
(
Run
,
Quality_trimmed
))
+
geom_bar
(
stat
=
'identity'
)
+
coord_flip
()
ggplot
(
trimmingStats
,
aes
(
Run
,
Trimmed_Bases
))
+
geom_bar
(
stat
=
'identity'
)
+
coord_flip
()
ggplot
(
trimmingStats
,
aes
(
Run
,
Too_short_Reads
))
+
geom_bar
(
stat
=
'identity'
)
+
coord_flip
()
ggplot
(
adapterTrimmingStats
,
aes
(
Run
,
Trimmed
))
+
geom_bar
(
stat
=
'identity'
)
+
facet_wrap
(
~
Adapter
)
+
coord_flip
()
ggplot
(
adapterTrimmingStats
,
aes
(
Run
,
Overlapped_at_5prime
))
+
geom_bar
(
stat
=
'identity'
)
+
facet_wrap
(
~
Adapter
)
+
coord_flip
()
ggplot
(
adapterTrimmingStats
,
aes
(
Run
,
Overlapped_at_3prime
))
+
geom_bar
(
stat
=
'identity'
)
+
facet_wrap
(
~
Adapter
)
+
coord_flip
()
#with(adapterTrimmingStats, Trimmed==Overlapped_at_3prime+Overlapped_at_5prime)
#ggplot(adapterTrimmingStats, aes(Run, Trimmed)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_5prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#ggplot(adapterTrimmingStats, aes(Run, Overlapped_at_3prime)) + geom_bar(stat='identity') + facet_wrap(~Adapter) + coord_flip() + scale_y_continuous(labels=comma)
#+ fig.height=nrow(trimmingStats), fig.width=12
adapterTrimmingStats
%>%
select
(
-
Trimmed
)
%>%
melt
()
%>%
mutate
(
overlap_at
=
str_replace
(
variable
,
"Overlapped_at_"
,
""
))
%>%
ggplot
(
aes
(
Run
,
value
,
fill
=
overlap_at
))
+
geom_bar
(
stat
=
'identity'
)
+
facet_wrap
(
~
Adapter
)
+
coord_flip
()
+
scale_y_continuous
(
labels
=
comma
)
+
ylab
(
"# num reads"
)
+
ggtitle
(
"Adapter trimming by overlap"
)
dge_workflow/dge_utils.sh
View file @
3b6bc737
...
...
@@ -10,6 +10,7 @@ export PATH=/projects/bioinfo/holger/bin/tophat-2.0.13.Linux_x86_64:$PATH
export
PATH
=
/home/brandl/bin/cufflinks-2.2.1.Linux_x86_64:
$PATH
export
PATH
=
/sw/apps/python/current/bin:
$PATH
export
PATH
=
/home/brandl/bin/deepTools/bin:
$PATH
export
PATH
=
/projects/bioinfo/holger/bin/FastQC_0.11.2:
$PATH
# which tophat; which bowtie2; which cuffdiff
...
...
@@ -78,7 +79,7 @@ for fastqFile in $* ; do
echo
"cutadapting
$caFastq
into
$caFastq
"
#todo use a more specific trimming model (trim just correct part on each side without using reverse complements
mysub
"
$
project
__ca__
$(
basename
$fastqFile
.fastq.gz
)
"
"cutadapt -b file:
$Ill_ADAPTERS
-m 20 -q 25 -o
$caFastq
$fastqFile
"
-q
long | joblist .cajobs
mysub
"
$
{
project
}
__ca__
$(
basename
$fastqFile
.fastq.gz
)
"
"cutadapt -b file:
$Ill_ADAPTERS
-m 20 -q 25 -o
$caFastq
$fastqFile
"
-q
long | joblist .cajobs
done
wait4jobs .cajobs
...
...
dge_workflow/fastqc_summary.R
View file @
3b6bc737
...
...
@@ -52,7 +52,7 @@ require.auto(scales)
gg
<-
ggplot
(
readCounts
,
aes
(
run
,
num_reads
))
+
geom_bar
(
stat
=
"identity"
)
+
coord_flip
()
+
scale_y_continuous
(
labels
=
comma
)
+
ggtitle
(
"read counts"
)
#+ results='asis'
gg
%>%
ggsave2
(
width
=
1
5
,
height
=
round
(
nrow
(
readCounts
)
/
4
),
limitsize
=
FALSE
)
%>%
paste0
(
"<img src='"
,
.
,
"'><br>"
)
%>%
cat
()
gg
%>%
ggsave2
(
width
=
1
0
,
height
=
round
(
nrow
(
readCounts
)
/
4
),
limitsize
=
FALSE
)
%>%
paste0
(
"<img src='"
,
.
,
"'><br>"
)
%>%
cat
()
### Create a faill/pass matrix
...
...
@@ -70,18 +70,15 @@ readSummary <- function(statsFile){
qcSummary
<-
list.files
(
path
=
baseDir
,
pattern
=
"^summary.txt"
,
full.names
=
TRUE
,
recursive
=
T
)
%>%
ldply
(
readSummary
)
#+ fig.height=20
gg
<-
qcSummary
%>%
ggplot
(
aes
(
score
,
run
,
fill
=
tolower
(
flag
)))
+
#' # Base Quality Distribution Summary
#+ fig.height=2+round(nrow(readCounts)/4), fig.width=12
qcSummary
%>%
ggplot
(
aes
(
score
,
run
,
fill
=
tolower
(
flag
)))
+
geom_tile
()
+
rotXlab
()
+
scale_fill_manual
(
values
=
c
(
fail
=
"darkred"
,
pass
=
"darkgreen"
,
warn
=
"orange"
))
+
ggtitle
(
"fastqc passcodes"
)
#+ results='asis'
gg
%>%
ggsave2
(
width
=
15
,
height
=
round
(
nrow
(
readCounts
)
/
4
),
limitsize
=
FALSE
)
%>%
paste0
(
"<img src='"
,
.
,
"'><br>"
)
%>%
cat
()
#' # Base Quality Distribution Summary
#' Median qualities per position to compare positional patterns and differences between the runs
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment