Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
N
ngs_tools
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Model registry
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
bioinfo
ngs_tools
Commits
08a3bee1
Commit
08a3bee1
authored
10 years ago
by
Melanie Schneider
Browse files
Options
Downloads
Patches
Plain Diff
two tables
parent
522d5257
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
dge_workflow/cutadapt_summary.R
+100
-0
100 additions, 0 deletions
dge_workflow/cutadapt_summary.R
with
100 additions
and
0 deletions
dge_workflow/cutadapt_summary.R
+
100
−
0
View file @
08a3bee1
#!/usr/bin/env Rscript
#' # First try
##+ echo=FALSE, message=FALSE
## Note This script is supposed to be knitr::spin'ed

# Shared helper collections, sourced over the network. The rest of this script
# calls echo(), print_head(), ldply(), %>% and str_split_fixed(), none of which
# are defined in this file -- presumably these two files provide them (verify).
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")

## can we access variables from the parent spin.R process?
#echo("rscript is ", r_script)

# Exactly one trailing command-line argument is expected: the directory that
# holds the cutadapt log files.
argv <- commandArgs(TRUE)
#echo("argv is ", argv)
#if(str_detect(argv[1], "fastqc_summary")) argv <- argv[-1]

if (length(argv) != 1) {
  stop(
    "Usage: cutadapt_summary.R <directory with cutadapt log files>",
    call. = FALSE
  )
}

baseDir <- argv[1]

# file.info()$isdir is NA for a missing path and FALSE for a regular file;
# both must be rejected, so test for TRUE explicitly.
if (!isTRUE(file.info(baseDir)$isdir)) {
  stop(
    "directory '", baseDir, "' does not exist or is not a directory",
    call. = FALSE
  )
}

# NOTE: a hard-coded developer path used to overwrite baseDir at this point,
# which silently discarded the command-line argument validated above.

# Collect every cutadapt log below baseDir. The dots are escaped so that they
# match literally instead of acting as regex wildcards.
logDataFiles <- list.files(
  path = baseDir,
  pattern = "ca\\.fastq\\.gz\\.ca\\.log",
  full.names = TRUE,
  recursive = TRUE
)

# Everything below indexes logDataFiles[1]; fail early with a clear message
# instead of operating on NA.
if (length(logDataFiles) == 0) {
  stop(
    "no files matching 'ca.fastq.gz.ca.log' found below '", baseDir, "'",
    call. = FALSE
  )
}
#echo("files are", logDataFiles)
#' ## Title
echo("Quality Control Summary")

#' ## General information
# The general run information is taken from the first log file only.
# The file is read once in R and filtered with fixed-string matching. This
# replaces shelling out to `grep -F` with the path pasted unquoted into the
# command line (which broke on paths containing spaces or shell
# metacharacters) and avoids leaving pipe() connections unclosed.
first_log_lines <- readLines(logDataFiles[1], warn = FALSE)

# Line(s) mentioning "cutadapt".
info1 <- grep("cutadapt", first_log_lines, fixed = TRUE, value = TRUE)
echo(info1)

# Line(s) reporting the maximum error rate.
info2 <- grep("Maximum error rate", first_log_lines, fixed = TRUE, value = TRUE)
echo(info2)

# Line(s) reporting the number of adapters.
info3 <- grep("No. of adapters", first_log_lines, fixed = TRUE, value = TRUE)
echo(info3)
#' ## cutadapt parameters:
# The command line is taken from the first log file only. The file is read in
# R and filtered with fixed-string matching rather than through a shell `grep`
# built from an unquoted path.
parameters <- grep(
  "Command line parameters",
  readLines(logDataFiles[1], warn = FALSE),
  fixed = TRUE,
  value = TRUE
)
echo(parameters)

echo("Some explanation:")

# Short option -> explanation, listed in the order the notes are printed.
flag_notes <- c(
  "-a" = "-a indicates that the following is a 3' adapter.",
  "-g" = "-g indicates that the following is a 5' adapter.",
  "-b" = "-b indicates that the adapter is 3' or 5' (both possible).",
  "-m" = "Reads shorter than -m bases are thrown away.",
  "-q" = "Quality trimming is done with a threshold specified after -q.",
  "-p" = "option 'paired output' is used.",
  "-e" = "-e changes the error tolerance. (The default maximum error rate is 0.1)",
  "-O" = "The minimum overlap length is changed using -O.",
  "-N" = "Wildcard characters in the adapter are enabled by -N."
)

for (flag in names(flag_notes)) {
  # Only accept the flag at the start of a whitespace-separated token, so that
  # long options (e.g. "--anywhere") or file names (e.g. "sample-a.fastq") do
  # not trigger the note. An attached value ("-m20") still matches.
  # any() keeps the condition a single TRUE/FALSE even if zero or several
  # "Command line parameters" lines were found.
  if (any(grepl(paste0("(^|[[:space:]])", flag), parameters))) {
    echo(flag_notes[[flag]])
  }
}

echo("For more detailed information on cutadapt go to https://cutadapt.readthedocs.org/en/latest/index.html")
#' ## Table 1: information of each run
# Build the one-row summary of a single cutadapt log file.
#
# Args:
#   logFile: path to one cutadapt log file.
#
# Returns:
#   A one-row data.frame with the columns
#     run          - basename of logFile up to (excluding) its first dot
#     num_proReads - 2nd field of the "Processed reads" line (split on non-digits)
#     num_proBases - 2nd field of the "Processed bases" line (split on non-digits)
#     trim_reads   - 3rd field of the "Trimmed reads" line
#     qual_trimmed - 4th field of the "Quality-trimmed" line
#     trim_bases   - 4th field of the "Trimmed bases" line
#     too_short    - 3rd field of the "Too short reads" line
#   (the last four are split on anything that is not a digit or a dot).
#   A value is NA when the log contains no line with the corresponding label.
#
# The log is read once in R and filtered with fixed-string matching. The
# previous implementation ran six `grep -F` shell pipelines with the path
# pasted unquoted into the command, which failed on paths containing spaces
# or shell metacharacters and left pipe() connections unclosed.
genTable1 <- function(logFile) {
  log_lines <- readLines(logFile, warn = FALSE)

  # Numeric value of the `index`-th field obtained by splitting the line(s)
  # containing `label` on `split_pattern`. The first field is the empty string
  # in front of the label text, which is why the indices start at 2.
  field_of <- function(label, split_pattern, index) {
    hits <- grep(label, log_lines, fixed = TRUE, value = TRUE)
    as.numeric(unlist(strsplit(hits, split_pattern)))[index]
  }

  data.frame(
    run = sub("^([^.]*).*", "\\1", basename(logFile)),
    num_proReads = field_of("Processed reads", "[^0-9]+", 2),
    num_proBases = field_of("Processed bases", "[^0-9]+", 2),
    trim_reads = field_of("Trimmed reads", "[^0-9\\.]+", 3),
    qual_trimmed = field_of("Quality-trimmed", "[^0-9\\.]+", 4),
    trim_bases = field_of("Trimmed bases", "[^0-9\\.]+", 4),
    too_short = field_of("Too short reads", "[^0-9\\.]+", 3)
  )
}
# Table 1: one genTable1() row per log file, stacked by ldply() and then handed
# to print_head(); whatever print_head() returns is kept as table1.
table1 <- ldply(logDataFiles, genTable1) %>%
  print_head()
#' ## Table 2: adapter information (of each run)
# Build the per-adapter table for a single cutadapt log file.
#
# Args:
#   logFile: path to one cutadapt log file.
#
# Returns:
#   A data.frame with one row per "=== Adapter " line and the columns
#     run         - basename of logFile up to (excluding) its first dot
#     adapter     - text between the first pair of single quotes on the
#                   "=== Adapter " line
#     trimmed     - 5th field of the "; Trimmed: " line, split on non-digits
#     overlapped5 - leading digits of the "overlapped the 5" line
#     overlapped3 - leading digits of the "overlapped the 3" line
#   overlapped5/overlapped3 are NA when the log has no such lines at all.
#
# The log is read once in R and filtered with fixed-string matching. The
# previous implementation shelled out to `grep -F` with the path pasted
# unquoted into the command, which failed on paths containing spaces or shell
# metacharacters and left pipe() connections unclosed.
genTable2 <- function(logFile) {
  #browser()
  log_lines <- readLines(logFile, warn = FALSE)

  # All lines of the log containing `label` literally.
  lines_with <- function(label) {
    grep(label, log_lines, fixed = TRUE, value = TRUE)
  }

  adapter <- str_split_fixed(lines_with("=== Adapter "), "'", 3)[, 2]
  trimmed <- as.numeric(
    str_split_fixed(lines_with("; Trimmed: "), "[^0-9]+", 6)[, 5]
  )

  # Leading run of digits on each line containing `label`. If the log has no
  # such line, return NA for every adapter instead of a zero-length column,
  # which would make data.frame() fail on differing row counts.
  overlap_count <- function(label) {
    counts <- as.numeric(
      str_split_fixed(lines_with(label), "[^0-9]+", 2)[, 1]
    )
    if (length(counts) == 0) {
      counts <- rep(NA_real_, length(adapter))
    }
    counts
  }

  data.frame(
    run = sub("^([^.]*).*", "\\1", basename(logFile)),
    adapter = adapter,
    trimmed = trimmed,
    overlapped5 = overlap_count("overlapped the 5"),
    overlapped3 = overlap_count("overlapped the 3")
  )
}
# Table 2: the genTable2() rows of every log file, stacked by ldply() and then
# handed to print_head(); whatever print_head() returns is kept as table2.
table2 <- ldply(logDataFiles, genTable2) %>%
  print_head()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment