Commit eb8199af authored by Holger Brandl's avatar Holger Brandl

reorganized and cleaned up the repo for better maintainability

parent 5b89e4ad
Copyright (c) 2016, Max Planck Institute of Molecular Cell Biology and
Copyright (c) 2017, Max Planck Institute of Molecular Cell Biology and
Genetics, Dresden, Germany
All rights reserved.
......
spin.R - A shell-wrapper for knitr::spin
===
Installation
---
Download a local copy and add it to your path using
```
targetDirectory=~/bin
wget -P $targetDirectory --no-check-certificate https://raw.githubusercontent.com/holgerbrandl/datautils/master/R/spinr/spin.R
chmod +x $targetDirectory/spin.R
export PATH=$targetDirectory:$PATH
```
Usage
---
To use it from a shell, call spin.R directly with a script as its argument.
```
spin.R MyScript.R
```
The report will be created in the current working directory. To learn about the options, just call `spin.R --help`.
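For orientation, this is roughly what spin.R does under the hood (a minimal sketch using knitr directly, assuming `knitr` and `markdown` are installed; the actual script additionally handles arguments, CSS and chunk options):
```
library(knitr)
spin("MyScript.R", knit = FALSE)  # turn MyScript.R into MyScript.Rmd
knit2html("MyScript.Rmd")         # render MyScript.html into the working directory
```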
If you want to spin snippets, you can source a small bash function that wraps spin.R:
```
source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/master/R/spinr/spin_utils.sh 2>&1 2>/dev/null)
```
Now you can spin R snippets by piping them into `spinsnip`
```
echo "require(ggplot2); ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point()" | spinsnip "my_report"
```
#!/usr/bin/env Rscript
# similar http://stackoverflow.com/questions/10943695/what-is-the-knitr-equivalent-of-r-cmd-sweave-myfile-rnw
#http://stackoverflow.com/questions/3433603/parsing-command-line-arguments-in-r-scripts
#https://github.com/edwindj/docopt.R
#http://www.slideshare.net/EdwindeJonge1/docopt-user2014
# load the docopt library
suppressMessages(library(docopt))
# retrieve and parse the command-line arguments
doc <- '
Use knitr to spin R documents
Usage: spin.R [options] <r_script> [<quoted_script_args>]
Options:
--toc Add a table of contents
-c Cache results
-e Show Code
-w Show warnings
-m Show Messages
--keep Keep generated Rmd and md files
'
#!docopt(doc, "-w test a b c ")$keep
#spin_opts <- docopt(doc, "fastqc")
#docopt(doc, "-w test a b c ")$"-w"
#spin_opts <- docopt(doc, "$DGE_HOME/dge_analysis.R \"--undirected --qcutoff 0.05 --minfpkm 2 ..\"")
spin_opts <- docopt(doc)
#print(spin_opts)
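## Illustration (not executed): docopt() maps the usage string above to a named list, so a call like
##   spin.R --toc -e MyScript.R "--cutoff 0.05"
## yields roughly spin_opts$toc == TRUE, spin_opts$e == TRUE,
## spin_opts$r_script == "MyScript.R" and spin_opts$quoted_script_args == "--cutoff 0.05"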
r_script <- spin_opts$r_script
keep_markdown_files <- as.logical(spin_opts$keep)
if(keep_markdown_files){
print("keeping markdown files")
}
if(!file.exists(r_script)){
stop(paste("file does not exist\n", doc))
}
require(plyr)
require(knitr)
require(stringr)
options(width=150)
#https://groups.google.com/forum/#!topic/knitr/ojcnq5Nm298
## better table css: http://www.stat.ubc.ca/~jenny/STAT545A/topic10_tablesCSS.html
commandArgs <- function(trailingOnly = FALSE){ return(as.character(spin_opts$quoted_script_args)) }
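## Illustration: with the override above, a script spun via
##   spin.R my_analysis.R "--qcutoff 0.05 --minfpkm 2"
## will see commandArgs() return "--qcutoff 0.05 --minfpkm 2" instead of Rscript's own arguments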
#print("trimmed args are")
#print(commandArgs())
#print("end args")
## todo: use a temp file name here to allow for concurrent spin.R runs in the same directory --> pointless because results would be overwritten anyway
## copy R-script to working directory and forget about the original one
#file.copy(r_script, basename(r_script))
system(paste("cat ", r_script," | grep -Ev '^#+$' | grep -Fv '#!/usr/bin/env Rscript' >", basename(r_script)))
r_script <- basename(r_script)
spin(r_script, knit=F)
file.remove(basename(r_script))
rmdScript <- str_replace(r_script, "[.]R$", ".Rmd")
# system(paste("mv", rmdScript, "tmp.Rmd"))
#system(paste("cat tmp.Rmd | grep -Ev '^#+$' | grep -Fv '#!/usr/bin/env Rscript' >", basename(rmdScript)))
#file.remove("tmp.Rmd")
cssHeader='
<style type="text/css">
body {
max-width: 90%;
}
</style>
<!-- add jquery datatable support -->
<link rel="stylesheet" type="text/css" href="http://cdn.datatables.net/1.10.4/css/jquery.dataTables.css">
<script type="text/javascript" charset="utf8" src="http://code.jquery.com/jquery-2.1.2.min.js"></script>
<script type="text/javascript" charset="utf8" src="http://cdn.datatables.net/1.10.4/js/jquery.dataTables.js"></script>
'
#<!-- add bootstrap support -->
#<link href="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css" rel="stylesheet">
#<script src="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/js/bootstrap.min.js"></script>
## custom title http://stackoverflow.com/questions/14124022/setting-html-meta-elements-with-knitr
opts_chunk$set(
cache = spin_opts$c,
message= spin_opts$m,
warning= spin_opts$w,
echo= spin_opts$e,
fig.width=15,
width=200
)
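## e.g. `spin.R -e -w MyScript.R` echoes the code and shows warnings in the report,
## while messages stay hidden and chunk caching remains off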
knitOptions <- c()
## optionally render toc
## see http://stackoverflow.com/questions/25872558/generating-a-table-of-contents-toc-when-using-knitrs-spin
if(spin_opts$toc){
knitOptions= c("toc", markdown::markdownHTMLOptions(TRUE))
}
knit2html(basename(rmdScript), header=cssHeader, options = knitOptions)
## also remove the .md and the .Rmd files
if(is.logical(keep_markdown_files) && !keep_markdown_files){
file.remove(basename(rmdScript))
file.remove(basename(str_replace(r_script, "[.]R$", ".md")))
}
spinr(){
## test if present in PATH
if [ -z "$(which spin.R)" ]; then
>&2 echo "spin.R is not installed. See https://github.com/holgerbrandl/datautils/tree/master/R/spinr for details"
fi
spin.R "$@"
}
export -f spinr
spinsnip(){
if [ $# -lt 1 ]; then
>&2 echo "Usage: spinsnip <report name> [other args]*"
>&2 echo "The R snippet to be spinned will be read from standard input."
return
fi
reportName=$1
tmpR=$(mktemp -d)/$(echo $reportName | tr " " "_").R
## http://stackoverflow.com/questions/11454343/pipe-output-to-bash-function
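# translate "#>" snippet markers into the "#'" markers that knitr::spin treats as documentation lines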
cat | sed 's/#>/#'"'"'/g' > $tmpR
echo "spining $tmpR..."
shift
spinr -e $tmpR $*
# rm -r $(dirname $tmpR)
rm ${tmpR}
}
export -f spinsnip
## usage example
# echo '
# > # test report
# 1+1;
# ggplot(iris, aes(Sepal.Width)) + geom_histogram()
# ' | spinsnip some_report
# requires
# devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.45/R/core_commons.R")
if(!exists("load_pack")){
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.45/R/core_commons.R")
}
if(!exists("multiplot")){
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.45/R/ggplot_commons.R")
}
calc_ci = function(df, variable, ci_interval=0.95){
variable <- enquo(variable)
......@@ -62,6 +68,7 @@ plot_ci = function(grpData, variable, ci_interval=0.95){
## todo support model formula instead of group data
interaction_plot = function(grpData, variable, ci_interval=0.95){
variable <- enquo(variable)
# fail if there are not exactly 2 grouping attributes
......@@ -86,15 +93,19 @@ interaction_plot = function(grpData, variable, ci_interval=0.95){
geom_errorbar(aes(ymin = mean - ci, ymax = mean + ci, y = NULL), data = ciData, width = .2, size = 0.9, position = position_dodge(width = dodge_with)) +
geom_line(aes(y = mean, group = eval(rlang::UQE(groupVar2))), position = position_dodge(width = dodge_with), data = ciData) +
xlab(groupVar1) +
ylab(quo_name(variable))
ylab(quo_name(variable)) +
guides(color=guide_legend(groupVar2))
gg
}
two_way_interaction = function(grpData, variable){
# Example:
# grpData = ToothGrowth %>% group_by(supp, as.factor(dose))
## invert the grouping
# grpData = ToothGrowth %>% group_by(supp, as.factor(dose)) # debug leftover; do not overwrite the argument
groupVar1 = groups(grpData)[[1]]
groupVar2 = groups(grpData)[[2]]
......@@ -107,6 +118,14 @@ two_way_interaction = function(grpData, variable){
}
## EXAMPLES-START
if(F){
# interaction_plot(ToothGrowth %>% group_by(supp, as.factor(dose)),len)
ToothGrowth %>% group_by(supp, as.factor(dose)) %>% interaction_plot(len)
}
## EXAMPLES-END
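## more illustrative calls (assuming the elided calc_ci body returns per-group mean and ci columns):
# iris %>% group_by(Species) %>% calc_ci(Petal.Length)
# iris %>% group_by(Species) %>% plot_ci(Petal.Length, ci_interval = 0.9)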
########################################################################################################################
### DEV PLAYGROUND
......@@ -114,7 +133,8 @@ two_way_interaction = function(grpData, variable){
## DEBUG-START
if(F){
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.43/R/ggplot_commons.R")
source("/Users/brandl/Dropbox/projects/datautils/R/stats/ci_commons.R")
lmModel = lm(len ~ supp*dose, data = ToothGrowth)
varNames = attr(attr(lmModel$terms, "factors"), "dimnames")[[1]]
......@@ -135,53 +155,10 @@ if(F){
lmModel$model
}
ToothGrowth %>% group_by(supp, as.factor(dose)) %>% interaction_plot2(len)
}
## DEBUG-END
if(F){
# iris %>% ggplot(aes(Sepal.Width)) + geom_histogram()
iris %>%
group_by(Species) %>%
plot_ci(Petal.Length)
iris %>%
group_by(Sepal.Width > 3, Species) %>%
plot_ci(Petal.Length)
iris %>%
group_by(Sepal.Width > 3, Sepal.Width > 4, Species) %>%
plot_ci(Petal.Length)
## see https://stackoverflow.com/questions/43405843/how-to-use-the-devel-version-of-dplyrs-enquo-and-quo-name-in-a-function-with-ti/43601059
## also see https://stackoverflow.com/questions/45279287/use-dplyr-se-with-ggplot2
#' generic simple examples
some_plot = function(data, var){
variable <- enquo(var)
tt = quo_name(variable)
ggplot(data, aes_string(tt)) + geom_histogram()
}
some_plot(iris, Sepal.Length) # works
## without using aes_string
some_plot = function(data, var){
variable <- enquo(var)
# ggplot(data, aes(eval(rlang::UQE(variable))+eval(rlang::UQE(variable)))) + geom_histogram()
ggplot(data, aes(eval(rlang::UQE(variable)))) + geom_histogram()
}
some_plot(iris, Petal.Length)
foo = function(){
}
}
Some little helpers to work with data, and bio-data in particular.
# `datautils`
Some little [R](http://r-project.org/) helpers to work with data, and bio-data in particular.
To use them just source them when needed.
......@@ -8,9 +10,11 @@ R
===
The R bits are split up into tools for
* general data handling
* plotting using ggplot2
* bioinformatics using various bioconductor packages
* general data handling, see [`core_commons.R`](R/core_commons.R)
* plotting using ggplot2, see [`ggplot_commons.R`](R/ggplot_commons.R)
* bioinformatics using various bioconductor packages, see [`bioinfo_commons.R`](R/bio/bioinfo_commons.R)
and some more.
```
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/master/R/core_commons.R")
......@@ -18,101 +22,18 @@ devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/m
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/master/R/bio/bioinfo_commons.R")
```
Bash
===
LSF Cluster Utils:
```
source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/master/bash/lsf_utils.sh 2>&1 2>/dev/null)
```
Tools to simplify bio-data processing in bash:
```
source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/master/bash/bioinfo_utils.sh 2>&1 2>/dev/null)
```
Versioning
===
To allow for reproducible research, we regularly create [version tags](https://github.com/holgerbrandl/datautils/releases).
Installation
============
Eg. you could use the stable `v1.3` tag
To allow for reproducible research, we prefer [version tags](https://github.com/holgerbrandl/datautils/releases) over CRAN deployment. You can use these tags when writing your workflows. E.g. you could use the stable `v1.45` tag
```
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.3/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.45/R/core_commons.R")
```
instead of the development copy form the master-branch copy
Alternatively, to use the latest master-branch version (which is subject to constant change), use
```
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/master/R/datatable_commons.R")
```
## install as package
```bash
R -e "devtools::create('tt')"
# move DESCRIPTION and NAMESPACE
```
In R we can load the package from source with
```r
devtools::load_all()
```
This will just load the top-level scripts directly under `R/`.
Further reading:
* https://uoftcoders.github.io/studyGroup/lessons/r/packages/lesson/
* https://github.com/jtleek/rpackages
## module system for namespace isolation
How to organize large R programs? https://stackoverflow.com/a/1319786/590437
```
util = new.env()
util$bgrep = function [...]
util$timeit = function [...]
while("util" %in% search())
    detach("util")
attach(util)
```
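Assuming `util` has been populated and attached as sketched above, its helpers then resolve by name without the `util$` prefix:
```r
"util" %in% search()   # TRUE while the environment is attached
ls("util")             # lists the helpers, e.g. bgrep and timeit
```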
## how to install package with multiple namespaces
https://stackoverflow.com/questions/3094232/add-objects-to-package-namespace
```r
myfun <- function(x) print(x)
environment(myfun) <- as.environment("package:foo")
```
https://stackoverflow.com/questions/9002544/how-to-add-functions-in-an-existing-environment
## docker testing
```bash
docker pull rocker/tidyverse
docker run --rm -it rocker/tidyverse /bin/bash
```
```r
devtools::install_github("vqv/ggbiplot")
```
\ No newline at end of file
#!/usr/bin/env Rscript
#sessionInfo()
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.20/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.20/R/ggplot_commons.R")
require_auto(lubridate)
if(!exists("reportName")){
argv = commandArgs(TRUE)
if(length(argv) == 0){
reportName=".jobs"
# reportName=".ipsjobs"
# reportName=".trinjob"
# reportName=".blastn"
# reportName=".failchunksblastx"
# stop("Usage: RemoveContaminants.R <assemblyBaseName>")
}else{
reportName=argv[1]
}
}
reportNiceName <- str_replace_all(reportName, "^[.]", "")
#> # Job Report: `r reportNiceName`
echo("processing job report for '", reportName,"'")
jobData <- read.table(paste0(reportName, ".cluster_snapshots.txt"), header=F, fill=T) %>% as.df() %>%
set_names(c("jobid", "user", "stat", "queue", "from_host", "exec_host", "job_name", "submit_time", "proj_name", "cpu_used", "mem", "swap", "pids", "start_time", "finish_time", "snapshot_time")) %>%
transform(jobid=factor(jobid)) %>%
arrange(jobid) %>%
subset(stat=="RUN")
if(nrow(jobData)==0){
system(paste("mailme 'no jobs were run in ",normalizePath(reportName),"'"))
warning(paste("no jobs were run in ",normalizePath(reportName)))
stop(-1)
}
#jobData %>% count(jobid) %>% nrow
## extract multi-threading number
jobData %<>% transform(num_cores=str_match(exec_host, "([0-9]+)[*]n")[,2]) %>% mutate(num_cores=ifelse(is.na(num_cores), 1, num_cores))
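## e.g. (illustrative) an exec_host entry like "8*n42" yields num_cores = 8; plain host names without
## the "N*" prefix fall back to 1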
jobData %>% count(exec_host)
jobData %>% select(submit_time, start_time, finish_time) %>% head
# filter(finish_time!="-") %>% head
#parse_date_time(ac("00:00:00.00"), c("%d:%H:%M.%S"))
#parse_date_time(ac("00:04:55.18"), c("%d:%H%M%S"))
## parse the submission time
curYear=str_match(ac(jobData$snapshot_time[1]), "-([0-9]*)_")[,2]
convertTimes <- function(someDate) parse_date_time(paste0(curYear, ac(someDate)), c("%Y/%m/%d-%H%M%S"))
#convertedTimes <- colwise(convertTimes, .(submit_time, start_time, finish_time))(jobData)
#jobData <- cbind(subset(jobData, select=!(names(jobData) %in% names(convertedTimes))), convertedTimes)
jobData %<>% mutate_each(funs(convertTimes), submit_time, start_time, finish_time)
jobData <- transform(jobData, snapshot_time=parse_date_time(ac(snapshot_time), c("%d-%m-%y_%H%M%S")))
splitCPU <- str_split_fixed(jobData$cpu_used, "[.]", 2)[,1]
splitCPUhms <- str_split_fixed(splitCPU, ":", 3)
cpuSecs <- 3600*as.numeric(splitCPUhms[,1]) + 60*as.numeric(splitCPUhms[,2]) + as.numeric(splitCPUhms[,3])
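## worked example: cpu_used "03:25:10.45" -> splitCPU "03:25:10" -> 3*3600 + 25*60 + 10 = 12310 seconds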
#splitCPU <- str_sub(splitCPU, 2, str_length(splitCPU))
#as.numeric(as.difftime(jobData[22,]$cpu_used_hms, units="secs"))
#jobData <- mutate(jobData, cpu_used_hms=hms(ac(splitCPU)), cpu_used_secs=as.numeric(as.difftime(cpu_used_hms, units="secs")), cpu_used_hours=cpu_used_secs/3600)
jobData <- mutate(jobData, cpu_used_secs=cpuSecs, cpu_used_hours=cpu_used_secs/3600)
jobData <- mutate(jobData, exec_time=difftime(snapshot_time, start_time, units="secs"), exec_time_min=as.numeric(exec_time)/60, exec_time_hours=as.numeric(exec_time)/3600)
## add the queue limits
wallLimits <- c(short=1, medium=8, long=96)
jobData <- mutate(jobData, queueLimit=wallLimits[ac(queue)])
#tt <- head(subset(jobData, is.na(cpu_used_secs)), 100)
#subset(jobData, cpu_used_secs==max(jobData$cpu_used_secs))
#with(jobData, as.data.frame(table(is.na(cpu_used_secs))))
if(max(jobData$cpu_used_secs)==0){
stop(echo("stopping job report generation for", reportName, "because no cpu time has been consumed"))
quit()
}
## todo: use rollapply to calculate a better normalized cpu usage over time
#ggplot(jobData, aes(exec_time_min, cpu_used_secs/(60*exec_time_min), group=jobid)) + geom_line(alpha=0.3) + ggtitle("normalized cpu usage")
#ggsave2()
save(jobData, file=paste0(reportName, ".cluster_snapshots.RData"))
#jobData <- local(get(load(concat(reportName, ".cluster_snapshots.RData"))))
#ggplot(jobData, aes(exec_time_min, cpu_used_secs, group=jobid)) + geom_line(alpha=0.3) + geom_smooth() + ggtitle("accumulated cpu usage")
ggplot(jobData, aes(exec_time_hours, cpu_used_hours, group=jobid)) + geom_line(alpha=0.3) + ggtitle("accumulated cpu usage") + geom_vline(aes(xintercept=queueLimit), color="red")
#### usage per time interval
jobDataSlim <- with(jobData, data.frame(jobid, num_cores, cpu_used_secs, exec_time=as.numeric(exec_time)))
jobDataCPUChange = ddply(jobDataSlim, .(jobid), subset, diff(cpu_used_secs)!=0)
smoothData <- ddply(jobDataCPUChange, .(jobid), mutate, exec_period=c(NA, diff(as.numeric(exec_time))), cpu_usage_in_period=c(NA, diff(cpu_used_secs)))
smoothData[is.na(smoothData)] <- 0
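## per job, cpu_usage_in_period is the growth of cpu_used_secs between consecutive snapshots with a change;
## dividing it by exec_period * num_cores (as in the plot below) gives a core-normalized utilization in [0, 1]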
#ggplot(smoothData, aes(exec_time, cpu_usage_in_period, color=jobid)) + geom_line()
ggplot(subset(smoothData, cpu_usage_in_period>0), aes(exec_time/3600, cpu_usage_in_period/(exec_period* as.numeric(as.character(num_cores))), color=num_cores, group=jobid)) +
geom_line(alpha=0.3) +
xlab("exec time [hours]") +
ylab("core normalized cpu usage") # + scale_color_discrete(name="jobid")
#######################################################################################################################
### summarize the jobs
jobSummaries <- mutate(subset(plyr::arrange(jobData, -1* exec_time), !duplicated(jobid)), pending_time=difftime(start_time, submit_time, units="secs"), pending_time_min=as.numeric(pending_time)/60)
jobSummaries <- transform(jobSummaries, jobid=reorder(jobid, as.numeric(jobid)))
#ggplot(jobSummaries, aes(pending_time_min)) + geom_histogram() + ggtitle("pending times") + coord_flip()
if(nrow(jobSummaries)<50){
ggplot(jobSummaries, aes(reorder(jobid, -as.numeric(jobid)), pending_time_min/60)) + geom_bar(stat="identity") + ggtitle("pending times") + coord_flip() + xlab("job id")
}else{
ggplot(jobSummaries, aes(as.numeric(jobid), pending_time_min/60)) + geom_area() + ggtitle("pending times")+xlab("job_nr") + ylab("pending time [h]")
}
#ggsave2(p=reportName)
if(nrow(jobSummaries)<50){
ggplot(jobSummaries, aes(reorder(jobid, -as.numeric(jobid)), exec_time_hours)) + geom_bar(stat="identity") + ggtitle("job execution times") + coord_flip() + xlab("job id")
}else{
ggplot(jobSummaries, aes(as.numeric(jobid), exec_time_hours)) + geom_area() + ggtitle("job execution times")+ xlab("job_nr") + geom_hline(mapping=aes(yintercept=queueLimit), color="red")
}
#ggplot(jobSummaries, aes(as.numeric(jobidx), exec_time_min/pending_time_min)) + geom_area() + ggtitle("pending vs exec time ratio")+xlab("job_nr")
ggplot(jobSummaries, aes(exec_time_min, pending_time_min)) + geom_point() + ggtitle("pending vs exec time") + geom_abline()
jobSummaries %<>% mutate(exceeded_queue_limit=exec_time_hours>queueLimit)
write.delim(jobSummaries, file=paste0(reportName, ".jobSummaries.txt"))
# jobSummaries <- read.delim("jobSummaries.txt")
require_auto(knitr)
jobSummaries %>% mutate(pending_time_hours=pending_time_min/60) %>% select(jobid, exec_host, job_name, cpu_used_hours, pending_time_hours, exec_time_hours) %>% kable()
#######################################################################################################################
## create warning email if jobs died
## todo: finish sending mail if wall time was exceeded
numKilled=nrow(filter(jobSummaries, exceeded_queue_limit))
numTotal= nrow(jobSummaries)
killedListFile=paste0(reportName, ".killed_jobs.txt")
if(numKilled >0){
system(paste("mailme '",numKilled,"out of ",numTotal," jobs in ", getwd(), " died because of queue length limitation'"))
filter(jobSummaries, exceeded_queue_limit) %$% writeLines(jobid, con=killedListFile)
}else{
## Create an empty killed list to indicate that we actually looked into it
file.create(killedListFile)
}
## Tophat Mapping Report from the logs
TophatMappingReport(){
echo '
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.9/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.9/R/ggplot_commons.R")
parseAlgnSummary_T2_0_11 <- function(alignSummary){
#alignSummary="/projects/bioinfo/brandl/projects/marta_rnaseq/human_leipzig/mapping/S5382_aRG_1b_rep1/align_summary.txt"
algnData <- readLines(alignSummary)
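#   the regexes below assume an align_summary.txt layout roughly like (illustrative):
#     Reads:
#               Input     :  25000000
#                Mapped   :  22000000 (88.0% of input)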
data.frame(
condition=basename(dirname(alignSummary)),
num_reads=as.numeric(str_match(algnData[2], " ([0-9]*$)")[,2]),
mapped_reads=as.numeric(str_match(algnData[3], ":[ ]*([0-9]*) ")[,2][1])
) %>% transform(mapping_efficiency=100*mapped_reads/num_reads)
}
algnSummary <- ldply(list.files(".", "align_summary.txt", full.names=TRUE, recursive=T), parseAlgnSummary_T2_0_11, .progress="text")
write.delim(algnSummary, file="tophat_mapping_stats.txt")
scale_fill_discrete <- function (...){ scale_color_brewer(..., type = "seq", palette="Set1", "fill", na.value = "grey50") }
projectName=basename(dirname(getwd()))
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/mdreport/master/R/mdreport-package.r")
md_new(paste("Mapping Summary for ", projectName))
md_plot(ggplot(algnSummary, aes(condition, mapping_efficiency)) + geom_bar(stat="identity") +coord_flip() + ylim(0,100) + ggtitle("mapping efficiency"))
md_plot(ggplot(algnSummary, aes(condition, num_reads)) + geom_bar(stat="identity") + coord_flip() + ggtitle("read counts") +scale_y_continuous(labels=comma))
md_plot(ggplot(algnSummary, aes(condition, mapped_reads)) + geom_bar(stat="identity") + coord_flip() + ggtitle("alignments counts") +scale_y_continuous(labels=comma))
#ggplot(melt(algnSummary), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma) + theme(axis.text.x=element_text(angle=90, hjust=0))
#ggsave2(w=10, h=10, p="mapstats")
md_report("tophat_mapping_report", open=F)
' | R -q --vanilla
}
export -f TophatMappingReport
## replaced with cs_bowtie_qc
#### Bowtie Mapping Report from the logs
#Bowtie2MappingReport(){
#
#echo '
#devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.9/R/core_commons.R")
#devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.9/R/ggplot_commons.R")
#
#logSuffix=".logs"
#parseAlgnSummary <- function(alignSummary){
# #alignSummary="./H2Az_Rep1_Lane1_Lib4454.bowtie.log"
# algnData <- readLines(alignSummary)
#
# data.frame(
# condition=trimEnd(basename(alignSummary), logSuffix),
# num_reads=as.numeric(str_split_fixed(algnData[3], " ", 2)[1]),
# unique_mappers=as.numeric(str_split_fixed(str_trim(algnData[6]), " ", 2)[1]),
# mapping_efficiency=as.numeric(str_replace(str_split_fixed(algnData[8], " ", 2)[1], "%", "")),
# multi_mappers=as.numeric(str_split_fixed(str_trim(algnData[7]), " ", 2)[1])
# )
#}
#
#mapStats <- ldply(list.files(".", logSuffix, full.names=TRUE, recursive=T), parseAlgnSummary, .progress="text")
#write.delim(mapStats, file="mapStats.txt")
#
#ggplot(melt(mapStats), aes(condition, value)) + geom_bar(stat="identity") +facet_wrap(~variable, scales="free") + ggtitle("mapping summary") + scale_y_continuous(labels=comma) + theme(axis.text.x=element_text(angle=90, hjust=0))
#ggsave2(w=10, h=10, p="mapstats")
#
#ggplot(mapStats, aes(condition, mapping_efficiency)) + geom_bar(stat="identity") +coord_flip() + ylim(0,100) + ggtitle("mapping efficiency")
#ggsave2(p="mapstats")
#ggplot(mapStats, aes(condition, num_reads)) + geom_bar(stat="identity") + coord_flip() + ggtitle("read counts")
#ggsave2(p="mapstats")
#
#ggplot(mapStats, aes(condition, unique_mappers)) + geom_bar(stat="identity") + coord_flip() + ggtitle("unique alignment") + scale_fill_discrete()
#ggsave2(p="mapstats")
#' | R --vanilla
#}
#export -f Bowtie2MappingReport
### Create a cuffdb on a network or lustre file-system
MakeCuffDB() {
if [ $# -ne 2 ]; then echo "Usage: MakeCuffDB <gtffile> <genomebuild>"; return; fi
echo '
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.9/R/core_commons.R")
require.auto(cummeRbund)
createCuffDbTrickyDisk <- function(dbDir, gtfFile, genome, ...){
tmpdir <- tempfile()
system(paste("cp -r", dbDir, tmpdir))
oldWD <- getwd()
setwd(tmpdir)
cuff <- readCufflinks(rebuild=T, gtf=gtfFile, genome=genome, ...)
system(paste("cp cuffData.db", dbDir))
system(paste("rm -r", tmpdir))