Commit 10fdfc65 authored by Holger Brandl's avatar Holger Brandl

clean up and tool deprecation

parent a4056b0f
......@@ -13,8 +13,8 @@ for stable branch, or
/projects/bioinfo/scripts/ngs_tools/dev
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
for current master where changes can be added and are pushed occassionally to
the origin on git-srv1
for current master where changes can be added and are pushed occasionally to
the origin on gitlab
Bioinfo
=======
......@@ -24,24 +24,11 @@ To use the structure from above when working on bioinformatics-srv1 just use
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/brandl/mnt/mack/bioinfo/scripts/ngs_tools/v1.0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Get started with an RNASeq Project
==================================
1. use
/Volumes/projects/bioinfo/scripts/ngs\_tools/dev/misc/new\_project\_template.sh
to setup project directories
2. copy
/Volumes/projects/bioinfo/scripts/ngs\_tools/dev/dge\_workflow/dge\_master\_template.sh
to scripts directory
 
How to create a new version tag
-------------------------------
 
1. Create branch:
......@@ -67,23 +54,16 @@ chmod -R -w ../$newVersion
 
 
ToDos
====
- mcdir sourced in?
- Finish igv\_track\_range.sh
- Better peak report bioinfo\_templates/chipseq\_workflow/peak\_report.R
- Write igv session xml programmatically including adjusted track; render open
- Write igv session xml programmatically including adjusted track; render open
session link ( like
http://localhost:60151/load?file=/Users/brandl/Desktop/my\_session.xml) to
reports above igv-link tables
- Use proper redmine tracker to evolve the shared codebase
- Learn from http://www.bioconductor.org/help/workflows/rnaseqGene/
* distance matrix using Poisson Distance
......
# RNA-Seq DGE Workflow
## todo write this!!
\ No newline at end of file
export PATH=/projects/bioinfo/holger/bin/tophat-2.0.13.Linux_x86_64:$PATH
export PATH=/home/brandl/bin/cufflinks-2.2.1.Linux_x86_64:$PATH
## Trim adapter sequences from fastq.gz files with cutadapt.
## Usage: dge_cutadapt <fastq.gz file>+
## Submits one cluster job per input file via `jl submit`, waits for all jobs,
## then renders a trimming summary report with spin.R.
## Requires: $project in scope; jl, cutadapt and spin.R on the cluster.
dge_cutadapt(){
    if [ $# -lt 1 ]; then
        echo "Usage: dge_cutadapt <fastq.gz file>+" >&2 ; return;
    fi

    ## fasta with Illumina adapter patterns; -b matches adapters anywhere in the read
    export Ill_ADAPTERS=/projects/bioinfo/common/cutadapt_patterns.fasta

    for fastqFile in $* ; do
        # DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/treps_pooled/mouse_liver_polar_stage1_rep4.fastq.gz
        caFastq=$(basename $fastqFile .fastq.gz)_ca.fastq.gz

        ## fix: report source -> target (original printed $caFastq twice)
        echo "cutadapting $fastqFile into $caFastq"

        #todo use a more specific trimming model (trim just correct part on each side without using reverse complements)
        jl submit -j .cajobs -q long -n "${project}__ca__$(basename $fastqFile .fastq.gz)" "cutadapt -b file:$Ill_ADAPTERS -m 20 -q 25 -o $caFastq $fastqFile"
    done

    jl wait --report --email .cajobs

    spin.R ${NGS_TOOLS}/dge_workflow/cutadapt_summary.R .
    ## todo do a small report here about what has been trimmed away and why
}
export -f dge_cutadapt
#http://wiki.bash-hackers.org/howto/getopts_tutorial
## Align single-end fastq.gz files with tophat2 against an igenome reference.
## Usage: dge_tophat_se -i <path to igenome> [<fastq.gz file>]+
## One cluster job per fastq file; afterwards runs bam correlation and a QC report.
## Requires: $project and $NGS_TOOLS in scope; jl, tophat, samtools in PATH.
dge_tophat_se(){
    # http://stackoverflow.com/questions/18414054/bash-getopts-reading-optarg-for-optional-flags
    while getopts "i:" curopt; do
        case $curopt in
        i) IGENOME="$OPTARG";;
        esac
    done
    shift $(($OPTIND - 1))

    local fastqFiles=$*

    # IGENOME=/projects/bioinfo/igenomes/Mus_musculus/Ensembl/GRCm38
    if [ -z "$IGENOME" ] || [ -z "$fastqFiles" ];
    then echo "Usage: dge_tophat_se -i <path to igenome> [<fastq.gz file>]+" >&2 ; return;
    fi

    ## derived reference locations inside the igenome directory layout
    export bowtie_gindex="$IGENOME/Sequence/Bowtie2Index/genome"
    export gtfFile="$IGENOME/Annotation/Genes/genes.gtf"

    ## fail early if annotation or aligner is missing (fix: typo "exis", "tomcat")
    if [ ! -f $gtfFile ]; then
        >&2 echo "gtf '$gtfFile' does not exist"; return;
    fi

    if [ -z "$(which tophat)" ]; then
        >&2 echo "no tophat binary in PATH"; return;
    fi

    echo "running tophat using igenome '$IGENOME' for the following files"
    ## fix: `ll` is an interactive alias and undefined in non-interactive shells
    ls -la $fastqFiles

    for fastqFile in $fastqFiles ; do
        echo "submitting tophat job for $fastqFile"
        # DEBUG fastqFile=/projects/bioinfo/holger/projects/helin/mouse/trimmed/mouse_big_cyst_rep4_ca.fastq.gz
        fastqBaseName=$(basename ${fastqFile%%.fastq.gz})
        outputdir=$fastqBaseName

        ## uniquely mapping reads only (-g1): http://seqanswers.com/forums/showthread.php?t=3464
        ### tophat -p6 -G $gtfFile -g1 -o test $bowtie_gindex $fastqFile
        tophatCmd="
tophat -p6 -G $gtfFile -g1 -o $outputdir $bowtie_gindex $fastqFile
mv $outputdir/accepted_hits.bam $outputdir/$(basename $outputdir).bam
samtools index $outputdir/$(basename $outputdir).bam
"
        jl submit -j .tophatjobs -t 5 -q medium -n "${project}__tophat__${fastqBaseName}" "${tophatCmd}"
    done

    jl wait --report .tophatjobs

    ## see https://github.com/fidelram/deepTools/wiki/QC#bamCorrelate
    dge_bam_correlate . &

    ## create tophat mapping report
    spin.R ${NGS_TOOLS}/dge_workflow/tophat_qc.R .

    mailme "$project: tophat done in $(pwd)"
}
export -f dge_tophat_se
## tester
#dge_merge_treps $baseDir/mapped_trim/ "Ctrl_0703,Ctrl_1029,Ctrl_1103,Insm1_0703,Insm1_1029,Insm1_1103"
#dge_cd(){
#( ## required to work around docopt sys.exit
#usage='
#Use cuffdiff2 to perform a differential expression analysis
#Usage: dge_cuffdiff [--exclude=<regex>] <gtfFile> <bamDir> <labels>
#
#Options:
#--exclude=<regex> Exclude patterns (grep regex matching bam file paths to be excluded)
#'
#echo "$usage" | ~/bin/docopts/docopts -h - : $*
#eval "$(echo "$usage" | ~/bin/docopts/docopts -h - : $*)"
#
#)
#}
#dge_cd $gtfFilePC $baseDir/mapped $labels
#dge_cd --help
## Run a cuffdiff2 differential-expression analysis over a directory of bams.
## Usage: dge_cuffdiff <gtf_file> <bam_directory> <labels>
## <labels> is a comma-separated label list; bam file names must contain their
## label so files can be grouped per condition. Optionally set
## EXCLUDE_BAMS_PATTERN (grep regex) to drop bam files. After cuffdiff finishes
## a cummeRbund sqlite db (cuffData.db) is built in a temp dir and copied back.
## Requires: $project in scope; jl, cuffdiff, scala, R (+cummeRbund), mailme.
dge_cuffdiff(){
    local gtfFile=$1
    local bamDir=$2
    local labels=$3

    if [ $# -ne 3 ]; then
        echo "Usage: dge_cuffdiff <gtf_file> <bam_directory> <labels>" >&2 ; return;
    fi

    if [ -z "$(which cuffdiff)" ]; then
        >&2 echo "no cuffdiff binary in PATH"; return;
    fi

    ## fix: typo "exis" in error message
    if [ ! -f $gtfFile ]; then
        >&2 echo "gtf '$gtfFile' does not exist"; return;
    fi

    ## collect all bam-files (skipping unmapped ones) and group them by label
    allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | sort)

    if [ -n "$EXCLUDE_BAMS_PATTERN" ]; then
        echo "excluding bams with $EXCLUDE_BAMS_PATTERN"
        allBams=$(echo "$allBams" | grep -v "$EXCLUDE_BAMS_PATTERN")
    fi
    ## todo use optional argument here --> default "unmapped" --> allow user to add others

    bamsSplit=""
    for label in $(echo $labels | tr ", " " "); do
        # DEBUG label="liver"; label="cyst"
        echo $label

        ## grep the bam names excluding the path, then join the per-label
        ## bam paths with commas (cuffdiff's replicate separator)
        echo "$allBams" | xargs -n1 basename | grep $label > ${label}.cuff_bamlist
        labelBams=$(echo "$allBams" | grep -Ff ${label}.cuff_bamlist | xargs echo -n | tr ' ' ',')

        bamsSplit+=$labelBams" "
    done

    ## make sure (again) that all files are there before submitting
    echo $gtfFile $bamsSplit | tr "," " " | xargs ls -la

    cdCmd="cuffdiff -L $labels -o . -p 10 $gtfFile $bamsSplit"
    echo "cuffdiff cmd is: $cdCmd"
    jl submit --wait -t 4 -q long -n "${project}__cuffdiff" "$cdCmd"

    ## create a cuffdiff db using cummeRbund in a temporary directory to avoid
    ## sqlite locking problems on the shared filesystem
    tmpDbDir=$(mktemp -d)
    cp -r . $tmpDbDir

    ## extract the genome name (mm10|mm9|hg19|zv9) from the resolved gtf path
    genome=$(scala -e '
    val gtfFile = args(0); //val gtfFile="mm10_igenomes_pc.gtf"
    val pattern = "mm10|mm9|hg19|zv9".r
    println(pattern.findFirstIn(gtfFile).getOrElse(""))
    ' $(readlink -f $gtfFile)
    )
    echo $genome

    echo '
    require(cummeRbund)
    dbDir=commandArgs(T)[1]
    gtfFile=commandArgs(T)[2]
    genome=commandArgs(T)[3]
    ## note without providing the gtf the db is much smaller
    readCufflinks(dir=dbDir, rebuild=T, gtf=gtfFile, genome=genome)
    ' | R -q --no-save --no-restore --args "$tmpDbDir" "$gtfFile" "$genome"

    ## fix: typo "cuffidff"; temp dir is kept on failure to allow debugging
    if [ ! -f $tmpDbDir/cuffData.db ]; then
        >&2 echo "cummerbund failed to create cuffdiff sqlite db"; return;
    fi

    cp $tmpDbDir/cuffData.db .
    rm -rf $tmpDbDir ## because it's no longer needed

    mailme "$project: cuffdiff done in $(pwd)"
}
export -f dge_cuffdiff
**tbd write me!**
Get started with an RNASeq Project
==================================
1. use
/Volumes/projects/bioinfo/scripts/ngs\_tools/dev/misc/new\_project\_template.sh
to setup project directories
2. copy
/Volumes/projects/bioinfo/scripts/ngs\_tools/dev/dge\_workflow/dge\_master\_template.sh
to scripts directory
This diff is collapsed.
......@@ -159,12 +159,12 @@ exDesign <- data_frame(replicate=colnames(countMatrix)) %>% mutate(col_index=row
#exDesign <- replicates2samples %>% arrange(colnames(countMatrix))
# valiadate that contrasts model is compatible with data
designSamples = as.df(exDesign)[, contrastAttribute] %>% unique
contrastSamples = contrasts %>% gather %$% value %>% unique
designSamples = as.df(exDesign)[, contrastAttribute] %>% unique %>% sort
contrastSamples = contrasts %>% gather %$% value %>% unique %>% sort
if(!all(contrastSamples %in% designSamples)){
warning("column model: ", exDesign)
warning("contrasts: ", contrasts %>% gather %$% value %>% unique)
stop("column model is not consistent with contrasts")
warning("design : ", paste(designSamples, collapse=", "))
warning("contrasts: ", paste(contrastSamples, collapse=", "))
stop("experiment design is not consistent with contrasts")
}
#stopifnot(all((contrasts %>% gather %$% value %>% unique) %in% exDesign$condition))
......@@ -181,6 +181,22 @@ dds <- DESeq(dds)
########################################################################################################################
#' ## Quality Control
#' ### Size Factors
##' Size Factors estimated by DESeq
sizeFactors(dds) %>%
set_names(colnames(countMatrix)) %>% melt() %>%
rownames2column("sample") %>%
ggplot(aes(sample, value)) + geom_bar(stat="identity") + ggtitle("deseq size factors") + coord_flip()
# From DESeq manual: The regularized log transformation is preferable to the variance stabilizing transformation if the size factors vary widely.
#' For details about size factor normalization and calculation see https://www.biostars.org/p/79978/
##' From the DESeq docs about how size factors are used: The sizeFactors vector assigns to each column of the count matrix a value, the size factor, such that count values in the columns can be brought to a common scale by dividing by the corresponding size factor. Counts are divided by size factors.
#' ### Data Dispersion
#' Plot of the per-gene dispersion estimates together with the fitted mean-dispersion relationship.
......@@ -204,7 +220,8 @@ plotDispEsts(dds, main="Dispersion plot")
# Regularized log transformation for clustering/heatmaps, etc
rld <- rlogTransformation(dds)
plotPCA(rld, intgroup = "sample")
#plotPCA(rld, intgroup = "sample")
plotPCA(rld, intgroup = contrastAttribute)
#' The Euclidean distance between samples are calculated after performing the regularized log transformation.
......@@ -233,19 +250,6 @@ d3heatmap(labelcntData, scale = "column")
########################################################################################################################
#' # Perform Differential Expression Analysis
#' For details about size factor normalization and calculation see https://www.biostars.org/p/79978/
##' Size Factors estimated by DESeq
sizeFactors(dds) %>%
set_names(colnames(countMatrix)) %>% melt() %>%
rownames2column("sample") %>%
ggplot(aes(sample, value)) + geom_bar(stat="identity") + ggtitle("deseq size factors") + coord_flip()
##' From the DESeq docs about how size factors are used: The sizeFactors vector assigns to each column of the count matrix a value, the size factor, such that count values in the columns can be brought to a common scale by dividing by the corresponding size factor.
##' This means that counts are divided by size factors. So let's now load the lambda libraries and replace the predefined size factors with our custom ones
#' From DESeq manual: The regularized log transformation is preferable to the variance stabilizing transformation if the size factors vary widely.
#' Run Deseq Test
#dds <- DESeq(dds, fitType='local', betaPrior=FALSE)
......
- cuffdbs change dramatically in size if gtf is provided when building them, but what impact does it have on the results
- Also try to remove RNA PCR Primer enrichment. Currently we just remove index and universal adapter
1) evaluate if trimmomatic is the better trimmer (with respect to cutadapt)
- also consider to use contamination list from fastqc for trimming (see https://www.biostars.org/p/15753/)
Multi-Mapper
---
- perl filter does not seem to change tophat results
- samtools view -bq1 removes something (nur bei tophat default "-g 20")
- "-g1" --> no multimapper in algn_summary and not filter effect
next:
- are removed/remaining counts consistent with reported multimapper counts from algn_summary.txt
- are there duplicated read ids when using -g1
spinr has been moved to https://github.com/holgerbrandl/datautils/tree/master/R/spinr
\ No newline at end of file
#!/usr/bin/env Rscript

## A thin wrapper around knitr::spin to render an R script into an html report
## with custom chunk options, css and datatable support.
# similar http://stackoverflow.com/questions/10943695/what-is-the-knitr-equivalent-of-r-cmd-sweave-myfile-rnw
#http://stackoverflow.com/questions/3433603/parsing-command-line-arguments-in-r-scripts
#https://github.com/edwindj/docopt.R
#http://www.slideshare.net/EdwindeJonge1/docopt-user2014

# load the docopt library
suppressMessages(library(docopt))

# retrieve and parse the command-line arguments
doc <- '
Use knitr to spin R documents
Usage: spin.R [options] <r_script> [<script_args>...]
Options:
-c Cache results
-e Show Code
-w Show warnings
-m Show Messages
--keep Keep generated Rmd and md files
'

spin_opts <- docopt(doc)

r_script <- spin_opts$r_script
## docopt flags are already logical; as.logical is defensive against NULL
keep_markdown_files <- as.logical(spin_opts$keep)

if(keep_markdown_files){
print("keeping markdown files")
}

if(!file.exists(r_script)){
stop(paste("file does not exist\n", doc))
}

## fix: library() instead of require() so a missing dependency fails
## immediately instead of surfacing as a confusing later error
library(plyr)
library(knitr)
library(stringr)

options(width=150)

#https://groups.google.com/forum/#!topic/knitr/ojcnq5Nm298
## better table css: http://www.stat.ubc.ca/~jenny/STAT545A/topic10_tablesCSS.html

## mask commandArgs so the spun script sees only the trailing <script_args>
commandArgs <- function(trailingOnly = FALSE){ return(as.character(spin_opts$script_args)) }

spin(r_script, knit=F)

## strip the shebang and comment-only separator lines from the generated Rmd
rmdScript <- str_replace(r_script, "[.]R$", ".Rmd")
system(paste("mv", rmdScript, "tmp.Rmd"))
system(paste("cat tmp.Rmd | grep -Ev '^#+$' | grep -Fv '#!/usr/bin/env Rscript' >", basename(rmdScript)))
file.remove("tmp.Rmd")

## wide-body css plus jquery datatable support injected into the html header
cssHeader='
<style type="text/css">
body {
max-width: 90%;
}
</style>
<!-- add jquery datatable support -->
<link rel="stylesheet" type="text/css" href="http://cdn.datatables.net/1.10.4/css/jquery.dataTables.css">
<script type="text/javascript" charset="utf8" src="http://code.jquery.com/jquery-2.1.2.min.js"></script>
<script type="text/javascript" charset="utf8" src="http://cdn.datatables.net/1.10.4/js/jquery.dataTables.js"></script>
'

## custom title http://stackoverflow.com/questions/14124022/setting-html-meta-elements-with-knitr
## map the docopt flags onto knitr chunk options
opts_chunk$set(
cache = spin_opts$c,
message= spin_opts$m,
warning= spin_opts$w,
echo= spin_opts$e,
fig.width=15,
width=200
)

knit2html(basename(rmdScript), header=cssHeader)

## also remove the .md and the .Rmd files unless --keep was given
## fix: && (scalar short-circuit) instead of elementwise & in the if condition
if(is.logical(keep_markdown_files) && !keep_markdown_files){
file.remove(basename(rmdScript))
file.remove(basename(str_replace(r_script, "[.]R$", ".md")))
}
## Spin an R snippet read from stdin into an html report.
## Usage: cat snippet.R | spinsnip <report name> [other spin.R args]*
## `#>` markers in the snippet are rewritten to knitr/roxygen `#'` so plain
## scripts can carry report prose without confusing other tools.
spinsnip(){
    if [ $# -lt 1 ]; then
        >&2 echo "Usage: spinsnip <report name> [other args]*"
        return
    fi

    reportName=$1
    tmpR=$(echo $reportName | tr " " "_").R

    ## read the snippet from stdin (fix: dropped useless `cat |` before sed)
    ## http://stackoverflow.com/questions/11454343/pipe-output-to-bash-function
    sed 's/#>/#'"'"'/g' > $tmpR

    echo "spinning $tmpR..."
    shift
    spin.R $tmpR $*

    rm $tmpR
}
## usage example
# echo '
# > # test report
# 1+1;
# ggplot(iris, aes(Sepal.Width) + geom_histogram()
# ' | spinsnip some_report
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment