Skip to content
Snippets Groups Projects
Commit 614e8317 authored by Holger Brandl's avatar Holger Brandl
Browse files

initial commit of dge workflow

parents
No related branches found
No related tags found
No related merge requests found
# RNA-Seq DGE Workflow
http://stackoverflow.com/questions/600079/is-there-any-way-to-clone-a-git-repositorys-sub-directory-only
http://stackoverflow.com/questions/160608/do-a-git-export-like-svn-export
git checkout-index -a -f --prefix=/destination/path/
## How to use it?
git checkout-index -a -f --prefix=/destination/path/
1) Optionally rename dge_master.sh to something like dge_mouse_helin.sh
2) Adjust paths in master script
3) Run on madmax
4) Adjust reports if necessary
## How to backport changes into it?
Clone it, change it, push and commit it.
\ No newline at end of file
#' # Read Summary Report
#+ echo=FALSE, message=FALSE
## Note This script is supposed to be knitr::spin'ed
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/core_commons.R")
devtools::source_url("https://dl.dropboxusercontent.com/u/113630701/datautils/R/ggplot_commons.R")
argv = commandArgs(TRUE)
if(length(argv) != 1){
stop("Usage: fastqc_summmary.R <directory with fastqc results>")
}
#baseDir="fastqc"
#baseDir="/Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc"
baseDir=argv[1]
fastqDataFiles <- list.files(path=baseDir, pattern="^fastqc_data.txt", full.names=TRUE, recursive=T)
#' ## Number of reads per run
readCount <- function(statsFile){
data.frame(
run=trim_ext(basename(dirname(statsFile)), c("_fastqc")),
num_reads=as.numeric(readLines(pipe( paste("grep -F 'Total Sequences ' ", statsFile, " | cut -f2 -d'\t'") )))
)
}
readCounts <- fastqDataFiles %>% ldply(readCount)
require.auto(scales)
ggplot(readCounts, aes(run, num_reads)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=comma)
### Create a faill/pass matrix
readSummary <- function(statsFile){
# echo("reading", statsFile)
# statsFile="./fastqc/mouse_big_cyst_rep1_fastqc/summary.txt"
fileMapStats <- read.delim(statsFile, header=F) %>%
set_names(c("flag", "score", "run")) %>%
mutate(run=trim_ext(run, c(".fastq.gz", "fastq")))
fileMapStats
}
qcSummary <- list.files(path=baseDir, pattern="^summary.txt", full.names=TRUE, recursive=T) %>% ldply(readSummary)
qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
geom_tile() +
rotXlab() +
scale_fill_manual(values = c(fail="darkred", pass="darkgreen", warn="orange"))
readBaseQualDist <- function(statsFile){
# statsFile="./fastqc/mouse_big_cyst_rep2_fastqc/fastqc_data.txt"
# statsFile="./fastqc/mouse_liver_polar_stage3_rep2_fastqc/fastqc_data.txt"
# grep -A30 -F '>>Per base sequence quality' /Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc/mouse_big_cyst_rep1_fastqc/fastqc_data.txt | grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '
echo("reading", statsFile)
baseStats <- read.delim(pipe(
paste("grep -A30 -F '>>Per base sequence quality' ", statsFile, "| grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '")
)) %>% mutate(
run=trim_ext(basename(dirname(statsFile)), c("_fastqc"))
)
baseStats %>% mutate(base_order=1:n())
}
baseQualities <- fastqDataFiles %>% ldply(readBaseQualDist)
#with(baseQualities, as.data.frame(table(run)))
baseQualities %>% ggplot(aes(reorder(Base, base_order), Mean, group=run, color=run)) + geom_line() + scale_y_continuous(limits=c(2, 40))
#' # Base Quality Distribution Summary
runs <- with(baseQualities, as.data.frame(table(run)))
## http://stackoverflow.com/questions/12518387/can-i-create-an-empty-ggplot2-plot-in-r
baseQualities %>%
head(30) %>%
ggplot() +
geom_blank(mapping=aes(reorder(Base, base_order))) +
geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=20), data=runs, alpha=0.05, fill="red") +
geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=20, ymax=28), data=runs, alpha=0.05, fill=colors()[654]) +
geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=28, ymax=Inf), data=runs, alpha=0.05, fill="green") +
geom_boxplot(
mapping=aes(x=reorder(Base, base_order), ymin = X10th.Percentile, lower = Lower.Quartile , middle = Median, upper = Upper.Quartile , ymax = X90th.Percentile),
stat = "identity"
) +
facet_wrap(~run) +
theme(axis.text.x=element_blank()) +
scale_y_continuous(limits=c(2, 40)) +
xlab("")
## docs
## http://blog.joncairns.com/2013/08/what-you-need-to-know-about-bash-functions/
## create fastq report for all fastq and fastq.gz files in the current directory
dge_fastqc(){
while getopts "o:" curopt; do
case $curopt in
o) outputDir=$OPTARG;
esac
done
shift $(($OPTIND - 1))
local fastqFiles=$*
#if [ -z "$fastqFiles" ]; then
if [ $# -lt 1 ]; then
echo "Usage: dge_fastqc [-o <output_directory>] [<fastq.gz file>]+" >&2 ; return;
fi
## use default directory if not specified
if [ -z "$outputDir" ]; then
outputDir="fastqc_reports"
fi
if [ ! -d "$outputDir" ]; then
echo "creating output directory '$outputDir'"
mkdir $outputDir
fi
for fastqFile in $fastqFiles ; do
echo "fastqcing $fastqFile"
if [ ! -f $fastqFile ]; then
continue;
fi
mysub "fastqc__$(basename $fastqFile)" "fastqc -j /sw/bin/java -o $outputDir -f fastq $fastqFile" -q medium | joblist .fastqc_jobs
done
wait4jobs .fastqc_jobs
mailme "fastqc done for $outputDir"
ziprm fastqc_logs fastqc__*
## create a small report
source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/R/utils/spinr.sh 2>&1 2>/dev/null)
spinr $(which fastqc_summary.R) $outputDir
# todo create some summary report here
}
export -f dge_fastqc
#http://wiki.bash-hackers.org/howto/getopts_tutorial
dge_tophat_se(){
# http://stackoverflow.com/questions/18414054/bash-getopts-reading-optarg-for-optional-flags
echo $*
while getopts "i:" curopt; do
case $curopt in
i) IGENOME="$OPTARG";
esac
done
shift $(($OPTIND - 1))
local fastqFiles=$*
echo fastq $fastqFiles
echo igenomes "$IGENOME"
if [ -z "$IGENOME" ] || [ -z "$fastqFiles" ];
then echo "Usage: dge_tophat_se -i <path to igenome> [<fastq.gz file>]+" >&2 ; return;
fi
export bowtie_gindex="$IGENOME/Sequence/Bowtie2Index/genome"
export gtfFile="$IGENOME/Annotation/Genes/genes.gtf"
#head $gtfFile
if [ ! -f $gtfFile ]; then
>&2 echo "gtf '$gtfFile' does not exis"; return;
fi
if [ -z "$(which tophat)" ]; then
>&2 echo "no tomcat binary in PATH"; return;
fi
echo "running tophat using igenome '$IGENOME' for the following files"
ll $fastqFiles
#fastqFiles=$(ls $baseDir/treps_pooled/*fastq.gz)
for fastqFile in $fastqFiles ; do
echo "submitting tophat job for $fastqFile"
# DEBUG fastqFile=/projects/bioinfo/holger/projects/eric/trimmed/a1_ca.fastq.gz
fastqBaseName=$(basename ${fastqFile%%.fastq.gz})
outputdir=$fastqBaseName
## uniquely mapping reads only: http:/seqanswers.com/forums/showthread.php?s=93da11109ae12a773a128a637d69defe&t=3464
mysub "${project}__tophat__${fastqBaseName}" "
tophat -p6 -G $gtfFile -g1 -o $outputdir $bowtie_gindex $fastqFile
mv $outputdir/accepted_hits.bam $outputdir/$(basename $outputdir).bam
samtools index $outputdir/$(basename $outputdir).bam
" -n 5 -R span[hosts=1] -q long | joblist .tophatjobs
done
wait4jobs .tophatjobs
## create tophat mapping report
source <(curl https://dl.dropboxusercontent.com/u/113630701/datautils/bash/bioinfo_utils.sh 2>&1 2>/dev/null)
TophatMappingReport
}
export -f dge_tophat_se
# dge_tophat_se
# dge_tophat_se -i
dge_cuffdiff(){
local gtfFile=$1
local bamDir=$2
local labels=$3
if [ $# -ne 3 ]; then
echo "Usage: dge_fastqc <gtf_file> <bam directory> <labels>" >&2 ; return;
fi
if [ -z "$(which cuffdiff)" ]; then
>&2 echo "no cuffdiff binary in PATH"; return;
fi
if [ ! -f $gtfFile ]; then
>&2 echo "gtf '$gtfFile' does not exis"; return;
fi
## collect all bam-files and group them
allBams=$(find $bamDir | grep ".bam$" | grep -v "unmapped" | sort)
bamsSplit=""
for label in $(echo $labels | tr ", " " "); do
echo $label
labelBams=$(echo "$allBams" | grep $label | xargs echo -n | tr ' ' ',')
bamsSplit+=$labelBams" "
done
#echo $bamsSplit
## make sure (again that all files are there
echo $gtfFile $bamsSplit | tr "," " " | xargs ls -la
#mysub ${project}_cuffdiff "cuffdiff -L aRGm,aRGp,aRG,bRG,IPC,N -o . -p 8 mm10_ensGene_ccds.gtf $amBams $apBams $aBams $bBams $cBams $dBams 2>&1 | tee cuffdiff.log" -q long -n 8 -R span[hosts=1] | blockScript
cdCmd="cuffdiff -L $labels -o . -p 10 $gtfFile $bamsSplit"
#echo "cmd is: $cdCmd"
mysub ${project}__cuffdiff "$cdCmd" -q long -n 8 -R span[hosts=1] | blockScript
MakeCuffDB $gtfFile "NAN"
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment