Commit e833e7c9 authored by Holger Brandl's avatar Holger Brandl

removed unused files

parent 719fe937
#!/usr/bin/env Rscript
#' # FastQC Summary for `r getwd()`
##+ echo=FALSE, message=FALSE
## Note This script is supposed to be knitr::spin'ed
devtools::source_url("https://www.dropbox.com/s/r6kim8kb8ohmptx/core_commons.R?dl=1")
devtools::source_url("https://www.dropbox.com/s/0oa2npepqmfm25k/ggplot_commons.R?dl=1")
## can we access variables from the parent spin.R process?
#echo("rscript is ", r_script)
argv = commandArgs(TRUE)
#echo("argv is ", argv)
#if(str_detect(argv[1], "fastqc_summary")) argv <- argv[-1]
if(length(argv) != 1){
stop("Usage: fastqc_summmary.R <directory with fastqc results>")
echo
}
baseDir=argv[1]
if(is.na(file.info(baseDir)$isdir)){
stop(paste("directory '", baseDir,"'does not exist"))
}
#baseDir="fastqc"
#baseDir="/Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc"
fastqDataFiles <- list.files(path=baseDir, pattern="^fastqc_data.txt", full.names=TRUE, recursive=T)
#echo("files are", fastqDataFiles)
#' ## Number of reads per run
readCount <- function(statsFile){
data.frame(
run=trim_ext(basename(dirname(statsFile)), c("_fastqc")),
num_reads=as.numeric(readLines(pipe( paste("grep -F 'Total Sequences ' ", statsFile, " | cut -f2 -d'\t'") )))
)
}
readCounts <- fastqDataFiles %>% ldply(readCount) %>% print_head()
require.auto(scales)
gg <- ggplot(readCounts, aes(run, num_reads)) + geom_bar(stat="identity") + coord_flip() + scale_y_continuous(labels=comma) + ggtitle("read counts")
#+ results='asis'
gg %>% ggsave2(width=10, height = round(nrow(readCounts)/4), limitsize=FALSE) %>% paste0("<img src='", ., "'><br>") %>% cat()
### Create a faill/pass matrix
readSummary <- function(statsFile){
# echo("reading", statsFile)
# statsFile="./fastqc/mouse_big_cyst_rep1_fastqc/summary.txt"
fileMapStats <- read.delim(statsFile, header=F) %>%
set_names(c("flag", "score", "run")) %>%
mutate(run=trim_ext(run, c(".fastq.gz", "fastq")))
fileMapStats
}
qcSummary <- list.files(path=baseDir, pattern="^summary.txt", full.names=TRUE, recursive=T) %>% ldply(readSummary)
#' # Base Quality Distribution Summary
#+ fig.height=2+round(nrow(readCounts)/4), fig.width=12
qcSummary %>% ggplot(aes(score, run, fill=tolower(flag))) +
geom_tile() +
rotXlab() +
scale_fill_manual(values = c(fail="darkred", pass="darkgreen", warn="orange")) +
ggtitle("fastqc passcodes")
#' Median qualities per position to compare positional patterns and differences between the runs
readBaseQualDist <- function(statsFile){
# statsFile="./fastqc/mouse_big_cyst_rep2_fastqc/fastqc_data.txt"
# statsFile="./fastqc/mouse_liver_polar_stage3_rep2_fastqc/fastqc_data.txt"
# grep -A30 -F '>>Per base sequence quality' /Volumes/projects/bioinfo/holger/projects/helin/mouse/fastqc/mouse_big_cyst_rep1_fastqc/fastqc_data.txt | grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '
# echo("reading", statsFile)
baseStats <- read.delim(pipe(
paste("grep -A30 -F '>>Per base sequence quality' ", statsFile, "| grep -B100 -F '>>END_M' | head -n-1 | tail -n+2 | tr '#' ' '")
)) %>% mutate(
run=trim_ext(basename(dirname(statsFile)), c("_fastqc"))
)
baseStats %>% mutate(base_order=1:n())
}
baseQualities <- fastqDataFiles %>% ldply(readBaseQualDist)
#with(baseQualities, as.data.frame(table(run)))
baseQualities %>% ggplot(aes(reorder(Base, base_order), Mean, group=run, color=run)) + geom_line() + scale_y_continuous(limits=c(2, 40))
#+ warning=FALSE, fig.width=15
runs <- with(baseQualities, as.data.frame(table(run)))
#' Qualities per run including variance
## http://stackoverflow.com/questions/12518387/can-i-create-an-empty-ggplot2-plot-in-r
baseQualities %>%
# head(30) %>%
ggplot() +
geom_blank(mapping=aes(reorder(Base, base_order))) +
geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=20), data=runs, alpha=0.05, fill="red") +
geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=20, ymax=28), data=runs, alpha=0.05, fill=colors()[654]) +
geom_rect(aes(xmin=-Inf, xmax=Inf, ymin=28, ymax=Inf), data=runs, alpha=0.05, fill="green") +
geom_boxplot(
mapping=aes(x=reorder(Base, base_order), ymin = X10th.Percentile, lower = Lower.Quartile , middle = Median, upper = Upper.Quartile , ymax = X90th.Percentile),
stat = "identity"
) +
facet_wrap(~run) +
theme(axis.text.x=element_blank()) +
scale_y_continuous(limits=c(2, 40)) +
xlab("")
#!/bin/sh
exec scalas "$0" "$@"
!#
/** Work in progress: Guess the species from a gtf file. By name first and then by file content. A more simplistic approach is already implementd in dge_workflow/dge_utils.sh
*/
import java.io.File
import scala.io.Source
// http://alvinalexander.com/scala/scala-shell-script-command-line-arguments-args
val gtfFile = args(1)
//val gtfFile="mm10_igenomes_pc.gtf"
val pattern = "mm10|mm9|h19|zv9".r
val genomeByName = pattern.findFirstIn(gtfFile)
if (genomeByName.isEmpty) {
System.exit(1)
}
def guessFromContent(gtfFile: File): Option[String] = {
// Source.fromString(s"grep ENSMUSG $gtfFile | "!!).getLines().hasNext
// Bash.evalCapture(s"grep ENSMUSG $gtfFile | wc -l")
if (Source.fromFile(gtfFile).getLines().exists(_.contains("ENSMUSG"))) return Some("mouse")
if (Source.fromFile(gtfFile).getLines().exists(_.contains("ENSCAFG"))) return Some("dog")
None
}
genomeByName.get match {
case "mm9" =>
}
println(genomeByName)
\ No newline at end of file
## from naha
PRE=/home/lakshman/mnt/chipseq_huttner/Naha_analysis/BigWig_Files/Combined_BAMS_20141024/*.remdup.bam.sorted.bam
name_suf=.remdup.bam
all_bams=$( ls $PRE)
all_bams_lables=$( ls $PRE | sed 's#.*/##' | sed "s/$name_suf$//")
cd /home/lakshman/mnt/chipseq_huttner/Naha_analysis/BigWig_Files/Combined_BAMS_20141024/
screen -R bamcorrelate
bamCorrelate bins --bamfiles $all_bams --labels $all_bams_lables --binSize 1000 --corMethod spearman -f 200 --numberOfProcessors=8 --colorMap Reds --zMin 0.5 --zMax 1 --plotFile="MOUSE_combined_bams_correlation_remdup.pdf" --outFileCorMatrix="MOUSE_combined_bams_correlation_Matrix.txt"
setwd("/Volumes/albert_chipseq/Naha_analysis/BigWig_Files/Combined_BAMS_20141024/")
library(reshape)
library(stringr)
library(lattice)
library(ggplot2)
data=read.delim("MOUSE_combined_bams_correlation_Matrix.txt", sep="\t", h=T)
data_1=data
rownames(data_1)=data_1$X
data_1$X=NULL
levelplot(as.matrix(data_1))
colorFun <- colorRampPalette(c("yellow","red"))
pdf("Spearman_correlation_matrix_plot.pdf")
levelplot(as.matrix(data_1),scales=list(x=list(rot=90, cex=.7), y=list(cex=.7)), col.regions = colorFun(50),cexRow=0.2, cexCol=0.2)
dev.off()
distance=dist(as.matrix(data_1))
tree=hclust(distance)
plot(tree)
pdf("Hierarchical_clustering_plot.pdf")
plot(tree, hang=-1)
dev.off()
data_trans=data
rownames(data_trans)=data_trans$X
data_trans$X=NULL
pca_chip <- prcomp(data_trans, center=TRUE, scale=T, retx=TRUE)
pc123 <- data.frame(PC1=pca_chip$x[,1], PC2=pca_chip$x[,2], PC3=pca_chip$x[,3])
makePcaPlot <- function(pca, item_names=rownames(pca$x),conditions=str_split_fixed(item_names, "_", 2)[,2]) {
# browser()
percent <- round((((pca$sdev)^2 / sum(pca$sdev^2))*100))
pc12 <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], sample=conditions, items=item_names)
ggplot(pc12, aes(PC1, PC2, color=sample, label=items)) + geom_point()+xlab(paste("PC1 (", percent[1], "%)", sep = "")) + ylab(paste("PC2 (", percent[2], "%)", sep = ""))
}
makePcaPlot(pca_chip) + theme_bw() + ggtitle("Cell type clustering PCA")+geom_vline(xintercept=0, linetype="dashed")+geom_hline(xintercept=0, linetype="dashed")
makePcaPlot_text <- function(pca, item_names=rownames(pca$x),conditions=str_split_fixed(item_names, "_", 2)[,2]) {
# browser()
percent <- round((((pca$sdev)^2 / sum(pca$sdev^2))*100))
pc12 <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], sample=conditions, items=item_names)
ggplot(pc12, aes(PC1, PC2, color=sample, label=items)) + geom_text()+xlab(paste("PC1 (", percent[1], "%)", sep = "")) + ylab(paste("PC2 (", percent[2], "%)", sep = ""))
}
save.image("cell_clustering.RData")
pdf("PCA_plot.pdf")
makePcaPlot_text(pca_chip) + theme_bw() + ggtitle("Cell type clustering PCA")+geom_vline(xintercept=0, linetype="dashed")+geom_hline(xintercept=0, linetype="dashed")
dev.off()
## allow group to modify results by default
umask u=rwx,g=rwx,o=
#export PROJECT_NAME=dye_rnaseq
#export PROJECT_DIR=bioinformatics/projects/${PROJECT_NAME}
#export PROJECT_NAME=sarov_crispr_qc
#export PROJECT_DIR=bioinformatics/projects/transgenomics/${PROJECT_NAME}
#############################################
# setup project locally on project space
#############################################
mkdir /Volumes/${PROJECT_DIR}
cd /Volumes/${PROJECT_DIR}
## setup data (unison target)
mkdir data
## setup scripts origin git
git init --bare .scripts_git_origin
## check out the working copy
git clone ssh://fileserver/projects/${PROJECT_DIR}/.scripts_git_origin scripts
## check out used workfing copy for rnaseq
git --git-dir /Volumes/projects/bioinfo/scripts/ngs_tools/dev/.git archive --format zip --output ngs_tools.zip master
git clone git-srv1:/local/git/bioinformatics .temp_ngs_tools
cd .temp_ngs_tools
git archive --format zip --output /full/path/to/ngs_tools.zip master
#############################################
## setup project on cluster
#############################################
mm
## setup data (to be synced with unison later)
mkdir /projects/bioinfo/holger/projects/${PROJECT_NAME}
cd /projects/bioinfo/holger/projects/${PROJECT_NAME}
## setup scripts
git clone ssh://fileserver/projects/${PROJECT_DIR}/.scripts_git_origin /projects/bioinfo/holger/scripts/${PROJECT_NAME}
## configure which version of common tools to use
cd /projects/bioinfo/holger/scripts/${PROJECT_NAME}
#ln -s /projects/bioinfo/scripts/ngs_tools/dev ngs_tools
## prepare gitignore to avoid that we commit it again
echo ngs_tools > .gitignore
echo ngs_tools.zip > .gitignore
git add .gitignore
git commit -m "started project"
git push origin master
## setup project on bioinfo
#bi
#mkdir ~/projects/${PROJECT_NAME}
#cd ~/projects/${PROJECT_NAME}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment