From 5a33481ced8b4e44ef8d5b225f8c342f902e8dfa Mon Sep 17 00:00:00 2001 From: Holger Brandl <holgerbrandl@gmail.com> Date: Mon, 2 Nov 2015 15:19:07 +0100 Subject: [PATCH] improved genome guessing --- dge_workflow/dge_utils.sh | 15 ++++++++++-- misc/guess_species_from_gtf.scalah | 37 ++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) create mode 100755 misc/guess_species_from_gtf.scalah diff --git a/dge_workflow/dge_utils.sh b/dge_workflow/dge_utils.sh index 9e25660..d2dd44f 100755 --- a/dge_workflow/dge_utils.sh +++ b/dge_workflow/dge_utils.sh @@ -384,12 +384,23 @@ mysub "${project}__cuffdiff" "$cdCmd" -q long -n 4 -R span[hosts=1] | blockScri tmpDbDir=$(mktemp -d) cp -r . $tmpDbDir -## todo remove this hack -genome=$(echo $gtfFile | cut -f8 -d'/' | tr '[:upper:]' '[:lower:]'); echo "genome is $genome" + +#genome=$(echo $gtfFile | cut -f8 -d'/' | tr '[:upper:]' '[:lower:]'); echo "genome is $genome" ## make sure to use temp-r to avoid file locking problems #export R_LIBS=/tmp/r_index + +genome=$(scala -e ' +val gtfFile = args(0); //val gtfFile="mm10_igenomes_pc.gtf" +val pattern = "mm10|mm9|hg19|zv9".r +println(pattern.findFirstIn(gtfFile).getOrElse("")) +' $(readlink -f $gtfFile) +) +echo $genome + + + echo ' require(cummeRbund) dbDir=commandArgs(T)[1] diff --git a/misc/guess_species_from_gtf.scalah b/misc/guess_species_from_gtf.scalah new file mode 100755 index 0000000..70188fd --- /dev/null +++ b/misc/guess_species_from_gtf.scalah @@ -0,0 +1,37 @@ +#!/bin/sh +exec scalas "$0" "$@" +!# + +/** Work in progress: Guess the species from a gtf file. By name first and then by file content. 
A more simplistic approach is already implemented in dge_workflow/dge_utils.sh
+ */
+
+import java.io.File
+import scala.io.Source
+
+// http://alvinalexander.com/scala/scala-shell-script-command-line-arguments-args
+val gtfFile = args(1) // NOTE(review): reads the second argument; confirm that scalas passes the script path as args(0)
+//val gtfFile="mm10_igenomes_pc.gtf"
+
+// Genome build ids looked for in the file name (same list as the inline guess in dge_workflow/dge_utils.sh).
+val pattern = "mm10|mm9|hg19|zv9".r
+val genomeByName = pattern.findFirstIn(gtfFile)
+
+if (genomeByName.isEmpty) {
+  System.exit(1) // no known build id in the file name: report failure through the exit status
+}
+
+/** Content-based guess: "mouse" if any line contains ENSMUSG, otherwise "dog" if any line contains ENSCAFG, otherwise None. */
+def guessFromContent(gtfFile: File): Option[String] = {
+  def mentions(id: String): Boolean = { // one scan per id; the handle is closed even if reading fails
+    val src = Source.fromFile(gtfFile)
+    try src.getLines().exists(_.contains(id)) finally src.close()
+  }
+  Seq("ENSMUSG" -> "mouse", "ENSCAFG" -> "dog").collectFirst { case (id, species) if mentions(id) => species }
+}
+
+// Placeholder for build-specific handling (work in progress).
+genomeByName.get match {
+  case "mm9" =>
+  case _ => // no special handling yet; the default case prevents a MatchError for mm10, hg19 and zv9
+}
+println(genomeByName.get)
\ No newline at end of file
-- 
GitLab