cont. mf deseq

b485c9e1 · Holger Brandl · ffc16c40 · b485c9e1 · b485c9e1
Commit b485c9e1 authored 8 years ago by Holger Brandl
--- a/dge_workflow/featcounts_deseq_mf.R
+++ b/dge_workflow/featcounts_deseq_mf.R
@@ -14,6 +14,7 @@ Options:
 --pcutoff <pcutoff>    Override q-value filter and filter by p-value instead
 --min_count <min_count>   Minimal expression in any of the sample to be included in the final result list [default: 10]
 --project <project_prefix>   Name to prefix all generated result files [default: ]
+--design <exp_design>       Name to prefix all generated result files [default: sample]
 '

 opts <- docopt(doc, commandArgs(TRUE))
@@ -111,7 +112,7 @@ if(!is.null(contrasts_file)){


 #' The used design matrix is
-contrasts %>% kable()
+replicates2samples %>% kable()

 #' The contrasts of interest are
 contrasts %>% kable()
@@ -130,23 +131,33 @@ contrasts %>% kable()
 #colData <- data.frame(condition=colnames(countMatrix) %>% get_sample_from_replicate)

 ## new mf-approach
-colData <- replicates2samples %>%data.frame(replciate=colnames(countMatrix) %>% get_sample_from_replicate) %>%
-    arrange(replicate)
-colData %<>% rename(sample, sample_2ndwt)
+colData <- data_frame(replicate=colnames(countMatrix)) %>% mutate(col_index=row_number()) %>%
+    full_join(replicates2samples, by="replicate") %>%
+    arrange(col_index) #%>% transmute(condition=sample_2ndwt) %>% fac2char
+
+## infer the sample to be used from the formula
+#designFormula <- "sample_2ndwt + batch"
+designFormula <- opts$design
+
+## consider last one as sample attribute
+sampleAttribute <- str_split(designFormula, " ")
+colData
+
+#colData %<>% rename(sample, sample_2ndwt)

 #colData <- replicates2samples %>%  arrange(colnames(countMatrix))

 # valiadate that contrasts model is compatible with data

-%<>%
-if(!all((contrasts %>% gather %$% value %>% unique) %in% colData$condition)){
-    echo("column model: ",colData$condition)
-    echo("contrasts: ", contrasts %>% gather %$% value %>% unique)
-    stop("column model is not consistent with contrasts")
-}
+#if(!all((contrasts %>% gather %$% value %>% unique) %in% colData$condition)){
+#    echo("column model: ",colData$condition)
+#    echo("contrasts: ", contrasts %>% gather %$% value %>% unique)
+#    stop("column model is not consistent with contrasts")
+#}
 #stopifnot(all((contrasts %>% gather %$% value %>% unique) %in% colData$condition))

-dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(~ condition))
+#http://www.cookbook-r.com/Formulas/Creating_a_formula_from_a_string/
+dds <- DESeqDataSetFromMatrix(countMatrix, colData, formula(as.formula("~ batch + sample_2ndwt")))
 #dds <- estimateSizeFactors(dds)
 #dds <- estimateDispersions(dds)
 dds <- DESeq(dds)
@@ -177,7 +188,7 @@ plotDispEsts(dds, main="Dispersion plot")
 # Regularized log transformation for clustering/heatmaps, etc
 rld <- rlogTransformation(dds)

-plotPCA(rld, intgroup = "condition")
+plotPCA(rld, intgroup = "sample")


 #' The Euclidean distance between samples are calculated after performing the regularized log transformation.

--- a/dge_workflow/star_align.kts
+++ b/dge_workflow/star_align.kts
 #!/usr/bin/env kscript
-//DEPS com.offbytwo:docopt:0.6.0.20150202 de.mpicbg.scicomp:joblist:0.6 de.mpicbg.scicomp.bioinfo:kutils:0.1-SNAPSHOT
+//DEPS de.mpicbg.scicomp:joblist:0.6 de.mpicbg.scicomp.bioinfo:kutils:0.1-SNAPSHOT


 // add docopts to local m2 index
@@ -92,6 +92,9 @@ for (fastqFile in fastqFiles) {
    val fastqBaseName = fastqFile.name.removeSuffix(".gz").removeSuffix(".fastq")
    val optionalZcat = if (fastqFile.name.endsWith("gz")) "--readFilesCommand zcat" else ""

+    // todo consider to use --outTmpDir which defaults to outFileNamePrefix STARtmp and which is deleted automatically
+    // or use process substiutation in case of zipped reads (see https://github.com/alexdobin/STAR/issues/143#issuecomment-216597465)
+
    val cmd = """
    STAR --genomeDir $star_index --readFilesIn $fastqFile --runThreadN 6 ${optionalZcat} --outFileNamePrefix ${fastqBaseName}. --outSAMtype BAM SortedByCoordinate --outSAMstrandField intronMotif --sjdbGTFfile $gtfFile --outFilterIntronMotifs   RemoveNoncanonicalUnannotated --outFilterType BySJout --quantMode GeneCounts --outFilterMultimapNmax 1 --outSJfilterCountUniqueMin 8 3 3 3
    mv ${fastqBaseName}.Aligned.sortedByCoord.out.bam ${fastqBaseName}.bam
@@ -101,7 +104,7 @@ for (fastqFile in fastqFiles) {
    // todo provide proper walltime here
    // slurm memory limit https://rc.fas.harvard.edu/resources/documentation/slurm-memory/
    // sacct -o MaxRSS -j JOBID
-    jl.run(JobConfiguration(cmd, "star__${fastqBaseName}", "", "medium", 5, 0, "",  better.files.File(File(".").toPath())))
+    jl.run(JobConfiguration(cmd, "star__${fastqBaseName}", "10:00", "", 5, 40000, "",  better.files.File(File(".").toPath())))
 }