From 31d828987145ad30efd66e50397b01b60c242d7d Mon Sep 17 00:00:00 2001
From: Holger Brandl <brandl@mpi-cbg.de>
Date: Thu, 14 Aug 2014 10:21:54 +0200
Subject: [PATCH] split up R utils

---
 R/core_commons.R           | 179 +++++++++++++++++++++++++++++++++++++
 R/datatable_utils.R        |  16 ++++
 R/df_utils.R               |  77 ++++++++++++++++
 R/{ => ggplot}/ggheatmap.R |   0
 R/ggplot/ggplot_utils.R    | 156 ++++++++++++++++++++++++++++++++
 R/ggplot_utils.R           |  41 ---------
 R/holger_utils.R           |  43 +++++++++
 R/plyr_utils.R             |   7 --
 8 files changed, 471 insertions(+), 48 deletions(-)
 create mode 100644 R/core_commons.R
 create mode 100644 R/datatable_utils.R
 create mode 100644 R/df_utils.R
 rename R/{ => ggplot}/ggheatmap.R (100%)
 create mode 100644 R/ggplot/ggplot_utils.R
 delete mode 100644 R/ggplot_utils.R
 create mode 100644 R/holger_utils.R
 delete mode 100644 R/plyr_utils.R

diff --git a/R/core_commons.R b/R/core_commons.R
new file mode 100644
index 0000000..fa8937b
--- /dev/null
+++ b/R/core_commons.R
@@ -0,0 +1,179 @@
+
+## automatic package installation
+require.auto <-  function(x){
+    x <- as.character(substitute(x))
+
+    if(isTRUE(x %in% .packages(all.available=TRUE))) {
+        eval(parse(text=paste("require(", x, ",  quietly=T)", sep="")))
+    } else {
+    #        update.packages(ask=F) # update dependencies, if any.
+        eval(parse(text=paste("install.packages('", x, "')", sep="")))
+    }
+
+    if(isTRUE(x %in% .packages(all.available=TRUE))) {
+        eval(parse(text=paste("require(", x, ",  quietly=T)", sep="")))
+    } else {
+        source("http://bioconductor.org/biocLite.R")
+        biocLite(character(), ask=FALSE) # update dependencies, if any.
+        eval(parse(text=paste("biocLite('", x, "')", sep="")))
+        eval(parse(text=paste("require(", x, ",  quietly=T)", sep="")))
+    }
+}
+
+require.auto(plyr)
+require.auto(stringr)
+require.auto(reshape2)
+#require.auto(reshape2, quietly=T, warn.conflicts=F)
+require.auto(scales)
+
+require.auto(dplyr)
+
+
+#require.auto(data.table)
+
+options(help_type="html")
+options(width=150)
+
+# for sqldf to avoid the use of tckl
+options(gsubfn.engine = "R")
+
+
+##### set the r mirror
+#cat(".Rprofile: Setting Goettingen repository\n")
+r = getOption("repos") # hard code the UK repo for CRAN

+r["CRAN"] = "http://ftp5.gwdg.de/pub/misc/cran/"
+options(repos = r)
+rm(r)
+
+
+
+
+#qlibrary <- function(libname) {library(as.character(substitute(libname)), quietly=T, warn.conflicts=F, character.only=T )}
+
+
+
+
+########################################################################################################################
+#### Small aliases
+
+
+#praste <- function(...) print(paste(...))
+echo <- function(...) print(paste(...))
+
+
+ac <- function(...) as.character(...)
+
+qns <- function() quit(save="no")
+
+
+# string concatenation without space gaps (could/should use paste0 instead)
+concat <- function(...) paste(..., sep="")
+
+
+se<-function(x)	sd(x, na.rm=TRUE) / sqrt(sum(!is.na(x)))
+
+unlen <- function(x) length(unique(x))
+
+pp <- function(dat) page(dat, method = "print")
+
+
+
+########################################################################################################################
+#### File System
+
+is.directory <- function(dirname) !is.na(file.info(dirname)$isdir)
+
+
+mcdir <- function(dirname){
+    if(!file.exists(dirname)){
+        dir.create(dirname)
+    }
+
+    setwd(dirname)
+}
+
+locload <- function(fileName) local(get(load(fileName)))
+
+
+rmerge <- function(LDF, by, ...){
+    DF <- LDF[[1]]
+    for (i in 2:length(LDF)) {
+        DF <- merge(DF, LDF[[i]], by=by)
+    }
+    DF
+}
+
+
+
+
+trimEnd <-function(fileNames, exts=c()){
+    for(fileExt in exts){
+        fileNames <- str_replace(fileNames, paste(fileExt, "$",sep=""), "")
+    }
+
+    fileNames
+}
+
+chopFileExt <-function(fileNames, exts=c()){
+    warning("this method is deprecated. use trimEnd instead")
+    for(fileExt in exts){
+        fileNames <- str_replace(fileNames, paste(".", fileExt, "$",sep=""), "")
+    }
+
+    fileNames
+}
+
+write.delim <- function(df, header=TRUE,...){
+    write.table(df, row.names=FALSE, col.names=header, sep="\t", ...)
+}
+
+## writes a table in bed format expecting columns being ordered according to bed spec already
+write.bed <- function(bedData, file){
+    write.table(bedData, file=file, quote=FALSE, sep ="\t", na="NA", row.names=FALSE, col.names=FALSE)
+}
+#options(scipen=100) ## necessary to disable scientific number formats for long integers
+
+
+
+
+rmSomeElements <- function(vec, toDel) vec[!(vec %in% toDel)]
+
+rmLastElement <- function(vec) vec[-length(vec)]
+
+
+########################################################################################################################
+#### File System
+
+## Memory management
+
+# improved list of objects
+lsos <- function (pos = 1, pattern, order.by,
+                        decreasing=FALSE, head=FALSE, n=5) {
+    napply <- function(names, fn) sapply(names, function(x)
+                                         fn(get(x, pos = pos)))
+    names <- ls(pos = pos, pattern = pattern)
+    obj.class <- napply(names, function(x) as.character(class(x))[1])
+    obj.mode <- napply(names, mode)
+    obj.type <- ifelse(is.na(obj.class), obj.mode, obj.class)
+    obj.size <- napply(names, object.size)/1000000
+    obj.dim <- t(napply(names, function(x)
+                        as.numeric(dim(x))[1:2]))
+    vec <- is.na(obj.dim)[, 1] & (obj.type != "function")
+    obj.dim[vec, 1] <- napply(names, length)[vec]
+    out <- data.frame(obj.type, obj.size, obj.dim)
+    names(out) <- c("Type", "Size", "Rows", "Columns")
+    if (!missing(order.by))
+        out <- out[order(out[[order.by]], decreasing=decreasing), ]
+    if (head)
+        out <- head(out, n)
+#        out
+    out <- transform(out, var_name=rownames(out))
+    rownames(out) <- NULL
+    arrange(out, Size)
+}
+
+# shorthand
+lsosh <- function(..., n=10) {
+    lsos(..., order.by="Size", decreasing=TRUE, head=TRUE, n=n)
+}
+
diff --git a/R/datatable_utils.R b/R/datatable_utils.R
new file mode 100644
index 0000000..3a7733d
--- /dev/null
+++ b/R/datatable_utils.R
@@ -0,0 +1,16 @@
+    require.auto(data.table)
+
+
+dt.merge <- function(dfA, dfB, by=intersect(names(dfA), names(dfB)) , ...)  {
+#    require(data.table)
+    as.df(merge(data.table(dfA, key=by), data.table(dfB, key=by), ...))
+#    unloadNameSpace(data.table)
+}
+
+
+
+#   http://stackoverflow.com/questions/11792527/filtering-out-duplicated-non-unique-rows-in-data-table
+unique_rows <- function(df, columns){
+
+    unique(setkeyv(data.table(df), columns)) %>% as.df()
+}
diff --git a/R/df_utils.R b/R/df_utils.R
new file mode 100644
index 0000000..1bf549a
--- /dev/null
+++ b/R/df_utils.R
@@ -0,0 +1,77 @@
+
+
+subsample <- function(df, sampleSize, ...){
+    df[sample(1:nrow(df), min(sampleSize, nrow(df)), ...),]
+}
+
+shuffle <- function(df) df[sample(nrow(df)),]
+
+first <- function(x, n=1) head(x,n)
+
+as.df <- function(dt) as.data.frame(dt)
+
+
+## small wrappers
+
+
+
+
+rownames2column <- function(df, colname){
+    df <- as.df(df)
+    df$tttt <- rownames(df);
+    rownames(df) <- NULL;
+    rename(df, c(tttt=colname))
+}
+
+column2rownames<- function(df, colname){
+#browser()
+    rownames(df) <- ac(df[,colname])
+    df[colname] <- NULL
+    return(df)
+}
+
+## pushing some columns to the right end of a data.frame
+push_right <- function(df, pushColNames){
+    df[,c(setdiff(names(df),pushColNames), pushColNames)]
+}
+
+push_left <- function(df, pushColNames){
+    df[,c(pushColNames, setdiff(names(df),pushColNames))]
+}
+
+set_names <- function(df, newnames){
+    df<- as.df(df)
+    names(df) <- newnames;
+    return(df)
+}
+
+
+## dplyr utilities
+
+print_head <- function(df, desc=NULL){
+    print(head(df))
+    return(df)
+}
+
+
+fac2char <- function(mydata, convert=names(mydata)[sapply(mydata, is.factor)]){
+    if(length(convert)==0)
+        return(mydata)
+
+    inputColOrder <- names(mydata)
+
+    convertData <- subset(mydata, select= names(mydata)%in%convert)
+    convertData <- as.data.frame(lapply(convertData, as.character), stringsAsFactors = FALSE)
+
+    keepData <-  subset(mydata, select=!(names(mydata)%in%convert))
+    newdata <- cbind(convertData, keepData)
+    newdata <- newdata[,inputColOrder]
+
+    return(newdata)
+}
+
+
+## workaround for biomart
+dselect <- function(...) dplyr::select(...)
+
+
diff --git a/R/ggheatmap.R b/R/ggplot/ggheatmap.R
similarity index 100%
rename from R/ggheatmap.R
rename to R/ggplot/ggheatmap.R
diff --git a/R/ggplot/ggplot_utils.R b/R/ggplot/ggplot_utils.R
new file mode 100644
index 0000000..b77ae14
--- /dev/null
+++ b/R/ggplot/ggplot_utils.R
@@ -0,0 +1,156 @@
+
+require.auto(ggplot2)
+
+
+
+plotPDF <- function(filename, expr){ pdf(paste0(filename, ".pdf")); expr; dev.off(); }
+#plotPDF("test", plot(1:10))
+
+
+# Multiple plot function
+#
+# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
+# - cols:   Number of columns in layout
+# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
+#
+# If the layout is something like matrix(c(1,2,3,3), nrow=2, by.row=TRUE),
+# then plot 1 will go in the upper left, 2 will go in the upper right, and
+# 3 will go all the way across the bottom.
+#
+multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
+  require(grid)
+
+  # Make a list from the ... arguments and plotlist
+  plots <- c(list(...), plotlist)
+
+  numPlots = length(plots)
+
+  # If layout is NULL, then use 'cols' to determine layout
+  if (is.null(layout)) {
+    # Make the panel
+    # ncol: Number of columns of plots
+    # nrow: Number of rows needed, calculated from # of cols
+    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
+                    ncol = cols, nrow = ceiling(numPlots/cols))
+  }
+
+ if (numPlots==1) {
+    print(plots[[1]])
+
+  } else {
+    # Set up the page
+    grid.newpage()
+    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
+
+    # Make each plot, in the correct location
+    for (i in 1:numPlots) {
+      # Get the i,j matrix positions of the regions that contain this subplot
+      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
+
+      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
+                                      layout.pos.col = matchidx$col))
+    }
+  }
+}
+
+rotXlab <- function() theme(axis.text.x = element_text(angle = 90, hjust = 1))
+
+
+gg2Format="png"
+
+ggsave2 <- function(gplot=last_plot(), width=8, height=6, prefix="", saveData=FALSE, outputFormat=gg2Format, ...){
+    title <- try(gplot$labels[["title"]])
+
+    if(is.null(title)){
+        varMapping <- gplot$labels
+        varMapping <- varMapping[names(varMapping) %in% c("x", "y")]
+#        if(varMapping==NULL){
+#            varMapping <- gplot$mapping;
+#        }
+
+        if(length(varMapping) == 1){
+            title= paste("distribution of", varMapping)
+        }else{
+            title = try(paste(varMapping, collapse=" vs "))
+        # stop("last plot had no title. Use ggsave() and give it a manual title")
+        }
+
+        rawFacetDesc <- format(gplot$facet)
+        if(rawFacetDesc!="facet_null()"){
+            title <- paste(title, "by", str_replace_all(str_match(rawFacetDesc, "facet_.*[(](.*))")[,2], "~", "and"))
+        }
+    }
+
+
+    fileBaseName <- ifelse(nchar(prefix)>0, concat(prefix, " - ", title), title)
+
+    ## clean up weired characters
+    fileBaseName <- str_replace_all(fileBaseName, "[$%/?]", "_")
+
+    fileName = concat(fileBaseName, concat(".", outputFormat))
+
+    ## remove line-breaks and trim spaces
+    fileName = str_replace_all(str_replace_all(fileName, "\\n", ""), "[ ]{2,}", " ")
+#    print(paste("saving plot to ", fileName))
+    ggsave(fileName, width=width, height=height, ...)
+
+    if(saveData){
+        write.delim(gplot$data, file= concat(fileBaseName, ".txt"))
+    }
+
+
+    ##todo review
+    LAST_GG2_OUTPUT <<- fileName
+
+    return(fileName)
+}
+
+### alternate over all open devices when running ggplot
+## disabled because it's causing problems if X11 device is not available
+#ggplot <- function(...){
+#    dev.set(dev.next())
+#    ggplot2::ggplot(...)
+#}
+
+
+scale_fill_redgreed <- function() scale_fill_manual(values = c("red","darkgreen"))
+
+
+########################################################################################################################
+### pca plots (http://largedata.blogspot.de/2011/07/plotting-pca-results-in-ggplot2.html)
+
+makePcaPlot <- function(x = getData(), group = NA, items=rownames(x), title = "") {
+  require(ggplot2)
+  require(RColorBrewer)
+
+#  data <- x
+#  data <- t(apply(data, 1, scale))
+#  rownames(data) <- rownames(x)
+#  colnames(data) <- colnames(x)
+#  mydata <- t(data)
+  mydata <-x
+  mydata.pca <- prcomp(mydata, retx=TRUE, center=TRUE, scale.=TRUE)
+
+  percent <- round((((mydata.pca$sdev)^2 / sum(mydata.pca$sdev^2))*100)[1:2])
+
+  scores <- mydata.pca$x
+  pc12 <- data.frame(PCA1=scores[,1], PCA2=scores[,2], group=group)
+
+#  ggplot(pc12, aes(PCA1, PCA2, colour = group)) + geom_point(size = 6, alpha = 3/4)
+  ggplot(pc12, aes(PCA1, PCA2, colour = group, label=items)) +
+    geom_point(size = 6, alpha = 3/4)   +
+    geom_text(size = 6, alpha = 3/4)    +
+    xlab(paste("PCA1 (", percent[2], "%)", sep = "")) +
+    ylab(paste("PCA2 (", percent[2], "%)", sep = ""))
+
+  qplot(PCA2, PCA1, geom="blank", main = title, xlab = paste("PCA2 (", percent[2], "%)", sep = ""), ylab = paste("PCA1 (", percent[1], "%)", sep = "")) +
+    geom_point(aes(colour = group), size = 6, alpha = 3/4)
+#    theme(
+#      axis.text.x = element_text(size = base_size * 1.3 , lineheight = 0.9, colour = "grey50", hjust = 1, angle = 90),
+#      axis.text.y = element_text(size = base_size * 1.3, lineheight = 0.9, colour = "grey50", hjust = 1)
+#    )
+}
+
+
+## example
+# makePcaPlot(getData(30,4,2,distort = 0.7))
diff --git a/R/ggplot_utils.R b/R/ggplot_utils.R
deleted file mode 100644
index 72fd7c1..0000000
--- a/R/ggplot_utils.R
+++ /dev/null
@@ -1,41 +0,0 @@
-
-
-
-########################################################################################################################
-### pca plots (http://largedata.blogspot.de/2011/07/plotting-pca-results-in-ggplot2.html)
-
-makePcaPlot <- function(x = getData(), group = NA, items=rownames(x), title = "") {
-  require(ggplot2)
-  require(RColorBrewer)
-
-#  data <- x
-#  data <- t(apply(data, 1, scale))
-#  rownames(data) <- rownames(x)
-#  colnames(data) <- colnames(x)
-#  mydata <- t(data)
-  mydata <-x
-  mydata.pca <- prcomp(mydata, retx=TRUE, center=TRUE, scale.=TRUE)
-
-  percent <- round((((mydata.pca$sdev)^2 / sum(mydata.pca$sdev^2))*100)[1:2])
-
-  scores <- mydata.pca$x
-  pc12 <- data.frame(PCA1=scores[,1], PCA2=scores[,2], group=group)
-
-#  ggplot(pc12, aes(PCA1, PCA2, colour = group)) + geom_point(size = 6, alpha = 3/4)
-  ggplot(pc12, aes(PCA1, PCA2, colour = group, label=items)) +
-    geom_point(size = 6, alpha = 3/4)   +
-    geom_text(size = 6, alpha = 3/4)    +
-    xlab(paste("PCA1 (", percent[2], "%)", sep = "")) +
-    ylab(paste("PCA2 (", percent[2], "%)", sep = ""))
-
-  qplot(PCA2, PCA1, geom="blank", main = title, xlab = paste("PCA2 (", percent[2], "%)", sep = ""), ylab = paste("PCA1 (", percent[1], "%)", sep = "")) +
-    geom_point(aes(colour = group), size = 6, alpha = 3/4)
-#    theme(
-#      axis.text.x = element_text(size = base_size * 1.3 , lineheight = 0.9, colour = "grey50", hjust = 1, angle = 90),
-#      axis.text.y = element_text(size = base_size * 1.3, lineheight = 0.9, colour = "grey50", hjust = 1)
-#    )
-}
-
-
-## example
-# makePcaPlot(getData(30,4,2,distort = 0.7))
diff --git a/R/holger_utils.R b/R/holger_utils.R
new file mode 100644
index 0000000..f3a1d96
--- /dev/null
+++ b/R/holger_utils.R
@@ -0,0 +1,43 @@
+
+xtab<- function(mydata){
+    tFile<-tempfile(fileext=".csv",tmpdir="~/tmp")
+    write.table(mydata, file=tFile, row.names=F, sep=",")
+
+    if(str_detect(R.version$platform, "linux")){
+        system(paste("ssh bioinfo-mac-6-wifi 'open -a /Applications/XTabulator.app/ /Volumes/bioinfo/tmp/", basename(tFile), "'", sep=""))
+    }else{
+        system(paste("open -a /Applications/XTabulator.app/ ", tFile))
+    }
+}
+
+xtabh <- function(mydata, rows=100) xtab(head(mydata, rows))
+
+
+## todo remove this
+TAB <- "\t"
+
+xls <- function(data, remoteDesktop="bioinfo-mac-6.mpi-cbg.de"){
+
+#    isRemote=str_detect(R.version$platform, "linux")
+
+
+    if(str_detect(R.version$platform, "linux")){
+        tFile<-tempfile(fileext=".tsv",tmpdir="~/tmp")
+
+        ## http://linux.icydog.net/ssh/piping.php
+        write.table(data, file=pipe(concat("ssh brandl@",remoteDesktop," 'cat - >",tFile,"'")), row.names=F, sep=TAB, quote=F)
+#        browser()
+
+#        system(paste("ssh ",remoteDesktop," 'open -a \"/Applications/_MPI_Applications/Microsoft Office 2011/Microsoft Excel.app\" ", tFile, "'", sep=""))
+        system(format(paste("ssh ",remoteDesktop," 'open -a \"/Applications/_MPI_Applications/Microsoft Office 2011/Microsoft Excel.app\" ", tFile, "'", sep="")))
+
+    }else{
+        tFile<-tempfile(fileext=".csv")
+        write.table(data, file=tFile, row.names=F, sep=",")
+        system(paste("open -a '/Applications/Microsoft Excel' ", tFile))
+    }
+}
+
+xlsh <- function(mydata, rows=100) xls(head(mydata, rows))
+
+
diff --git a/R/plyr_utils.R b/R/plyr_utils.R
deleted file mode 100644
index d0f1e0e..0000000
--- a/R/plyr_utils.R
+++ /dev/null
@@ -1,7 +0,0 @@
-
-require(data.table)
-
-#   http://stackoverflow.com/questions/11792527/filtering-out-duplicated-non-unique-rows-in-data-table
-unique_rows <- function(df, columns){
-    unique(setkeyv(data.table(df), columns)) %>% as.df()
-}
-- 
GitLab