From 31d828987145ad30efd66e50397b01b60c242d7d Mon Sep 17 00:00:00 2001 From: Holger Brandl <brandl@mpi-cbg.de> Date: Thu, 14 Aug 2014 10:21:54 +0200 Subject: [PATCH] split up R utils --- R/core_commons.R | 179 +++++++++++++++++++++++++++++++++++++ R/datatable_utils.R | 16 ++++ R/df_utils.R | 77 ++++++++++++++++ R/{ => ggplot}/ggheatmap.R | 0 R/ggplot/ggplot_utils.R | 156 ++++++++++++++++++++++++++++++++ R/ggplot_utils.R | 41 --------- R/holger_utils.R | 43 +++++++++ R/plyr_utils.R | 7 -- 8 files changed, 471 insertions(+), 48 deletions(-) create mode 100644 R/core_commons.R create mode 100644 R/datatable_utils.R create mode 100644 R/df_utils.R rename R/{ => ggplot}/ggheatmap.R (100%) create mode 100644 R/ggplot/ggplot_utils.R delete mode 100644 R/ggplot_utils.R create mode 100644 R/holger_utils.R delete mode 100644 R/plyr_utils.R diff --git a/R/core_commons.R b/R/core_commons.R new file mode 100644 index 0000000..fa8937b --- /dev/null +++ b/R/core_commons.R @@ -0,0 +1,179 @@ + +## automatic package installation +require.auto <- function(x){ + x <- as.character(substitute(x)) + + if(isTRUE(x %in% .packages(all.available=TRUE))) { + eval(parse(text=paste("require(", x, ", quietly=T)", sep=""))) + } else { + # update.packages(ask=F) # update dependencies, if any. + eval(parse(text=paste("install.packages('", x, "')", sep=""))) + } + + if(isTRUE(x %in% .packages(all.available=TRUE))) { + eval(parse(text=paste("require(", x, ", quietly=T)", sep=""))) + } else { + source("http://bioconductor.org/biocLite.R") + biocLite(character(), ask=FALSE) # update dependencies, if any. + eval(parse(text=paste("biocLite('", x, "')", sep=""))) + eval(parse(text=paste("require(", x, ", quietly=T)", sep=""))) + } +} + +require.auto(plyr) +require.auto(stringr) +require.auto(reshape2) +#require.auto(reshape2, quietly=T, warn.conflicts=F) +require.auto(scales) + +require.auto(dplyr) + + +#require.auto(data.table) + +options(help_type="html") +options(width=150) + +# for sqldf to avoid the use of tckl +options(gsubfn.engine = "R") + + +##### set the r mirror +#cat(".Rprofile: Setting Goettingen repository\n") +r = getOption("repos") # hard code the UK repo for CRAN
 +r["CRAN"] = "http://ftp5.gwdg.de/pub/misc/cran/" +options(repos = r) +rm(r) + + + + +#qlibrary <- function(libname) {library(as.character(substitute(libname)), quietly=T, warn.conflicts=F, character.only=T )} + + + + +######################################################################################################################## +#### Small aliases + + +#praste <- function(...) print(paste(...)) +echo <- function(...) print(paste(...)) + + +ac <- function(...) as.character(...) + +qns <- function() quit(save="no") + + +# string concatenation without space gaps (could/should use paste0 instead) +concat <- function(...) paste(..., sep="") + + +se<-function(x) sd(x, na.rm=TRUE) / sqrt(sum(!is.na(x))) + +unlen <- function(x) length(unique(x)) + +pp <- function(dat) page(dat, method = "print") + + + +######################################################################################################################## +#### File System + +is.directory <- function(dirname) !is.na(file.info(dirname)$isdir) + + +mcdir <- function(dirname){ + if(!file.exists(dirname)){ + dir.create(dirname) + } + + setwd(dirname) +} + +locload <- function(fileName) local(get(load(fileName))) + + +rmerge <- function(LDF, by, ...){ + DF <- LDF[[1]] + for (i in 2:length(LDF)) { + DF <- merge(DF, LDF[[i]], by=by) + } + DF +} + + + + +trimEnd <-function(fileNames, exts=c()){ + for(fileExt in exts){ + fileNames <- str_replace(fileNames, paste(fileExt, "$",sep=""), "") + } + + fileNames +} + +chopFileExt <-function(fileNames, exts=c()){ + warning("this method is deprecated. use trimEnd instead") + for(fileExt in exts){ + fileNames <- str_replace(fileNames, paste(".", fileExt, "$",sep=""), "") + } + + fileNames +} + +write.delim <- function(df, header=TRUE,...){ + write.table(df, row.names=FALSE, col.names=header, sep="\t", ...) +} + +## writes a table in bed format expecting columns being ordered according to bed spec already +write.bed <- function(bedData, file){ + write.table(bedData, file=file, quote=FALSE, sep ="\t", na="NA", row.names=FALSE, col.names=FALSE) +} +#options(scipen=100) ## necessary to disable scientific number formats for long integers + + + + +rmSomeElements <- function(vec, toDel) vec[!(vec %in% toDel)] + +rmLastElement <- function(vec) vec[-length(vec)] + + +######################################################################################################################## +#### File System + +## Memory management + +# improved list of objects +lsos <- function (pos = 1, pattern, order.by, + decreasing=FALSE, head=FALSE, n=5) { + napply <- function(names, fn) sapply(names, function(x) + fn(get(x, pos = pos))) + names <- ls(pos = pos, pattern = pattern) + obj.class <- napply(names, function(x) as.character(class(x))[1]) + obj.mode <- napply(names, mode) + obj.type <- ifelse(is.na(obj.class), obj.mode, obj.class) + obj.size <- napply(names, object.size)/1000000 + obj.dim <- t(napply(names, function(x) + as.numeric(dim(x))[1:2])) + vec <- is.na(obj.dim)[, 1] & (obj.type != "function") + obj.dim[vec, 1] <- napply(names, length)[vec] + out <- data.frame(obj.type, obj.size, obj.dim) + names(out) <- c("Type", "Size", "Rows", "Columns") + if (!missing(order.by)) + out <- out[order(out[[order.by]], decreasing=decreasing), ] + if (head) + out <- head(out, n) +# out + out <- transform(out, var_name=rownames(out)) + rownames(out) <- NULL + arrange(out, Size) +} + +# shorthand +lsosh <- function(..., n=10) { + lsos(..., order.by="Size", decreasing=TRUE, head=TRUE, n=n) +} + diff --git a/R/datatable_utils.R b/R/datatable_utils.R new file mode 100644 index 0000000..3a7733d --- /dev/null +++ b/R/datatable_utils.R @@ -0,0 +1,16 @@ + require.auto(data.table) + + +dt.merge <- function(dfA, dfB, by=intersect(names(dfA), names(dfB)) , ...) { +# require(data.table) + as.df(merge(data.table(dfA, key=by), data.table(dfB, key=by), ...)) +# unloadNameSpace(data.table) +} + + + +# http://stackoverflow.com/questions/11792527/filtering-out-duplicated-non-unique-rows-in-data-table +unique_rows <- function(df, columns){ + + unique(setkeyv(data.table(df), columns)) %>% as.df() +} diff --git a/R/df_utils.R b/R/df_utils.R new file mode 100644 index 0000000..1bf549a --- /dev/null +++ b/R/df_utils.R @@ -0,0 +1,77 @@ + + +subsample <- function(df, sampleSize, ...){ + df[sample(1:nrow(df), min(sampleSize, nrow(df)), ...),] +} + +shuffle <- function(df) df[sample(nrow(df)),] + +first <- function(x, n=1) head(x,n) + +as.df <- function(dt) as.data.frame(dt) + + +## small wrappers + + + + +rownames2column <- function(df, colname){ + df <- as.df(df) + df$tttt <- rownames(df); + rownames(df) <- NULL; + rename(df, c(tttt=colname)) +} + +column2rownames<- function(df, colname){ +#browser() + rownames(df) <- ac(df[,colname]) + df[colname] <- NULL + return(df) +} + +## pushing some columns to the right end of a data.frame +push_right <- function(df, pushColNames){ + df[,c(setdiff(names(df),pushColNames), pushColNames)] +} + +push_left <- function(df, pushColNames){ + df[,c(pushColNames, setdiff(names(df),pushColNames))] +} + +set_names <- function(df, newnames){ + df<- as.df(df) + names(df) <- newnames; + return(df) +} + + +## dplyr utilities + +print_head <- function(df, desc=NULL){ + print(head(df)) + return(df) +} + + +fac2char <- function(mydata, convert=names(mydata)[sapply(mydata, is.factor)]){ + if(length(convert)==0) + return(mydata) + + inputColOrder <- names(mydata) + + convertData <- subset(mydata, select= names(mydata)%in%convert) + convertData <- as.data.frame(lapply(convertData, as.character), stringsAsFactors = FALSE) + + keepData <- subset(mydata, select=!(names(mydata)%in%convert)) + newdata <- cbind(convertData, keepData) + newdata <- newdata[,inputColOrder] + + return(newdata) +} + + +## workaround for biomart +dselect <- function(...) dplyr::select(...) + + diff --git a/R/ggheatmap.R b/R/ggplot/ggheatmap.R similarity index 100% rename from R/ggheatmap.R rename to R/ggplot/ggheatmap.R diff --git a/R/ggplot/ggplot_utils.R b/R/ggplot/ggplot_utils.R new file mode 100644 index 0000000..b77ae14 --- /dev/null +++ b/R/ggplot/ggplot_utils.R @@ -0,0 +1,156 @@ + +require.auto(ggplot2) + + + +plotPDF <- function(filename, expr){ pdf(paste0(filename, ".pdf")); expr; dev.off(); } +#plotPDF("test", plot(1:10)) + + +# Multiple plot function +# +# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects) +# - cols: Number of columns in layout +# - layout: A matrix specifying the layout. If present, 'cols' is ignored. +# +# If the layout is something like matrix(c(1,2,3,3), nrow=2, by.row=TRUE), +# then plot 1 will go in the upper left, 2 will go in the upper right, and +# 3 will go all the way across the bottom. +# +multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) { + require(grid) + + # Make a list from the ... arguments and plotlist + plots <- c(list(...), plotlist) + + numPlots = length(plots) + + # If layout is NULL, then use 'cols' to determine layout + if (is.null(layout)) { + # Make the panel + # ncol: Number of columns of plots + # nrow: Number of rows needed, calculated from # of cols + layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), + ncol = cols, nrow = ceiling(numPlots/cols)) + } + + if (numPlots==1) { + print(plots[[1]]) + + } else { + # Set up the page + grid.newpage() + pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) + + # Make each plot, in the correct location + for (i in 1:numPlots) { + # Get the i,j matrix positions of the regions that contain this subplot + matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) + + print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, + layout.pos.col = matchidx$col)) + } + } +} + +rotXlab <- function() theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + +gg2Format="png" + +ggsave2 <- function(gplot=last_plot(), width=8, height=6, prefix="", saveData=FALSE, outputFormat=gg2Format, ...){ + title <- try(gplot$labels[["title"]]) + + if(is.null(title)){ + varMapping <- gplot$labels + varMapping <- varMapping[names(varMapping) %in% c("x", "y")] +# if(varMapping==NULL){ +# varMapping <- gplot$mapping; +# } + + if(length(varMapping) == 1){ + title= paste("distribution of", varMapping) + }else{ + title = try(paste(varMapping, collapse=" vs ")) + # stop("last plot had no title. Use ggsave() and give it a manual title") + } + + rawFacetDesc <- format(gplot$facet) + if(rawFacetDesc!="facet_null()"){ + title <- paste(title, "by", str_replace_all(str_match(rawFacetDesc, "facet_.*[(](.*))")[,2], "~", "and")) + } + } + + + fileBaseName <- ifelse(nchar(prefix)>0, concat(prefix, " - ", title), title) + + ## clean up weired characters + fileBaseName <- str_replace_all(fileBaseName, "[$%/?]", "_") + + fileName = concat(fileBaseName, concat(".", outputFormat)) + + ## remove line-breaks and trim spaces + fileName = str_replace_all(str_replace_all(fileName, "\\n", ""), "[ ]{2,}", " ") +# print(paste("saving plot to ", fileName)) + ggsave(fileName, width=width, height=height, ...) + + if(saveData){ + write.delim(gplot$data, file= concat(fileBaseName, ".txt")) + } + + + ##todo review + LAST_GG2_OUTPUT <<- fileName + + return(fileName) +} + +### alternate over all open devices when running ggplot +## disabled because it's causing problems if X11 device is not available +#ggplot <- function(...){ +# dev.set(dev.next()) +# ggplot2::ggplot(...) +#} + + +scale_fill_redgreed <- function() scale_fill_manual(values = c("red","darkgreen")) + + +######################################################################################################################## +### pca plots (http://largedata.blogspot.de/2011/07/plotting-pca-results-in-ggplot2.html) + +makePcaPlot <- function(x = getData(), group = NA, items=rownames(x), title = "") { + require(ggplot2) + require(RColorBrewer) + +# data <- x +# data <- t(apply(data, 1, scale)) +# rownames(data) <- rownames(x) +# colnames(data) <- colnames(x) +# mydata <- t(data) + mydata <-x + mydata.pca <- prcomp(mydata, retx=TRUE, center=TRUE, scale.=TRUE) + + percent <- round((((mydata.pca$sdev)^2 / sum(mydata.pca$sdev^2))*100)[1:2]) + + scores <- mydata.pca$x + pc12 <- data.frame(PCA1=scores[,1], PCA2=scores[,2], group=group) + +# ggplot(pc12, aes(PCA1, PCA2, colour = group)) + geom_point(size = 6, alpha = 3/4) + ggplot(pc12, aes(PCA1, PCA2, colour = group, label=items)) + + geom_point(size = 6, alpha = 3/4) + + geom_text(size = 6, alpha = 3/4) + + xlab(paste("PCA1 (", percent[2], "%)", sep = "")) + + ylab(paste("PCA2 (", percent[2], "%)", sep = "")) + + qplot(PCA2, PCA1, geom="blank", main = title, xlab = paste("PCA2 (", percent[2], "%)", sep = ""), ylab = paste("PCA1 (", percent[1], "%)", sep = "")) + + geom_point(aes(colour = group), size = 6, alpha = 3/4) +# theme( +# axis.text.x = element_text(size = base_size * 1.3 , lineheight = 0.9, colour = "grey50", hjust = 1, angle = 90), +# axis.text.y = element_text(size = base_size * 1.3, lineheight = 0.9, colour = "grey50", hjust = 1) +# ) +} + + +## example +# makePcaPlot(getData(30,4,2,distort = 0.7)) diff --git a/R/ggplot_utils.R b/R/ggplot_utils.R deleted file mode 100644 index 72fd7c1..0000000 --- a/R/ggplot_utils.R +++ /dev/null @@ -1,41 +0,0 @@ - - - -######################################################################################################################## -### pca plots (http://largedata.blogspot.de/2011/07/plotting-pca-results-in-ggplot2.html) - -makePcaPlot <- function(x = getData(), group = NA, items=rownames(x), title = "") { - require(ggplot2) - require(RColorBrewer) - -# data <- x -# data <- t(apply(data, 1, scale)) -# rownames(data) <- rownames(x) -# colnames(data) <- colnames(x) -# mydata <- t(data) - mydata <-x - mydata.pca <- prcomp(mydata, retx=TRUE, center=TRUE, scale.=TRUE) - - percent <- round((((mydata.pca$sdev)^2 / sum(mydata.pca$sdev^2))*100)[1:2]) - - scores <- mydata.pca$x - pc12 <- data.frame(PCA1=scores[,1], PCA2=scores[,2], group=group) - -# ggplot(pc12, aes(PCA1, PCA2, colour = group)) + geom_point(size = 6, alpha = 3/4) - ggplot(pc12, aes(PCA1, PCA2, colour = group, label=items)) + - geom_point(size = 6, alpha = 3/4) + - geom_text(size = 6, alpha = 3/4) + - xlab(paste("PCA1 (", percent[2], "%)", sep = "")) + - ylab(paste("PCA2 (", percent[2], "%)", sep = "")) - - qplot(PCA2, PCA1, geom="blank", main = title, xlab = paste("PCA2 (", percent[2], "%)", sep = ""), ylab = paste("PCA1 (", percent[1], "%)", sep = "")) + - geom_point(aes(colour = group), size = 6, alpha = 3/4) -# theme( -# axis.text.x = element_text(size = base_size * 1.3 , lineheight = 0.9, colour = "grey50", hjust = 1, angle = 90), -# axis.text.y = element_text(size = base_size * 1.3, lineheight = 0.9, colour = "grey50", hjust = 1) -# ) -} - - -## example -# makePcaPlot(getData(30,4,2,distort = 0.7)) diff --git a/R/holger_utils.R b/R/holger_utils.R new file mode 100644 index 0000000..f3a1d96 --- /dev/null +++ b/R/holger_utils.R @@ -0,0 +1,43 @@ + +xtab<- function(mydata){ + tFile<-tempfile(fileext=".csv",tmpdir="~/tmp") + write.table(mydata, file=tFile, row.names=F, sep=",") + + if(str_detect(R.version$platform, "linux")){ + system(paste("ssh bioinfo-mac-6-wifi 'open -a /Applications/XTabulator.app/ /Volumes/bioinfo/tmp/", basename(tFile), "'", sep="")) + }else{ + system(paste("open -a /Applications/XTabulator.app/ ", tFile)) + } +} + +xtabh <- function(mydata, rows=100) xtab(head(mydata, rows)) + + +## todo remove this +TAB <- "\t" + +xls <- function(data, remoteDesktop="bioinfo-mac-6.mpi-cbg.de"){ + +# isRemote=str_detect(R.version$platform, "linux") + + + if(str_detect(R.version$platform, "linux")){ + tFile<-tempfile(fileext=".tsv",tmpdir="~/tmp") + + ## http://linux.icydog.net/ssh/piping.php + write.table(data, file=pipe(concat("ssh brandl@",remoteDesktop," 'cat - >",tFile,"'")), row.names=F, sep=TAB, quote=F) +# browser() + +# system(paste("ssh ",remoteDesktop," 'open -a \"/Applications/_MPI_Applications/Microsoft Office 2011/Microsoft Excel.app\" ", tFile, "'", sep="")) + system(format(paste("ssh ",remoteDesktop," 'open -a \"/Applications/_MPI_Applications/Microsoft Office 2011/Microsoft Excel.app\" ", tFile, "'", sep=""))) + + }else{ + tFile<-tempfile(fileext=".csv") + write.table(data, file=tFile, row.names=F, sep=",") + system(paste("open -a '/Applications/Microsoft Excel' ", tFile)) + } +} + +xlsh <- function(mydata, rows=100) xls(head(mydata, rows)) + + diff --git a/R/plyr_utils.R b/R/plyr_utils.R deleted file mode 100644 index d0f1e0e..0000000 --- a/R/plyr_utils.R +++ /dev/null @@ -1,7 +0,0 @@ - -require(data.table) - -# http://stackoverflow.com/questions/11792527/filtering-out-duplicated-non-unique-rows-in-data-table -unique_rows <- function(df, columns){ - unique(setkeyv(data.table(df), columns)) %>% as.df() -} -- GitLab