core_commons.R 10.6 KB
Newer Older
Holger Brandl's avatar
Holger Brandl committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
########################################################################################################################
## set a default cran r mirror  and customize environment

#cat(".Rprofile: Setting Goettingen repository\n")
r = getOption("repos") # hard code the UK repo for CRAN

r["CRAN"] = "http://ftp5.gwdg.de/pub/misc/cran/"
options(repos = r)
rm(r)


## user browser for help
options(help_type="html")

## plot more characters per line
options(width=150)

17
18
19
20
21
22
## adjust dplyr printing settings
## http://stackoverflow.com/questions/22471256/overriding-variables-not-shown-in-dplyr-to-display-all-columns-from-df
## http://stackoverflow.com/questions/29396111/dplyrtbl-df-fill-whole-screen
options(dplyr.print_max = 20) ## num rows
options(dplyr.width = 130) ## width

Holger Brandl's avatar
Holger Brandl committed
23
24
25
# for sqldf to avoid the use of tckl
options(gsubfn.engine = "R")

Holger Brandl's avatar
Holger Brandl committed
26

Holger Brandl's avatar
Holger Brandl committed
27
########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
28
## automatic package installation
Holger Brandl's avatar
Holger Brandl committed
29

30
require_auto <-  function(x){
Holger Brandl's avatar
Holger Brandl committed
31
32
33
34
35
    x <- as.character(substitute(x))

    if(isTRUE(x %in% .packages(all.available=TRUE))) {
        eval(parse(text=paste("require(", x, ",  quietly=T)", sep="")))
    } else {
36
        #        update.packages(ask=F) # update dependencies, if any.
Holger Brandl's avatar
Holger Brandl committed
37
38
39
40
41
42
43
        eval(parse(text=paste("install.packages('", x, "')", sep="")))
    }

    if(isTRUE(x %in% .packages(all.available=TRUE))) {
        eval(parse(text=paste("require(", x, ",  quietly=T)", sep="")))
    } else {
        source("http://bioconductor.org/biocLite.R")
44
        #        biocLite(character(), ask=FALSE) # update dependencies, if any.
45
        eval(parse(text=paste("biocLite('", x, "', ask=FALSE)", sep="")))
Holger Brandl's avatar
Holger Brandl committed
46
47
48
49
        eval(parse(text=paste("require(", x, ",  quietly=T)", sep="")))
    }
}

50
51
52
53
54
55
check_version = function(pkg_name, min_version) {
    cur_version = packageVersion(pkg_name)
    if(cur_version < min_version) stop(sprintf("Package %s needs a newer version,
               found %s, need at least %s", pkg_name, cur_version, min_version))
}
#check_version("dplyr", "0.4-1")
Holger Brandl's avatar
Holger Brandl committed
56

Holger Brandl's avatar
Holger Brandl committed
57
########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
58
## load core packages
Holger Brandl's avatar
Holger Brandl committed
59

60
61
62
63
require_auto(plyr)
require_auto(stringr)
require_auto(reshape2)
#require_auto(reshape2, quietly=T, warn.conflicts=F)
Holger Brandl's avatar
Holger Brandl committed
64

Holger Brandl's avatar
Holger Brandl committed
65
## load on purpose after plyr
66
67
68
require_auto(dplyr)
require_auto(magrittr)
require_auto(tidyr)
69

Holger Brandl's avatar
Holger Brandl committed
70
## needed for caching
71
require_auto(digest)
Holger Brandl's avatar
Holger Brandl committed
72
require_auto(readr)
73
74
75
76
77
78
79
80
require_auto(readxl)


## common plotting requirements since they are omnipresent
require_auto(ggplot2)
require_auto(scales)
require_auto(grid)

Holger Brandl's avatar
Holger Brandl committed
81
82
83
## for table exploration without using Rstudio
require_auto(DT)

Holger Brandl's avatar
Holger Brandl committed
84
85

## moved into datatable_commons because replaced almost everywhere with dplyr
86
#require_auto(data.table)
Holger Brandl's avatar
Holger Brandl committed
87
88
89
90
91




########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
92
#### Convenience aliases
Holger Brandl's avatar
Holger Brandl committed
93
94


Holger Brandl's avatar
Holger Brandl committed
95
echo <- function(...) cat(paste(...), fill=T)
Holger Brandl's avatar
Holger Brandl committed
96
97
98
99

ac <- function(...) as.character(...)

# string concatenation without space gaps (could/should use paste0 instead)
Holger Brandl's avatar
Holger Brandl committed
100
101
## Deprecated: use paste0 instead
#concat <- function(...) paste(..., sep="")
Holger Brandl's avatar
Holger Brandl committed
102
103
104
105
106

unlen <- function(x) length(unique(x))

pp <- function(dat) page(dat, method = "print")

Holger Brandl's avatar
Holger Brandl committed
107
108
as.df <- function(dt) as.data.frame(dt)

Holger Brandl's avatar
Holger Brandl committed
109

Holger Brandl's avatar
Holger Brandl committed
110
111
112
113
114
115
116
117
118
119
########################################################################################################################
#### data.frame manipulation


shuffle <- function(df) df[sample(nrow(df)),]

first <- function(x, n=1) head(x,n)



Holger Brandl's avatar
Holger Brandl committed
120
121
122
#vec2df <- function(namedVec) namedVec %>% {data.frame(name=names(.), value=., row.names=NULL)}
vec2df <- function(namedVec) data.frame(name=names(namedVec), value=namedVec, row.names=NULL)

Holger Brandl's avatar
Holger Brandl committed
123
124
125
126
127

rownames2column <- function(df, colname){
    df <- as.df(df)
    df$tttt <- rownames(df);
    rownames(df) <- NULL;
Holger Brandl's avatar
Holger Brandl committed
128
    plyr::rename(df, c(tttt=colname))
Holger Brandl's avatar
Holger Brandl committed
129
130
131
132
}


column2rownames<- function(df, colname){
133
    #browser()
Holger Brandl's avatar
Holger Brandl committed
134
135
136
    ## force into df to avoid dplyr problems
    df <- as.df(df)

Holger Brandl's avatar
Holger Brandl committed
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
    rownames(df) <- ac(df[,colname])
    df[colname] <- NULL
    return(df)
}

## pushing some columns to the right end of a data.frame
push_right <- function(df, pushColNames){
    df[,c(setdiff(names(df),pushColNames), pushColNames)]
}


push_left <- function(df, pushColNames){
    df[,c(pushColNames, setdiff(names(df),pushColNames))]
}


Holger Brandl's avatar
Holger Brandl committed
153
154
#http://astrostatistics.psu.edu/datasets/R/html/base/html/formals.html
set_names <- function(df, ...){
155
    #browser()
Holger Brandl's avatar
Holger Brandl committed
156
    newnames <- as.character(unlist(list(...)))
157
158
159
160

    ## automatically convert matrices to data.frames (otherwise the names set would fail
    if(is.matrix(df)) df %<>% as.data.frame()

161
    names(df) <- newnames;
Holger Brandl's avatar
Holger Brandl committed
162
163
    return(df)
}
Holger Brandl's avatar
Holger Brandl committed
164
165
#iris %>% set_names(c("setosa", "hallo")) %>% head
#iris %>% set_names("setosa", "hallo") %>% head
Holger Brandl's avatar
Holger Brandl committed
166
167


Holger Brandl's avatar
Holger Brandl committed
168
169
170
rify_names  <- function(df){ names(df) <- names(df) %>% str_replace_all(c(" "="_", "-"="_")); df}


Holger Brandl's avatar
Holger Brandl committed
171
172
head_html <- function(df, n=5) head(df, n) %>% knitr::kable(format="html") %>% print()

Holger Brandl's avatar
Holger Brandl committed
173
174
print_head <- function(df, desc=NULL){
    print(head(df))
Holger Brandl's avatar
Holger Brandl committed
175
    print(nrow(df))
Holger Brandl's avatar
Holger Brandl committed
176
177
178
179
180
    return(df)
}


fac2char <- function(mydata, convert=names(mydata)[sapply(mydata, is.factor)]){
Holger Brandl's avatar
Holger Brandl committed
181
182
183
    if(length(convert)==0){
        return(mydata)
    }
Holger Brandl's avatar
Holger Brandl committed
184
185
186
187
188
189
190
191
192
193
194
195
196

    inputColOrder <- names(mydata)

    convertData <- subset(mydata, select= names(mydata)%in%convert)
    convertData <- as.data.frame(lapply(convertData, as.character), stringsAsFactors = FALSE)

    keepData <-  subset(mydata, select=!(names(mydata)%in%convert))
    newdata <- cbind(convertData, keepData)
    newdata <- newdata[,inputColOrder]

    return(newdata)
}

Holger Brandl's avatar
Holger Brandl committed
197
## replace R within pipe change just use ... %>% do(replaceNA(0)) %>% ...
Holger Brandl's avatar
Holger Brandl committed
198
replaceNA <- function(x, withValue) { x[is.na(x)] <- withValue; x }
Holger Brandl's avatar
Holger Brandl committed
199

Holger Brandl's avatar
Holger Brandl committed
200

Holger Brandl's avatar
Holger Brandl committed
201

Holger Brandl's avatar
Holger Brandl committed
202
## todo still needed
Holger Brandl's avatar
Holger Brandl committed
203
safe_ifelse <- function(cond, yes, no) {
Holger Brandl's avatar
Holger Brandl committed
204
205
206
207
208
209
#browser()
    isfacOrChar <- function(x) class(x)  %in% c("factor", "character")

    if (isfacOrChar(yes) | isfacOrChar(no)) {
        yes <- ac(yes)
        no <- ac(no)
210
    }
Holger Brandl's avatar
Holger Brandl committed
211
212

    ifelse(cond,yes,no)
Holger Brandl's avatar
Holger Brandl committed
213
214
}

Holger Brandl's avatar
Holger Brandl committed
215

Holger Brandl's avatar
Holger Brandl committed
216
## for na instead use mutate_each with:
Holger Brandl's avatar
Holger Brandl committed
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#empty_as_na <- function(x) safe_ifelse(x=="", NA, x)
#empty_as_na <- function(x) ifelse(class(x)  %in% c("factor", "character") & x=="", NA, x)
empty_as_na <- function(x){
    if("factor" %in% class(x)) x <- as.character(x) ## since ifelse wont work with factors
    ifelse(as.character(x)!="", x, NA)
}

#if(F){ ## DEBUG of empty_as_na
#cond <- allJobs %>% head %$% submit_time %>% c("")
#empty_as_na( cond)
#cond <- allJobs %>% head %$% resubmission_of
#empty_as_na( cond)
#
#empty_as_na( c(1, 2, NA))
#empty_as_na( c("sdf", "sdf2", NA))
#empty_as_na( c("sdf", "sdf2", ""))
#
#myFac <- as.factor(c("sdf", "sdf2", NA))
#empty_as_na( myFac)
#ifelse(as.character(myFac)!="", myFac, NA)
#
#empty_as_na( c("sdf", "sdf2", ""))
#
#iris[1,1] <- ""
#apply(iris, 2, function(x) gsub("^$|^ $", NA, x))
#}


Holger Brandl's avatar
Holger Brandl committed
245
246
247
## see http://stackoverflow.com/questions/24172111/change-the-blank-cells-to-na/33952598#33952598


248
249
250
251
252
253
254
reload_dplyr <- function(){
    unloadNamespace('tidyr')
    unloadNamespace('dplyr')
    require(tidyr);require(dplyr)
}


Holger Brandl's avatar
Holger Brandl committed
255
256


Holger Brandl's avatar
Holger Brandl committed
257
258
259
260
## workaround for biomart
## Deprecated: load dplyr after biomart to avoid this problem
#dselect <- function(...) dplyr::select(...)

Holger Brandl's avatar
Holger Brandl committed
261

Holger Brandl's avatar
Holger Brandl committed
262
########################################################################################################################
Holger Brandl's avatar
Holger Brandl committed
263
#### Result Caching for long running tasks
Holger Brandl's avatar
Holger Brandl committed
264

Holger Brandl's avatar
Holger Brandl committed
265
266
## related: http://cran.r-project.org/web/packages/R.cache/R.cache.pdf

267
cache_it <- function(expr, filePrefix="cache"){
268
    cacheFile <- paste0(filePrefix, "_", substr(digest::digest(deparse(expr)), 1,6)) %>% paste0(".", ., ".RData")
Holger Brandl's avatar
Holger Brandl committed
269

Holger Brandl's avatar
Holger Brandl committed
270
271
272
273
274
275
276
277
278
279
280
    if(file.exists(cacheFile)){
        local(get(load(cacheFile)))
    } else {
        result <- eval(expr)
        save(result, file=cacheFile)
        result
    }
}

## Examples
#mydata <- quote(iris %>% filter(Species=="setosa")) %>% cache_it("tt")
Holger Brandl's avatar
Holger Brandl committed
281
282
#mydata <- quote(iris %>% filter(Species=="setosa")) %>% cache_it()
#mydata <- quote( { print("evaluate expr"); iris %>% filter(Species=="setosa") } ) %>% cache_it()
Holger Brandl's avatar
Holger Brandl committed
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310

########################################################################################################################
#### File System

is.directory <- function(dirname) !is.na(file.info(dirname)$isdir)


mcdir <- function(dirname){
    if(!file.exists(dirname)){
        dir.create(dirname)
    }

    setwd(dirname)
}

locload <- function(fileName) local(get(load(fileName)))


rmerge <- function(LDF, by, ...){
    DF <- LDF[[1]]
    for (i in 2:length(LDF)) {
        DF <- merge(DF, LDF[[i]], by=by)
    }
    DF
}



Holger Brandl's avatar
Holger Brandl committed
311
312
## Deprecated: use trim_ext instead
trimEnd <- function(fileNames, ...) trim_ext(fileNames, ...)
Holger Brandl's avatar
Holger Brandl committed
313

Holger Brandl's avatar
Holger Brandl committed
314
trim_ext <-function(fileNames, exts=c()){
Holger Brandl's avatar
Holger Brandl committed
315
316
317
318
319
320
321
    for(fileExt in exts){
        fileNames <- str_replace(fileNames, paste(fileExt, "$",sep=""), "")
    }

    fileNames
}

Holger Brandl's avatar
Holger Brandl committed
322
## DEPRECATED Use write_tsv instead
Holger Brandl's avatar
Holger Brandl committed
323
324
write.delim <- function(df, file, header=TRUE,...){
    write.table(df, file, row.names=FALSE, col.names=header, sep="\t", ...)
Holger Brandl's avatar
Holger Brandl committed
325
326
327
328
329
330
331
332
333
334
}

rmSomeElements <- function(vec, toDel) vec[!(vec %in% toDel)]

rmLastElement <- function(vec) vec[-length(vec)]


########################################################################################################################
## Memory management

Holger Brandl's avatar
Holger Brandl committed
335

Holger Brandl's avatar
Holger Brandl committed
336
# improved list of objects
Holger Brandl's avatar
Holger Brandl committed
337
338
lsos <- function (pos = 1, pattern, order.by, decreasing=FALSE, head=FALSE, n=5) {
    napply <- function(names, fn) sapply(names, function(x) fn(get(x, pos = pos)))
Holger Brandl's avatar
Holger Brandl committed
339
    names <- ls(pos = pos, pattern = pattern)
Holger Brandl's avatar
Holger Brandl committed
340

Holger Brandl's avatar
Holger Brandl committed
341
342
343
344
345
    obj.class <- napply(names, function(x) as.character(class(x))[1])
    obj.mode <- napply(names, mode)
    obj.type <- ifelse(is.na(obj.class), obj.mode, obj.class)
    obj.size <- napply(names, object.size)/1000000
    obj.dim <- t(napply(names, function(x)
Holger Brandl's avatar
Holger Brandl committed
346

347
    as.numeric(dim(x))[1:2]))
Holger Brandl's avatar
Holger Brandl committed
348
349
    vec <- is.na(obj.dim)[, 1] & (obj.type != "function")
    obj.dim[vec, 1] <- napply(names, length)[vec]
Holger Brandl's avatar
Holger Brandl committed
350

Holger Brandl's avatar
Holger Brandl committed
351
352
    out <- data.frame(obj.type, obj.size, obj.dim)
    names(out) <- c("Type", "Size", "Rows", "Columns")
Holger Brandl's avatar
Holger Brandl committed
353

Holger Brandl's avatar
Holger Brandl committed
354
    if (!missing(order.by))
355
    out <- out[order(out[[order.by]], decreasing=decreasing), ]
Holger Brandl's avatar
Holger Brandl committed
356
357
358

    if (head) out <- head(out, n)

Holger Brandl's avatar
Holger Brandl committed
359
360
361
362
363
    out <- transform(out, var_name=rownames(out))
    rownames(out) <- NULL
    arrange(out, Size)
}

Holger Brandl's avatar
Holger Brandl committed
364
# shorthand that just shows top 1 results
Holger Brandl's avatar
Holger Brandl committed
365
366
367
368
lsosh <- function(..., n=10) {
    lsos(..., order.by="Size", decreasing=TRUE, head=TRUE, n=n)
}

Holger Brandl's avatar
Holger Brandl committed
369
370
371
372
373
374
375
376
377
378
########################################################################################################################
### Statistics


## outlier handling
trim_outliers <- function(values, range=quantile(values, c(0.05, 0.95)))  pmax(range[1], pmin(range[2], values))

## use trim_outliers instead
#limit_range <- function(values, range)  pmax(range[1], pmin(range[2], values))

Holger Brandl's avatar
Holger Brandl committed
379
se <- function(x) sd(x, na.rm=TRUE) / sqrt(sum(!is.na(x)))