## Stat union lecture 24/03/2010 - Yizhar (Izzy) Toren ## Code editors: eclipse.org # line completion, environment structure, project management # requirements: eclipse, StatET (eclispe add-on), rJava (R package, optional) # recommended: R-java interface # a good guide: www.splusbook.com/RIntro/R_Eclipse_StatET.pdf ######################################################## ## quick review: apply functions ## apply: works on data.frames by column/row, returns ## lists or simplified data.frames n = 100 x1 = sample(x = c(1:5, NA), size = n, replace = T) x2 = sample(x = c(1:5, NA), size = n, replace = T) y = apply(cbind(x1, x2), 1, sum, na.rm = T) + rnorm(n, sd = 6) d = data.frame(y, x1, x2) print(d[1:5,]) apply(X = d, # apply the function in FUN row by row MARGIN= 1, FUN = sum # also accepts a text var. - "sum" ) apply(X = d, # process column by column MARGIN = 2, FUN = sum, # passing more parameters to the function # be mindful of naming - better to be specific to avoid # errors caused by parameter order in the function na.rm = T ) ## lapply / sapply (simplified) : works on lists, item by ## item, returns a list (or simplified data.frame) y = sapply( X = 1:10, FUN = # you are not limited to known function, and can create the # function "on the fly". however - functions are not saved # for future use (because of scoping) function(y) { return(c(y, y^2, log(y))) } ) print(t(y)) ## tapply / by ## tapply: process vectors/lists by subgroups, which are ## determined by a factor vector. ## by: the same just for data.frames d = data.frame( d, group = sample(1:2, size = dim(d)[1], replace = T) ) print(d[1:10,]) lms = by( data = d, # a vector of groups. coerced into "factor" INDICES = d[,'group'], function(X){return(lm(y ~ x1 + x2, data = X))} ) ## returns a list of objects print(lms) length(lms) attributes(lms) attr(lms, 'dim') # < dim(lms) # < lms[[1]] # an "lm' object class(lms[[1]]) ## and we use this as a normal "lm" object hist(residuals(lms[[1]])) ## A naive way: just extract the data ## preparation: # we transform the 'lm' object into a summary.lm object summary(lms[[1]]) attributes(summary(lms[[1]])) # extract a table called 'coefficients' summary(lms[[1]])[['coefficients']] class(summary(lms[[1]])[['coefficients']]) # and choose the column names 'Pr(>|t|)' summary(lms[[1]])[['coefficients']][,'Pr(>|t|)'] lm_p_table = NULL for (i in 1:length(lms)) { lm_p_table = rbind( lm_p_table, c(summary(lms[[i]])[['coefficients']][,'Pr(>|t|)']) ) } print(lm_p_table) ## A different approach - extract within the original ## function. this can be useful for very large numbers ## since it saves memory allocation lm_p_table = by( data = d, INDICES = d[,'group'], function(X){ return(c( summary(lm(y ~ x1 + x2, data = X))$coefficients[,'Pr(>|t|)'], unique(X$group) )) } ) print(lm_p_table) attributes(lm_p_table) ## convert into a data.frame, use "I" to return just the ## vector and 'sapply' does the rest t(sapply(lm_p_table,I)) ######################################################## ## Example 1: functional use of objects ## extracting attributes x = c(1:5, NA, 6:9, rep(NA,3), 15:-5) print(x) y = na.omit(x)# drop missing values print(y) ## print the possible attributes of the object y attributes(y) ## extract a specific attribute y2 = attr(y, 'na.action') class(y2) y2 # the same... but recalculates each time which(is.na(x)) x[!is.na(x)] which(is.na(x)) ## methods: "summary" # linear models lm1 = lm(y ~ x1 + x2, data = d) attributes(lm1) names(lm1) attr(lm1 ,'names') class(lm1) attr(lm1, 'class') # object class 'summary.lm' summary summary(lm1) attributes(summary(lm1)) # show all functions with summary methods summary # press 'tab' in native console to see all functions methods(summary) # error in printouts... summary.lm # ANOVA aov1 = aov(y ~ x1 + x2, data = d) attributes(aov1) summary(aov1) attributes(summary(aov1)) # does not give much... summary.aov # hidden functions d3 = na.omit(d)######################################################### ppr1 = ppr(x = d3[,c('x1','x2')], y = d3[,'y'], nterms = 2, na.action = 'omit') summary(ppr1) getAnywhere(summary.ppr) # the function summary.ppr is not directly available # How to find which methods apply to an object? # The function's documentation! ## another example: "print" print(1) print.by(lms) # thie parameter "vsep" appears only in documentation... print.by(lms, vsep = '*+*+*+*+*+*+*+*+*+*+*+*+*') ##################################################### ## Example 3: Expressions and evaluation: reading and using filters from a text file filters = c('x1 > 2', 'x2 > 3') as.logical(filters[1]) # returns NA parse(text = filters[1]) # creates object type 'expression' eval(parse(text = filters[1])) # evaluation d[,'x1'] eval(parse(text = "d$x1")) # some use of enviromnets rm(x1) eval(parse(text = filters[1])) # returns error, x1 is missing eval(parse(text = filters[1]), envir = d) # or d2 / ... ## Example 4: using different functions parametrically # sapply funcs = c(rep('length',5), rep('sum',5)) print(funcs) sapply(1:10, FUN = 'sum') sapply(1:10, FUN = 'length') sapply(1:10, FUN = funcs) # error d2 = data.frame(dat1 = 1:10, dat2 = c(1:7, NA, 9:10), fun = funcs) print(d2) ## a more 'constructed' way - do.call: ## accepts a string value and a 'list' of parameters for (i in 1:dim(d2)[1]) { tmp = do.call( as.character(d2[i,'fun']), args = list( x = as.numeric(d2[i,c('dat1', 'dat2')]) , na.rm = T ) ) print(tmp) } ## but - can't handle different parameters: ## 'na.rm = T' returns an error for length apply(d2, 1, function(y){ #print(y) y = as.list(y) .str = paste( y$fun, '(as.numeric(c(y$dat1, y$dat2))', # add the na.rm parameter by function type ifelse(y$fun == 'sum', ', na.rm = T', ''), ')', sep ='') .expr = parse(text = .str) class(.expr) return(eval(.expr)) } ) y = d2[1,] y = d2[8,] ######################################################### # A note on scoping: variables in R live in enviromnets. # each function() {...} clause creates an sun-environment # that inherits all of the variables. This works only one # way - variables created or manipulated within a sub-env # are not seen by the parent env. print(tmp) # for () {} works in the global env print(.str) # This is why we have to use the 'return(...)' clause at # the end of our functions. print(d2) remove_line = function(x, data) { d2 = d2[-x,] # just print index. will make no differance return(x) # returns d2 without line x #return(d2) } remove_line(1, d2) print(d2) ## in eclipse you can see the internal variables under "structure" ######################################################### # some more on environmets e1 = new.env() e2 = new.env() assign("x1", 1:10, envir = e1) assign("x1", 10:20, envir = e2) eval(parse(text = 'x1 > 5'), envir = e1) eval(parse(text = 'x1 > 5'), envir = e2)