# R in Action (2nd ed): Chapter 20 # Advanced R programming # requires packages ggplot2, reshape, foreach, doParallel # install.packages(c("ggplot2", "reshape", "foreach", "doParallel")) #------------------------------------------------------------------- # data types passed <- c(TRUE, TRUE, FALSE, TRUE) ages <- c(15, 18, 25, 14, 19) cmplxNums <- c(1+2i, 0+1i, 39+3i, 12+2i) names <- c("Bob", "Ted", "Carol", "Alice") # matrices x <- c(1,2,3,4,5,6,7,8) class(x) print(x) attr(x, "dim") <- c(2,4) print(x) class(x) attributes(x) attr(x, "dimnames") <- list(c("A1", "A2"), c("B1", "B2", "B3", "B4")) print(x) attr(x, "dim") <- NULL class(x) print(x) # generic vectors (lists) head(iris) unclass(iris) attributes(iris) fit <- kmeans(iris[1:4], 3) names(fit) unclass(fit) sapply(fit, class) # indexing atomic vectors x <- c(20, 30, 40) x[3] x[c(2,3)] x <- c(A=20, B=30, C=40) x[c(2,3)] x[c("B", "C")] # indexing lists fit[c(2,7)] fit[2] fit[[2]] fit$centers fit[[2]][1,] fit$centers$Petal.Width # should give an error # plotting the centroides from a k-mean cluster analysis fit <- kmeans(iris[1:4], 3) means <- fit$centers library(reshape2) dfm <- melt(means) names(dfm) <- c("Cluster", "Measurement", "Centimeters") dfm$Cluster <- factor(dfm$Cluster) head(dfm) library(ggplot2) ggplot(data=dfm, aes(x=Measurement, y=Centimeters, group=Cluster)) + geom_point(size=3, aes(shape=Cluster, color=Cluster)) + geom_line(size=1, aes(color=Cluster)) + ggtitle("Profiles for Iris Clusters") # control structures for(i in 1:5) print(1:i) for(i in 5:1)print(1:i) pvalues <- c(.0867, .0018, .0054, .1572, .0183, .5386) results <- ifelse(pvalues <.05, "Significant", "Not Significant") results pvalues <- c(.0867, .0018, .0054, .1572, .0183, .5386) results <- vector(mode="character", length=length(pvalues)) for(i in 1:length(pvalues)){ if (pvalues[i] < .05) results[i] <- "Significant" else results[i] <- "Not Significant" } # creating functions f <- function(x, y, z=1){ result <- x + (2*y) + (3*z) return(result) } f(2,3,4) f(2,3) f(x=2, y=3) f(z=4, y=2, 3) args(f) # object scope x <- 2 y <- 3 z <- 4 f <- function(w){ z <- 2 x <- w*y*z return(x) } f(x) x y z # working with environments x <- 5 myenv <- new.env() assign("x", "Homer", env=myenv) ls() ls(myenv) x get("x", env=myenv) myenv <- new.env() myenv$x <- "Homer" myenv$x parent.env(mynewenv) # function closures trim <- function(p){ trimit <- function(x){ n <- length(x) lo <- floor(n*p) + 1 hi <- n + 1 - lo x <- sort.int(x, partial = unique(c(lo, hi)))[lo:hi] } trimit } x <- 1:10 trim10pct <- trim(.1) y <- trim10pct(x) y trim20pct <- trim(.2) y <- trim20pct(x) y ls(environment(trim10pct)) get("p", env=environment(trim10pct)) makeFunction <- function(k){ f <- function(x){ print(x + k) } } g <- makeFunction(10) g (4) k <- 2 g (5) ls(environment(g)) environment(g)$k # an example of a generic function mymethod <- function(x, ...) UseMethod("mymethod") mymethod.a <- function(x) print("Using A") mymethod.b <- function(x) print("Using B") mymethod.default <- function(x) print("Using Default") x <- 1:5 y <- 6:10 z <- 10:15 class(x) <- "a" class(y) <- "b" mymethod(x) mymethod(y) mymethod(z) class(z) <- c("a", "b") mymethod(z) class(z) <- c("c", "a", "b") mymethod(z) # vectorization and efficient code set.seed(1234) mymatrix <- matrix(rnorm(10000000), ncol=10) accum <- function(x){ sums <- numeric(ncol(x)) for (i in 1:ncol(x)){ for(j in 1:nrow(x)){ sums[i] <- sums[i] + x[j,i] } } } system.time(accum(mymatrix)) # using loops system.time(colSums(mymatrix)) # using vectorization # correctly size objects set.seed(1234) k <- 100000 x <- rnorm(k) y <- 0 system.time(for (i in 1:length(x)) y[i] <- x[i]^2) y <- numeric(k) system.time(for (i in 1:k) y[i] <- x[i]^2) y <- numeric(k) system.time(y <- x^2) # Parallelization with foreach and doParallel library(foreach) library(doParallel) registerDoParallel(cores=4) eig <- function(n, p){ x <- matrix(rnorm(100000), ncol=100) r <- cor(x) eigen(r)$values } n <- 1000000 p <- 100 k <- 500 system.time( x <- foreach(i=1:k, .combine=rbind) %do% eig(n, p) ) system.time( x <- foreach(i=1:k, .combine=rbind) %dopar% eig(n, p) ) # debugging mtcars$Transmission <- factor(mtcars$a, levels=c(1,2), labels=c("Automatic", "Manual")) aov(mpg ~ Transmission, data=mtcars) # generates error head(mtcars[c("mpg", "Transmission")]) table(mtcars$Transmission) # here is the source of the error # sample debugging session with recover f <- function(x, y){ z <- x + y g(z) } g <- function(x){ z <- round(x) h(z) } h <- function(x){ set.seed(1234) z <- rnorm(x) print(z) } options(error=recover) f(2,3) f(2, -3) # enter interactive debugging # at this point