#  Tuesday, November 18th, 2014

# R in Action (2nd ed): Chapter 5
# Advanced data management
#---------------------------------------------------


# Class Roster Dataset

# Ten students, one per entry; the score vectors below follow the same order.
Student <- c(
  "John Davis", "Angela Williams", "Bullwinkle Moose", "David Jones",
  "Janice Markhammer", "Cheryl Cushing", "Reuven Ytzrhak", "Greg Knox",
  "Joel England", "Mary Rayburn"
)

# Raw exam scores per subject.
math    <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
science <- c( 95,  99,  80,  82,  75,  85,  80,  95,  89,  86)
english <- c( 25,  22,  18,  15,  20,  28,  15,  30,  27,  18)

# Student stays a character column (not a factor).
roster <- data.frame(
  Student = Student,
  math = math,
  science = science,
  english = english,
  stringsAsFactors = FALSE
)

# Calculating the mean and standard deviation

x <- c(1, 2, 3, 4, 5, 6, 7, 8)

# Built-in results, for comparison with the hand computation below.
mean(x)
sd(x)

# By hand: mean = sum / n; SD = sqrt(corrected sum of squares / (n - 1)).
n <- length(x)
meanx <- sum(x) / n
css <- sum((x - meanx)^2)
sdx <- sqrt(css / (n - 1))

meanx
sdx

# Generating pseudo-random numbers from a uniform distribution

# Two unseeded calls: each draws five fresh values from Uniform(0, 1).
runif(5)
runif(5)
set.seed(1234)   # fix the RNG state so the next draw is reproducible

runif(5)

set.seed(1234)   # reset to the same seed ...
runif(5)         # ... so this call repeats the five values drawn just above

# Generating data from a multivariate normal distribution
# Simulate 500 observations from a trivariate normal distribution with the
# mean vector and covariance matrix given below, and keep them as a data frame.
library(MASS)

# Named `mu` rather than `mean` so that base::mean() is not masked in the
# global environment for the rest of the script.
mu <- c(230.7, 146.7, 3.6)

# Covariance matrix; it is symmetric, so the fill order does not matter.
sigma <- matrix(
  c(15360.8, 6721.2, -47.1,
     6721.2, 4700.9, -16.5,
      -47.1,  -16.5,   0.3),
  nrow = 3, ncol = 3
)

set.seed(1234)  # make the simulated sample reproducible
mydata <- as.data.frame(mvrnorm(500, mu, sigma))
names(mydata) <- c("y", "x1", "x2")

dim(mydata)
head(mydata, n = 10)

# Applying functions to data objects

# The same kind of function call works on a scalar, a vector, and a matrix.
a <- 5
sqrt(a)

b <- c(1.243, 5.654, 2.99)
round(b)

# Named `m` rather than `c` so that base::c() is not masked in the global
# environment.
m <- matrix(runif(12), nrow = 3)
m
log(m)    # applied to every element of the matrix
mean(m)   # a single mean over all 12 elements

#  Applying a function to the rows (columns) of a matrix

# 30 standard-normal draws arranged in 6 rows (hence 5 columns).
mydata <- matrix(rnorm(n = 30), nrow = 6)
mydata

apply(X = mydata, MARGIN = 1, FUN = mean)              # one mean per row
apply(X = mydata, MARGIN = 2, FUN = mean)              # one mean per column
apply(X = mydata, MARGIN = 2, FUN = mean, trim = 0.4)  # trimmed column means

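# A solution to the learning example: grade students on a combined
# standardized score and sort the roster by last name, then first name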
# Print with two significant digits from here on.
options(digits = 2)

# Standardize the three exam columns so they share a scale, then average
# each student's standardized values into one overall score.
z <- scale(roster[, 2:4])
score <- apply(z, 1, mean)
roster <- cbind(roster, score)

# Grade cutoffs at the 80th, 60th, 40th and 20th percentiles of the score.
y <- quantile(score, c(.8, .6, .4, .2))

# findInterval() counts how many of the (ascending) cutoffs each score
# reaches: 0 -> "F", 1 -> "D", 2 -> "C", 3 -> "B", 4 -> "A".
roster$grade <- c("F", "D", "C", "B", "A")[findInterval(score, rev(y)) + 1]

# Split "First Last" on the space and pull out each part.
name <- strsplit(roster$Student, " ")
firstname <- vapply(name, function(parts) parts[1], character(1))
lastname <- vapply(name, function(parts) parts[2], character(1))

# Replace the combined Student column with the two name columns, then sort
# by last name and break ties on first name.
roster <- cbind(firstname, lastname, roster[, -1])
roster <- roster[order(lastname, firstname), ]
roster


# A switch example

feelings <- c("sad", "afraid")

# Print the response that matches each feeling in turn.
for (i in feelings) {
  print(switch(
    i,
    happy  = "I am glad you are happy",
    afraid = "There is nothing to fear",
    sad    = "Cheer up",
    angry  = "Calm down now"
  ))
}
  

#  mystats() -  a user-written function for summary statistics

# Summarize a numeric vector by its center and spread.
#
# Arguments:
#   x          vector to summarize
#   parametric TRUE: mean and standard deviation;
#              FALSE: median and median absolute deviation
#   print      TRUE: also write the two statistics to the console
#
# Returns a list with elements `center` and `spread`.
mystats <- function(x, parametric=TRUE, print=FALSE) {
  if (parametric) {
    center <- mean(x)
    spread <- sd(x)
  } else {
    center <- median(x)
    spread <- mad(x)
  }

  # Both flags are scalars, so test them with plain nested `if`s rather than
  # the elementwise `&` operator; `parametric` is then checked only once.
  if (print) {
    if (parametric) {
      cat("Mean=", center, "\n", "SD=", spread, "\n")
    } else {
      cat("Median=", center, "\n", "MAD=", spread, "\n")
    }
  }

  list(center = center, spread = spread)
}

# trying it out

set.seed(1234)
x <- rnorm(n = 500)

# Default call: mean and SD, nothing written to the console.
y <- mystats(x)
# Median and MAD instead, also written to the console.
y <- mystats(x, print = TRUE, parametric = FALSE)

# mydate: a user-written function using switch

# Format the current date according to `type`.
#
#   "long"   weekday name, month name, day of month, four-digit year
#   "short"  month-day-year, two digits each
#
# Any other value writes "<type> is not a recognized type" to the console.
mydate <- function(type="long") {
  now <- Sys.time()
  long_fmt <- "%A %B %d %Y"
  short_fmt <- "%m-%d-%y"

  switch(type,
    long  = format(now, long_fmt),
    short = format(now, short_fmt),
    cat(type, "is not a recognized type\n")
  )
}

mydate(type = "long")    # long format, requested explicitly
mydate(type = "short")   # short format
mydate()                 # no argument: uses the default type, "long"
mydate(type = "medium")  # not a recognized type: prints a message

# Transposing a dataset

# First five cars and first four variables of the built-in mtcars data.
cars <- mtcars[seq_len(5), c("mpg", "cyl", "disp", "hp")]
cars
t(cars)   # variables become rows, cars become columns

# Aggregating data

# Mean of every mtcars variable within each cylinder-by-gear combination.
options(digits = 3)

# Take the grouping columns from mtcars explicitly instead of attach()-ing
# the data frame: nothing is left on the search path afterwards, and a global
# variable that happens to be called `cyl` or `gear` cannot silently be used
# in place of the column.
aggdata <- aggregate(
  mtcars,
  by = list(mtcars$cyl, mtcars$gear),
  FUN = mean,
  na.rm = TRUE
)
aggdata