#------------------------------------------------------#
# R in Action (2nd ed): Chapter 4                      #
# Basic data management                                #
# requires that the reshape package has been installed #
# install.packages("reshape")                          #
#------------------------------------------------------#

# leadership dataset: five managers, one row each, columns q1-q5 plus
# date, gender and age.
manager <- c(1, 2, 3, 4, 5)
date <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
gender <- c("M", "F", "F", "M", "F")
age <- c(32, 45, 25, 39, 99)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)
leadership <- data.frame(manager, date, gender, age, q1, q2, q3, q4, q5,
                         stringsAsFactors = FALSE)

# The data frame is deliberately not attach()ed. The standalone vectors above
# would mask every attached column, and neither they nor an attached copy
# reflect later changes to leadership. Columns are therefore always read from
# the data frame itself, via leadership$... or with().

# Recoding variables
# NOTE(review): age 99 is recoded to missing further down, but at this point it
# still counts as a real age, so manager 5 is labelled "Elder".
leadership$agecat <- NA_character_
leadership$agecat[leadership$age > 75] <- "Elder"
leadership$agecat[leadership$age > 45 & leadership$age <= 75] <- "Middle Aged"
leadership$agecat[leadership$age <= 45] <- "Young"

# Renaming variables with the reshape package
# The renamed copy is only printed; leadership keeps its original column
# names, which the rest of the script relies on (e.g. leadership$date).
library(reshape)
rename(leadership, c(manager = "managerID", date = "testDate"))

# Applying the is.na() function
# q1-q5 sit in columns 5 to 9; agecat was appended as column 10.
is.na(leadership[, 5:9])

# recode 99 to missing for the variable age
# which() keeps the assignment safe even if age already contains NA.
leadership$age[which(leadership$age == 99)] <- NA
leadership

# Using na.omit() to delete incomplete observations
na.omit(leadership)

# Converting character values to dates
strDates <- c("01/05/1965", "08/16/1975")
dates <- as.Date(strDates, "%m/%d/%Y")

mydates <- as.Date(c("2007-06-22", "2004-02-13"))
mydates

# Calculations with dates
startdate <- as.Date("2004-02-13")
enddate <- as.Date("2009-06-22")
enddate - startdate

# Date functions and formatted printing
today <- Sys.Date()
format(today, format = "%B %d %Y")
dob <- as.Date("1956-10-10")
format(dob, format = "%A")

# Converting from one data type to another
a <- c(1, 2, 3)
a
is.numeric(a)
is.vector(a)
a <- as.character(a)
a
is.numeric(a)
is.vector(a)
is.character(a)

# Sorting a dataset
# Sort keys come from leadership so the recoded age (NA, not 99) is used.
leadership[order(leadership$age), ]
leadership[with(leadership, order(gender, age)), ]
leadership[with(leadership, order(gender, -age)), ]

# Selecting variables
# Three equivalent ways of picking q1-q5: by position, by name, by built name.
leadership[, 5:9]

myvars <- c("q1", "q2", "q3", "q4", "q5")
leadership[myvars]

myvars <- paste0("q", 1:5)
leadership[myvars]

# Dropping variables
# Two equivalent ways of dropping q3 and q4 (columns 7 and 8).
myvars <- names(leadership) %in% c("q3", "q4")
leadership[!myvars]

leadership[c(-7, -8)]

# Selecting observations
leadership[1:5, ]

leadership[which(leadership$gender == "M" & leadership$age > 30), ]

# Same selection, with with() supplying the column names.
leadership[with(leadership, which(gender == "M" & age > 30)), ]

# Selecting observations based on dates
leadership$date <- as.Date(leadership$date, "%m/%d/%y")
startdate <- as.Date("2009-01-01")
enddate <- as.Date("2009-10-31")
leadership[leadership$date >= startdate & leadership$date <= enddate, ]

# Using the subset() function
subset(leadership, age >= 35 | age < 24, select = c(q1, q2, q3, q4))
subset(leadership, gender == "M" & age > 25, select = gender:q4)