# R in Action (2nd ed): Chapter 8
# Regression
# requires packages car, effects, gvlma, MASS, leaps, and bootstrap to be installed
# install.packages(c("car", "effects", "gvlma", "MASS", "leaps", "bootstrap"))
#----------------------------------------------------------

par(ask=TRUE)
opar <- par(no.readonly=TRUE)

# simple linear regression
fit <- lm(weight ~ height, data=women)
summary(fit)
women$weight
fitted(fit)
residuals(fit)

# scatter plot of height by weight
plot(women$height, women$weight,
     main="Women Age 30-39",
     xlab="Height (in inches)",
     ylab="Weight (in pounds)")
# add the line of best fit
abline(fit)

# Polynomial regression
fit2 <- lm(weight ~ height + I(height^2), data=women)
summary(fit2)
plot(women$height, women$weight,
     main="Women Age 30-39",
     xlab="Height (in inches)",
     ylab="Weight (in lbs)")
lines(women$height, fitted(fit2))

# scatterplot for women data
# (current versions of car use smoother.args in place of the
# older lty.smooth argument)
library(car)
scatterplot(weight ~ height, data=women,
            spread=FALSE, smoother.args=list(lty=2), pch=19,
            main="Women Age 30-39",
            xlab="Height (inches)",
            ylab="Weight (lbs.)")

# multiple linear regression
states <- as.data.frame(state.x77[, c("Murder", "Population",
                                      "Illiteracy", "Income", "Frost")])
cor(states)
library(car)
scatterplotMatrix(states, spread=FALSE, smoother.args=list(lty=2),
                  main="Scatterplot Matrix")

fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
summary(fit)
confint(fit)

# effects plot
fit <- lm(mpg ~ hp + wt + hp:wt, data=mtcars)
library(effects)
plot(effect("hp:wt", fit, xlevels=list(wt=c(2.2, 3.2, 4.2))), multiline=TRUE)

# simple regression diagnostics
fit <- lm(weight ~ height, data=women)
par(mfrow=c(2,2))
plot(fit)
par(opar)

newfit <- lm(weight ~ height + I(height^2), data=women)
par(mfrow=c(2,2))
plot(newfit)
par(opar)

# basic regression diagnostics for states data
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
par(mfrow=c(2,2))
plot(fit)
par(opar)

# Assessing normality
library(car)
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
qqPlot(fit, labels=row.names(states), id.method="identify",
       simulate=TRUE, main="Q-Q Plot")

# residplot function
residplot <- function(fit, nbreaks=10) {
    x <- rstudent(fit)
    hist(x, breaks=nbreaks, freq=FALSE,
         xlab="Studentized Residual",
         main="Distribution of Errors")
    xfit <- seq(min(x), max(x), length=40)
    lines(xfit, dnorm(xfit), col="blue", lwd=2)
    lines(density(x)$x, density(x)$y, col="red", lwd=2, lty=2)
    legend("topright",
           legend=c("Normal Curve", "Density Curve"),
           lty=1:2, col=c("blue", "red"), cex=.7)
}
residplot(fit)

# Durbin-Watson test for autocorrelated errors
durbinWatsonTest(fit)

# Component + Residual plots
crPlots(fit)

# Assessing homoscedasticity
ncvTest(fit)
spreadLevelPlot(fit)

# Global test of linear model assumptions
library(gvlma)
gvmodel <- gvlma(fit)
summary(gvmodel)

# Evaluating multicollinearity
vif(fit)
sqrt(vif(fit)) > 2   # problem?
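
# If the VIF check above flags collinearity, ridge regression is one common
# remedy. A minimal sketch using MASS::lm.ridge; the lambda grid below is an
# illustrative assumption, not a value from the book.
library(MASS)
ridgefit <- lm.ridge(Murder ~ Population + Illiteracy + Income + Frost,
                     data=states, lambda=seq(0, 20, 0.5))
select(ridgefit)   # HKB, L-W, and GCV suggestions for the ridge constant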
# Assessing outliers
outlierTest(fit)

# Index plot of hat values
p <- length(coefficients(fit))
n <- length(fitted(fit))
plot(hatvalues(fit), main="Index Plot of Hat Values")
abline(h=c(2,3)*p/n, col="red", lty=2)
identify(1:n, hatvalues(fit), row.names(states))

# Cook's D plot
# identify D values > 4/(n-k-1), where n is the sample size
# and k is the number of predictor variables
k <- length(fit$coefficients) - 1   # drop the intercept
cutoff <- 4/(nrow(states) - k - 1)
plot(fit, which=4, cook.levels=cutoff)
abline(h=cutoff, lty=2, col="red")

# Added-variable plots
# add id.method="identify" to interactively identify points
avPlots(fit, ask=FALSE)

# Influence plot
influencePlot(fit, id.method="identify", main="Influence Plot",
              sub="Circle size is proportional to Cook's distance")

# Box-Cox transformation to normality
summary(powerTransform(states$Murder))

# Box-Tidwell transformations to linearity
boxTidwell(Murder ~ Population + Illiteracy, data=states)

# Comparing nested models using the anova function
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
fit2 <- lm(Murder ~ Population + Illiteracy, data=states)
anova(fit2, fit1)

# Comparing models with the Akaike Information Criterion
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
fit2 <- lm(Murder ~ Population + Illiteracy, data=states)
AIC(fit1, fit2)

# Backward stepwise selection
library(MASS)
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
stepAIC(fit1, direction="backward")

# All subsets regression
library(leaps)
leaps <- regsubsets(Murder ~ Population + Illiteracy + Income + Frost,
                    data=states, nbest=4)
plot(leaps, scale="adjr2",
     main="Best model subsets based on Adjusted R2")
library(car)
subsets(leaps, statistic="cp",
        legend=TRUE,   # legend is placed interactively
        main="Cp Plot for All Subsets Regression")
abline(1, 1, lty=2, col="red")

# Function for k-fold cross-validated R-square
shrinkage <- function(fit, k=10) {
    require(bootstrap)

    # define the fit and prediction functions
    theta.fit <- function(x, y) { lsfit(x, y) }
    theta.predict <- function(fit, x) { cbind(1, x) %*% fit$coef }

    x <- fit$model[, 2:ncol(fit$model)]   # matrix of predictors
    y <- fit$model[, 1]                   # vector of response values

    results <- crossval(x, y, theta.fit, theta.predict, ngroup=k)
    r2 <- cor(y, fit$fitted.values)^2     # raw R-square
    r2cv <- cor(y, results$cv.fit)^2      # cross-validated R-square
    cat("Original R-square =", r2, "\n")
    cat(k, "Fold Cross-Validated R-square =", r2cv, "\n")
    cat("Change =", r2 - r2cv, "\n")
}

# using it
fit <- lm(Murder ~ Population + Income + Illiteracy + Frost, data=states)
shrinkage(fit)
fit2 <- lm(Murder ~ Population + Illiteracy, data=states)
shrinkage(fit2)

# Calculating standardized regression coefficients
zstates <- as.data.frame(scale(states))
zfit <- lm(Murder ~ Population + Income + Illiteracy + Frost, data=zstates)
coef(zfit)
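
# Cross-check (a minimal sketch, not from the book): standardized
# coefficients can also be recovered from the unstandardized fit via
# b_j * s_x / s_y, so these values should match coef(zfit) above
# (apart from the intercept, which is ~0 after standardizing).
b <- coef(fit)[-1]                    # unstandardized slopes
sx <- sapply(states[names(b)], sd)    # predictor standard deviations
sy <- sd(states$Murder)               # criterion standard deviation
b * sx / sy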
########################################################################
# The relweights function determines the relative importance of each  #
# independent variable to the dependent variable in an OLS regression.#
# The code is adapted from an SPSS program generously provided by     #
# Dr. Johnson.                                                        #
#                                                                     #
# See Johnson (2000, Multivariate Behavioral Research, 35, 1-19) for  #
# an explanation of how the relative weights are derived.             #
########################################################################
relweights <- function(fit, ...) {
    R <- cor(fit$model)        # correlation matrix with criterion in 1st column
    nvar <- ncol(R)            # number of variables
    rxx <- R[2:nvar, 2:nvar]   # correlations among predictors
    rxy <- R[2:nvar, 1]        # correlations between predictors and criterion
    svd <- eigen(rxx)          # eigen decomposition of the predictor correlations
    evec <- svd$vectors        # eigenvectors
    ev <- svd$values           # eigenvalues
    delta <- diag(sqrt(ev))    # diagonal matrix of square roots of eigenvalues

    # correlations between original predictors and new orthogonal variables
    lambda <- evec %*% delta %*% t(evec)
    lambdasq <- lambda^2       # square of the correlations

    # regression coefficients of Y on the orthogonal variables
    beta <- solve(lambda) %*% rxy
    rsquare <- colSums(beta^2)            # model R-square
    rawwgt <- lambdasq %*% beta^2         # raw relative weights
    import <- (rawwgt / rsquare) * 100    # rescale to % of R-square
    lbls <- names(fit$model[2:nvar])      # predictor labels
    rownames(import) <- lbls
    colnames(import) <- "Weights"

    # plot results
    barplot(t(import), names.arg=lbls,
            ylab="% of R-Square",
            xlab="Predictor Variables",
            main="Relative Importance of Predictor Variables",
            sub=paste("R-Square = ", round(rsquare, digits=3)),
            ...)
    return(import)
}

# using it
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
relweights(fit, col="lightgrey")
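
# Sanity check (a minimal sketch, not from the book): the rescaled weights
# should sum to roughly 100% of R-square, and the R-square shown in the plot
# subtitle should match the one reported by summary(fit).
imp <- relweights(fit, col="lightgrey")
sum(imp)                  # should be approximately 100
summary(fit)$r.squared    # compare with the subtitle of the bar plot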