# R in Action (2nd ed): Chapter 8
# Regression
# requires packages car, effects, gvlma, MASS, leaps, and bootstrap to be installed
# install.packages(c("car", "effects", "gvlma", "MASS", "leaps", "bootstrap"))
#----------------------------------------------------------

par(ask=TRUE)
opar <- par(no.readonly=TRUE)

# simple linear regression
fit <- lm(weight ~ height, data=women)
summary(fit)
women$weight
fitted(fit)
residuals(fit)

# scatter plot of height by weight
plot(women$height, women$weight,
     main="Women Age 30-39",
     xlab="Height (in inches)",
     ylab="Weight (in pounds)")
# add the line of best fit
abline(fit)

# Polynomial regression
fit2 <- lm(weight ~ height + I(height^2), data=women)
summary(fit2)
plot(women$height, women$weight,
     main="Women Age 30-39",
     xlab="Height (in inches)",
     ylab="Weight (in lbs)")
lines(women$height, fitted(fit2))

# scatterplot for women data
# (current versions of car use smoother.args in place of the
# older lty.smooth argument)
library(car)
scatterplot(weight ~ height, data=women,
            spread=FALSE, smoother.args=list(lty=2), pch=19,
            main="Women Age 30-39",
            xlab="Height (inches)",
            ylab="Weight (lbs.)")

# multiple linear regression
states <- as.data.frame(state.x77[, c("Murder", "Population",
                                      "Illiteracy", "Income", "Frost")])
cor(states)
library(car)
scatterplotMatrix(states, spread=FALSE, smoother.args=list(lty=2),
                  main="Scatterplot Matrix")

fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
summary(fit)
confint(fit)

# effects plot
fit <- lm(mpg ~ hp + wt + hp:wt, data=mtcars)
library(effects)
plot(effect("hp:wt", fit, xlevels=list(wt=c(2.2, 3.2, 4.2))), multiline=TRUE)

# simple regression diagnostics
fit <- lm(weight ~ height, data=women)
par(mfrow=c(2,2))
plot(fit)
par(opar)

newfit <- lm(weight ~ height + I(height^2), data=women)
par(mfrow=c(2,2))
plot(newfit)
par(opar)

# basic regression diagnostics for states data
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
par(mfrow=c(2,2))
plot(fit)
par(opar)

# Assessing normality
library(car)
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
qqPlot(fit, labels=row.names(states), id.method="identify",
       simulate=TRUE, main="Q-Q Plot")

# residplot function
residplot <- function(fit, nbreaks=10) {
    x <- rstudent(fit)
    hist(x, breaks=nbreaks, freq=FALSE,
         xlab="Studentized Residual",
         main="Distribution of Errors")
    xfit <- seq(min(x), max(x), length=40)
    lines(xfit, dnorm(xfit), col="blue", lwd=2)
    lines(density(x)$x, density(x)$y, col="red", lwd=2, lty=2)
    legend("topright",
           legend=c("Normal Curve", "Density Curve"),
           lty=1:2, col=c("blue", "red"), cex=.7)
}
residplot(fit)

# Durbin-Watson test for autocorrelated errors
durbinWatsonTest(fit)

# Component + Residual plots
crPlots(fit)

# Assessing homoscedasticity
ncvTest(fit)
spreadLevelPlot(fit)

# Global test of linear model assumptions
library(gvlma)
gvmodel <- gvlma(fit)
summary(gvmodel)

# Evaluating multicollinearity
vif(fit)
sqrt(vif(fit)) > 2   # problem?
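
# If the VIF check above flags collinearity, ridge regression is one common
# remedy. A minimal sketch using MASS::lm.ridge; the lambda grid below is an
# illustrative assumption, not a value from the book.
library(MASS)
ridgefit <- lm.ridge(Murder ~ Population + Illiteracy + Income + Frost,
                     data=states, lambda=seq(0, 20, 0.5))
select(ridgefit)   # HKB, L-W, and GCV suggestions for the ridge constant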
# Assessing outliers
outlierTest(fit)

# Index plot of hat values
p <- length(coefficients(fit))
n <- length(fitted(fit))
plot(hatvalues(fit), main="Index Plot of Hat Values")
abline(h=c(2,3)*p/n, col="red", lty=2)
identify(1:n, hatvalues(fit), row.names(states))

# Cook's D plot
# identify D values > 4/(n-k-1), where n is the sample size
# and k is the number of predictor variables
k <- length(fit$coefficients) - 1   # drop the intercept
cutoff <- 4/(nrow(states) - k - 1)
plot(fit, which=4, cook.levels=cutoff)
abline(h=cutoff, lty=2, col="red")

# Added-variable plots
# add id.method="identify" to interactively identify points
avPlots(fit, ask=FALSE)

# Influence plot
influencePlot(fit, id.method="identify", main="Influence Plot",
              sub="Circle size is proportional to Cook's distance")

# Box-Cox transformation to normality
summary(powerTransform(states$Murder))

# Box-Tidwell transformations to linearity
boxTidwell(Murder ~ Population + Illiteracy, data=states)

# Comparing nested models using the anova function
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
fit2 <- lm(Murder ~ Population + Illiteracy, data=states)
anova(fit2, fit1)

# Comparing models with the Akaike Information Criterion
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
fit2 <- lm(Murder ~ Population + Illiteracy, data=states)
AIC(fit1, fit2)

# Backward stepwise selection
library(MASS)
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
stepAIC(fit1, direction="backward")

# All subsets regression
library(leaps)
leaps <- regsubsets(Murder ~ Population + Illiteracy + Income + Frost,
                    data=states, nbest=4)
plot(leaps, scale="adjr2",
     main="Best model subsets based on Adjusted R2")
library(car)
subsets(leaps, statistic="cp",
        legend=TRUE,   # legend is placed interactively
        main="Cp Plot for All Subsets Regression")
abline(1, 1, lty=2, col="red")

# Function for k-fold cross-validated R-square
shrinkage <- function(fit, k=10) {
    require(bootstrap)

    # define the fit and prediction functions
    theta.fit <- function(x, y) { lsfit(x, y) }
    theta.predict <- function(fit, x) { cbind(1, x) %*% fit$coef }

    x <- fit$model[, 2:ncol(fit$model)]   # matrix of predictors
    y <- fit$model[, 1]                   # vector of response values

    results <- crossval(x, y, theta.fit, theta.predict, ngroup=k)
    r2 <- cor(y, fit$fitted.values)^2     # raw R-square
    r2cv <- cor(y, results$cv.fit)^2      # cross-validated R-square
    cat("Original R-square =", r2, "\n")
    cat(k, "Fold Cross-Validated R-square =", r2cv, "\n")
    cat("Change =", r2 - r2cv, "\n")
}

# using it
fit <- lm(Murder ~ Population + Income + Illiteracy + Frost, data=states)
shrinkage(fit)
fit2 <- lm(Murder ~ Population + Illiteracy, data=states)
shrinkage(fit2)

# Calculating standardized regression coefficients
zstates <- as.data.frame(scale(states))
zfit <- lm(Murder ~ Population + Income + Illiteracy + Frost, data=zstates)
coef(zfit)
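
# Cross-check (a minimal sketch, not from the book): standardized
# coefficients can also be recovered from the unstandardized fit via
# b_j * s_x / s_y, so these values should match coef(zfit) above
# (apart from the intercept, which is ~0 after standardizing).
b <- coef(fit)[-1]                    # unstandardized slopes
sx <- sapply(states[names(b)], sd)    # predictor standard deviations
sy <- sd(states$Murder)               # criterion standard deviation
b * sx / sy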
########################################################################
# The relweights function determines the relative importance of each  #
# independent variable to the dependent variable in an OLS regression.#
# The code is adapted from an SPSS program generously provided by     #
# Dr. Johnson.                                                        #
#                                                                     #
# See Johnson (2000, Multivariate Behavioral Research, 35, 1-19) for  #
# an explanation of how the relative weights are derived.             #
########################################################################
relweights <- function(fit, ...) {
    R <- cor(fit$model)        # correlation matrix with criterion in 1st column
    nvar <- ncol(R)            # number of variables
    rxx <- R[2:nvar, 2:nvar]   # correlations among predictors
    rxy <- R[2:nvar, 1]        # correlations between predictors and criterion
    svd <- eigen(rxx)          # eigen decomposition of the predictor correlations
    evec <- svd$vectors        # eigenvectors
    ev <- svd$values           # eigenvalues
    delta <- diag(sqrt(ev))    # diagonal matrix of square roots of eigenvalues

    # correlations between original predictors and new orthogonal variables
    lambda <- evec %*% delta %*% t(evec)
    lambdasq <- lambda^2       # square of the correlations

    # regression coefficients of Y on the orthogonal variables
    beta <- solve(lambda) %*% rxy
    rsquare <- colSums(beta^2)            # model R-square
    rawwgt <- lambdasq %*% beta^2         # raw relative weights
    import <- (rawwgt / rsquare) * 100    # rescale to % of R-square
    lbls <- names(fit$model[2:nvar])      # predictor labels
    rownames(import) <- lbls
    colnames(import) <- "Weights"

    # plot results
    barplot(t(import), names.arg=lbls,
            ylab="% of R-Square",
            xlab="Predictor Variables",
            main="Relative Importance of Predictor Variables",
            sub=paste("R-Square = ", round(rsquare, digits=3)),
            ...)
    return(import)
}

# using it
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
relweights(fit, col="lightgrey")
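
# Sanity check (a minimal sketch, not from the book): the rescaled weights
# should sum to roughly 100% of R-square, and the R-square shown in the plot
# subtitle should match the one reported by summary(fit).
imp <- relweights(fit, col="lightgrey")
sum(imp)                  # should be approximately 100
summary(fit)$r.squared    # compare with the subtitle of the bar plot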