# R in Action (2nd ed): Chapter 17 # Classification # requires packaged rpart, party, randomForest, kernlab, rattle # install.packages(c("rpart", "party", "randomForest", "kernlab") # install.packages(rattle, dependencies = c("Depends", "Suggests")) #------------------------------------------------------------------- par(ask=TRUE) ## Prepare Breast Cancer Data loc <- "http://archive.ics.uci.edu/ml/machine-learning-databases/" ds <- "breast-cancer-wisconsin/breast-cancer-wisconsin.data" url <- paste(loc, ds, sep="") breast <- read.table(url, sep=",", header=FALSE, na.strings="?") names(breast) <- c("ID", "clumpThickness", "sizeUniformity", "shapeUniformity", "maginalAdhesion", "singleEpithelialCellSize", "bareNuclei", "blandChromatin", "normalNucleoli", "mitosis", "class") df <- breast[-1] df$class <- factor(df$class, levels=c(2,4), labels=c("benign", "malignant")) set.seed(1234) train <- sample(nrow(df), 0.7*nrow(df)) df.train <- df[train,] df.validate <- df[-train,] table(df.train$class) table(df.validate$class) ## Logistic Regression fit.logit <- glm(class~., data=df.train, family=binomial()) summary(fit.logit) prob <- predict(fit.logit, df.validate, type="response") logit.pred <- factor(prob > .5, levels=c(FALSE, TRUE), labels=c("benign", "malignant")) logit.perf <- table(df.validate$class, logit.pred, dnn=c("Actual", "Predicted")) logit.perf ## Decision Trees # Decesion tree with rpart library(rpart) set.seed(1234) dtree <- rpart(class ~ ., data=df.train, method="class", parms=list(split="information")) dtree$cptable plotcp(dtree) dtree.pruned <- prune(dtree, cp=.0125) library(rpart.plot) prp(dtree.pruned, type = 2, extra = 104, fallen.leaves = TRUE, main="Decision Tree") dtree.pred <- predict(dtree.pruned, df.validate, type="class") dtree.perf <- table(df.validate$class, dtree.pred, dnn=c("Actual", "Predicted")) dtree.perf # Conditional inference tree with ctree library(party) fit.ctree <- ctree(class~., data=df.train) plot(fit.ctree, main="Conditional Inference Tree") ctree.pred <- predict(fit.ctree, df.validate, type="response") ctree.perf <- table(df.validate$class, ctree.pred, dnn=c("Actual", "Predicted")) ctree.perf ## Random Forests library(randomForest) set.seed(1234) fit.forest <- randomForest(class~., data=df.train, na.action=na.roughfix, importance=TRUE) fit.forest importance(fit.forest, type=2) forest.pred <- predict(fit.forest, df.validate) forest.perf <- table(df.validate$class, forest.pred, dnn=c("Actual", "Predicted")) forest.perf ## Support Vector Machines library(kernlab) set.seed(1234) fit.svm <- ksvm(class~., data=df.train) fit.svm svm.pred <- predict(fit.svm, na.omit(df.validate)) svm.perf <- table(na.omit(df.validate)$class, svm.pred, dnn=c("Actual", "Predicted")) svm.perf ## Choosing a Best Predictive Solution # Function for assessing binary classification accuracy performance <- function(table, n=2){ if(!all(dim(table) == c(2,2))) stop("Must be a 2 x 2 table") tn = table[1,1] fp = table[1,2] fn = table[2,1] tp = table[2,2] sensitivity = tp/(tp+fn) specificity = tn/(tn+fp) ppp = tp/(tp+fp) npp = tn/(tn+fn) hitrate = (tp+tn)/(tp+tn+fp+fn) result <- paste("Sensitivity = ", round(sensitivity, n), "\nSpecificity = ", round(specificity, n), "\nPositive Predictive Value = ", round(ppp, n), "\nNegative Predictive Value = ", round(npp, n), "\nHit Rate = ", round(hitrate, n), sep="") cat(result) } # Performance of breast cancer data classifiers performance(dtree.perf) performance(ctree.perf) performance(forest.perf) performance(svm.perf) ## Using Rattle Package for Data Mining loc <- "http://archive.ics.uci.edu/ml/machine-learning-databases/" ds <- "pima-indians-diabetes/pima-indians-diabetes.data" url <- paste(loc, ds, sep="") diabetes <- read.table(url, sep=",", header=FALSE) names(diabetes) <- c("npregant", "plasma", "bp", "triceps", "insulin", "bmi", "pedigree", "age", "class") diabetes$class <- factor(diabetes$class, levels=c(0,1), labels=c("normal", "diabetic")) library(rattle) rattle()