# Simple linear regression walkthrough on the built-in `cars` dataset
# (stopping distance ~ speed), following:
# www.machinelearningplus.com/machine-learning/complete-introduction-linear-regression-r/

# Explore the data ----
head(cars)
tail(cars)
scatter.smooth(x = cars$speed, y = cars$dist, main = "Dist ~ Speed")
# ?scatter.smooth

sort(cars$speed)
speedsSorted <- sort(cars$speed)
distsSorted <- sort(cars$dist)
# ?cars

# Note the decrease in variability here: sorting each column independently
# re-pairs the observations, so the scatter around the trend is artificial.
scatter.smooth(x = speedsSorted, y = distsSorted, main = "Dist ~ Speed")
cars$dist

# Outliers ----
par(mfrow = c(1, 2))  # divide graph area into two columns
# toString() collapses multiple outlier values into a single subtitle string
# (a bare paste() would return a character vector when there are several).
boxplot(cars$speed,
        main = "Speed",
        sub = paste("Outlier rows:", toString(boxplot.stats(cars$speed)$out)))
boxplot(cars$dist,
        main = "Distance",
        sub = paste("Outlier rows:", toString(boxplot.stats(cars$dist)$out)))
sort(cars$dist)

# Density and skewness ----
# Install e1071 only when missing; an unconditional install.packages()
# re-downloads the package on every run.
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}
library(e1071)
plot(density(cars$speed),
     main = "Density Plot: Speed", ylab = "Frequency",
     sub = paste("Skewness:", round(e1071::skewness(cars$speed), 2)))
# ?density
# ?skewness
polygon(density(cars$speed), col = "red")
plot(density(cars$dist),
     main = "Density Plot: Distance", ylab = "Frequency",
     sub = paste("Skewness:", round(e1071::skewness(cars$dist), 2)))
polygon(density(cars$dist), col = "red")

# Correlation and the fitted model ----
cor(cars$speed, cars$dist)
cor(speedsSorted, distsSorted)  # inflated: columns were sorted independently

linearMod <- lm(dist ~ speed, data = cars)
names(linearMod)
linearMod$coefficients
m <- linearMod$coefficients[2]  # slope
b <- linearMod$coefficients[1]  # intercept

# Fitted values computed by hand from the coefficients.
distHat <- m * cars$speed + b
cars2 <- cars
cars2$yHat <- m * cars2$speed + b

par(mfrow = c(1, 1))
plot(cars$speed, distHat)
print(linearMod)
head(cars2)
summary(linearMod)

# Refit without row 49 and compare the two fits.
cars3 <- cars2[-c(49), ]
lmcars3 <- lm(dist ~ speed, data = cars3)
m3 <- lmcars3$coefficients[2]
b3 <- lmcars3$coefficients[1]
cars3$yHat <- m3 * cars3$speed + b3
head(cars3)
head(cars2)
tail(cars3)
tail(cars2)

# Significance statistics, computed by hand ----
modelSummary <- summary(linearMod)
modelCoeffs <- modelSummary$coefficients
modelCoeffs
modelCoeffs[2, 1]
modelCoeffs[2, 2]
beta.estimate <- modelCoeffs[2, 1]  # slope estimate
std.error <- modelCoeffs[2, 2]      # its standard error
t_value <- beta.estimate / std.error
# ?pt
# Two-sided p-value for the slope; residual df = n - number of parameters.
# (An earlier leftover line passed `df - nrow(cars) - ncol(cars)` as a bare
# expression, which errors because `df` resolves to the stats::df function.)
p_value <- 2 * pt(-abs(t_value), df = nrow(cars) - ncol(cars))

names(linearMod)
names(summary(linearMod))
summary(linearMod)$fstatistic
f_statistic <- summary(linearMod)$fstatistic  # c(value, numdf, dendf)
f <- f_statistic
# Spell out lower.tail = FALSE instead of relying on partial matching of
# the abbreviated `lower = FALSE`.
model_p <- pf(f[1], f[2], f[3], lower.tail = FALSE)

cor(cars$speed, cars$dist)  # correlation between speed and distance
model_p
t_value
p_value
AIC(linearMod)
BIC(linearMod)

# Train/test split and out-of-sample check ----
set.seed(100)  # setting seed to reproduce results of random sampling
trainingRowIndex <- sample(seq_len(nrow(cars)), 0.8 * nrow(cars))  # training rows
trainingData <- cars[trainingRowIndex, ]  # model training data
head(trainingData)
testData <- cars[-trainingRowIndex, ]  # test data
testData
nrow(cars)

lmMod <- lm(dist ~ speed, data = trainingData)  # build the model on training data
distPred <- predict(lmMod, testData)            # predict distance for held-out rows
distPred
summary(lmMod)  # model summary
AIC(lmMod)      # Akaike information criterion

# Actuals vs. predictions; their correlation is a simple accuracy measure.
# Build the data frame directly rather than via cbind(), which coerces.
actuals_preds <- data.frame(actuals = testData$dist, predicteds = distPred)
correlation_accuracy <- cor(actuals_preds)  # ~82.7%
correlation_accuracy
head(actuals_preds)

# Next dataset (note: plain-http fetch from an external server).
churn <- read.csv("http://www.cs.uni.edu/~jacobson/SCL/R/data/adult.csv")