# Simple linear regression walkthrough on the built-in `cars` dataset
# (stopping distance ~ speed), following:
# www.machinelearningplus.com/machine-learning/complete-introduction-linear-regression-r/

# Explore the data ----
head(cars)
tail(cars)
scatter.smooth(x = cars$speed, y = cars$dist, main = "Dist ~ Speed")
# ?scatter.smooth

sort(cars$speed)
speedsSorted <- sort(cars$speed)
distsSorted <- sort(cars$dist)
# ?cars

# Note the decrease in variability here: sorting each column independently
# re-pairs the observations, so the scatter around the trend is artificial.
scatter.smooth(x = speedsSorted, y = distsSorted, main = "Dist ~ Speed")
cars$dist

# Outliers ----
par(mfrow = c(1, 2))  # divide graph area into two columns
# toString() collapses multiple outlier values into a single subtitle string
# (a bare paste() would return a character vector when there are several).
boxplot(cars$speed,
        main = "Speed",
        sub = paste("Outlier rows:", toString(boxplot.stats(cars$speed)$out)))
boxplot(cars$dist,
        main = "Distance",
        sub = paste("Outlier rows:", toString(boxplot.stats(cars$dist)$out)))
sort(cars$dist)

# Density and skewness ----
# Install e1071 only when missing; an unconditional install.packages()
# re-downloads the package on every run.
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}
library(e1071)
plot(density(cars$speed),
     main = "Density Plot: Speed", ylab = "Frequency",
     sub = paste("Skewness:", round(e1071::skewness(cars$speed), 2)))
# ?density
# ?skewness
polygon(density(cars$speed), col = "red")
plot(density(cars$dist),
     main = "Density Plot: Distance", ylab = "Frequency",
     sub = paste("Skewness:", round(e1071::skewness(cars$dist), 2)))
polygon(density(cars$dist), col = "red")

# Correlation and the fitted model ----
cor(cars$speed, cars$dist)
cor(speedsSorted, distsSorted)  # inflated: columns were sorted independently

linearMod <- lm(dist ~ speed, data = cars)
names(linearMod)
linearMod$coefficients
m <- linearMod$coefficients[2]  # slope
b <- linearMod$coefficients[1]  # intercept

# Fitted values computed by hand from the coefficients.
distHat <- m * cars$speed + b
cars2 <- cars
cars2$yHat <- m * cars2$speed + b

par(mfrow = c(1, 1))
plot(cars$speed, distHat)
print(linearMod)
head(cars2)
summary(linearMod)

# Refit without row 49 and compare the two fits.
cars3 <- cars2[-c(49), ]
lmcars3 <- lm(dist ~ speed, data = cars3)
m3 <- lmcars3$coefficients[2]
b3 <- lmcars3$coefficients[1]
cars3$yHat <- m3 * cars3$speed + b3
head(cars3)
head(cars2)
tail(cars3)
tail(cars2)

# Significance statistics, computed by hand ----
modelSummary <- summary(linearMod)
modelCoeffs <- modelSummary$coefficients
modelCoeffs
modelCoeffs[2, 1]
modelCoeffs[2, 2]
beta.estimate <- modelCoeffs[2, 1]  # slope estimate
std.error <- modelCoeffs[2, 2]      # its standard error
t_value <- beta.estimate / std.error
# ?pt
# Two-sided p-value for the slope; residual df = n - number of parameters.
# (An earlier leftover line passed `df - nrow(cars) - ncol(cars)` as a bare
# expression, which errors because `df` resolves to the stats::df function.)
p_value <- 2 * pt(-abs(t_value), df = nrow(cars) - ncol(cars))

names(linearMod)
names(summary(linearMod))
summary(linearMod)$fstatistic
f_statistic <- summary(linearMod)$fstatistic  # c(value, numdf, dendf)
f <- f_statistic
# Spell out lower.tail = FALSE instead of relying on partial matching of
# the abbreviated `lower = FALSE`.
model_p <- pf(f[1], f[2], f[3], lower.tail = FALSE)

cor(cars$speed, cars$dist)  # correlation between speed and distance
model_p
t_value
p_value
AIC(linearMod)
BIC(linearMod)

# Train/test split and out-of-sample check ----
set.seed(100)  # setting seed to reproduce results of random sampling
trainingRowIndex <- sample(seq_len(nrow(cars)), 0.8 * nrow(cars))  # training rows
trainingData <- cars[trainingRowIndex, ]  # model training data
head(trainingData)
testData <- cars[-trainingRowIndex, ]  # test data
testData
nrow(cars)

lmMod <- lm(dist ~ speed, data = trainingData)  # build the model on training data
distPred <- predict(lmMod, testData)            # predict distance for held-out rows
distPred
summary(lmMod)  # model summary
AIC(lmMod)      # Akaike information criterion

# Actuals vs. predictions; their correlation is a simple accuracy measure.
# Build the data frame directly rather than via cbind(), which coerces.
actuals_preds <- data.frame(actuals = testData$dist, predicteds = distPred)
correlation_accuracy <- cor(actuals_preds)  # ~82.7%
correlation_accuracy
head(actuals_preds)

# Next dataset (note: plain-http fetch from an external server).
churn <- read.csv("http://www.cs.uni.edu/~jacobson/SCL/R/data/adult.csv")