# R in Action (2nd ed): Chapter 7
# Basic statistics
# requires packages npmc, ggm, gmodels, vcd, Hmisc, 
#                   pastecs, psych, doBy, reshape to be installed
# install.packages(c("ggm", "gmodels", "vcd", "Hmisc", 
#                    "pastecs", "psych", "doBy", "reshape"))
#---------------------------------------------------------------------


mt <- mtcars[c("mpg", "hp", "wt", "am")]
head(mt)

# descriptive stats via summary
summary(mt)

# descriptive stats via sapply
descript <- function(x, na.omit=FALSE){
              if (na.omit)
                  x <- x[!is.na(x)]
                  m <- mean(x)
                  n <- length(x)
                  s <- sd(x)
                  skew <- sum((x-m)^3/s^3)/n
                  kurt <- sum((x-m)^4/s^4)/n - 3
                  return(c(n=n, mean=m, stdev=s, skew=skew, kurtosis=kurt))
}

sapply(mt, descript)

# descriptive stats via describe (Hmisc)
library(Hmisc)
describe(mt)

# descriptive stats via stat.desc
library(pastecs)
stat.desc(mt)

# descriptive stats via describe (psych)
library(psych)
describe(mt)


# descriptive stats by group with aggregate
aggregate(mt, by=list(am=mt$am), mean)
aggregate(mt, by=list(am=mt$am), sd)

# descriptive stats by group via by
by(mt, mt$am, function(x)(c(mean=mean(x), sd=sd(x))))

# descriptive stats by group via summaryBy
library(doBy)
summaryBy(mpg+hp+wt~am, data=mt, FUN=descript)

# descriptive stats by group via describe.by
library(psych)
describe.by(mt, mt$am)

# summary statistics by group via the reshape package
library(reshape)
dstats <- function(x)(c(n=length(x), mean=mean(x), sd=sd(x)))
dfm <- melt(mtcars, measure.vars=c("mpg", "hp", "wt"), 
            id.vars=c("am", "cyl"))
cast(dfm, am + cyl + variable ~ ., dstats)


# frequency tables
library(vcd)
head(Arthritis)
attach(Arthritis)

# one way table
 mytable <- table(Improved)
 mytable

# two way table
mytable <- table(Treatment, Improved)
mytable
margin.table(mytable,1)
prop.table(mytable)
prop.table(mytable, 1)
prop.table(mytable, 2)
detach(Arthritis)

# two way table using CrossTable
library(gmodels)
CrossTable(Arthritis$Treatment, Arthritis$Improved)

# three way table
attach(Arthritis)
mytable <- table(Treatment, Improved, Sex)  
ftable(mytable) 
margin.table(mytable, 1)
margin.table(mytable, 2)
ftable(prop.table(mytable, c(1,2)))

# tests of independence
mytable <- table(Treatment, Improved)
chisq.test(mytable)
mytable <- table(Improved, Sex)
chisq.test(mytable)

# Fisher's exact test
mytable <- table(Treatment, Improved)
fisher.test(mytable)

# Chochran-Mantel-Haenszel test
mytable <- table(Treatment, Improved, Sex)
mantelhaen.test(mytable)
 
# measures of association for a two-way table
mytable <- table(Treatment, Improved)
assocstats(mytable)
detach(Arthritis)

# converting a table into a flat file via table2flat
table2flat <- function(mytable) {
                df <- as.data.frame(mytable)
                rows  <- dim(df)[1]
                cols  <- dim(df)[2]
                x <- NULL
                for (i in 1:rows){
                     for (j in 1:df$Freq[i]){
                          row <- df[i,c(1:(cols-1))]
                          x <- rbind(x,row)
                      }
                }
                row.names(x)<-c(1:dim(x)[1])
                return(x)
}

# Using table2flat
treatment <- rep(c("Placebo", "Treated"), 3)
improved <- rep(c("None", "Some", "Marked"), each=2)
Freq <- c(29,13,7,7,7,21)
mytable <- data.frame(treatment, improved, Freq)
mydata <- table2flat(mytable)
head(mydata)

# correlations and covariances
states<- state.x77[,1:6]
cov(states)
cor(states)
x <- states[,c("Population", "Income", "Illiteracy", "HS Grad")]
y <- states[,c("Life Exp", "Murder")]
cor(x,y)

# partial correlations
library(ggm)
# partial correlation of population and murder rate, controlling
# for income, illiteracy rate, and HS graduation rate
pcor(c(1,5,2,3,6), cov(states))

# testing correlations for significance
cor.test(states[,3], states[,5])

# correlation matrix and tests of significance via corr.test
library(psych)
corr.test(states, use="complete")

# t test
library(MASS)
attach(UScrime)
t.test(Prob ~ So)

# dependent t test
sapply(UScrime[c("U1","U2")], function(x)(c(mean=mean(x),sd=sd(x))))
t.test(U1, U2, paired=TRUE)

# Wilcoxon two group comparison
by(Prob, So, median)
wilcox.test(Prob ~ So)
sapply(UScrime[c("U1", "U2")], median)
wilcox.test(U1,U2,paired=TRUE)

# Kruskal Wallis test
states <- as.data.frame(cbind(state.region, state.x77))
attach(states)
kruskal.test(Illiteracy ~ state.region)
detach(states)

# Nonparametric multiple comparisons
source("http://www.statmethods.net/RiA/wmc.txt")              
states <- data.frame(state.region, state.x77)
wmc(Illiteracy ~ state.region, data=states, method="holm")