# R in Action (2nd ed): Chapter 7 # Basic statistics # requires packages npmc, ggm, gmodels, vcd, Hmisc, # pastecs, psych, doBy, reshape to be installed # install.packages(c("ggm", "gmodels", "vcd", "Hmisc", # "pastecs", "psych", "doBy", "reshape")) #--------------------------------------------------------------------- mt <- mtcars[c("mpg", "hp", "wt", "am")] head(mt) # descriptive stats via summary summary(mt) # descriptive stats via sapply descript <- function(x, na.omit=FALSE){ if (na.omit) x <- x[!is.na(x)] m <- mean(x) n <- length(x) s <- sd(x) skew <- sum((x-m)^3/s^3)/n kurt <- sum((x-m)^4/s^4)/n - 3 return(c(n=n, mean=m, stdev=s, skew=skew, kurtosis=kurt)) } sapply(mt, descript) # descriptive stats via describe (Hmisc) library(Hmisc) describe(mt) # descriptive stats via stat.desc library(pastecs) stat.desc(mt) # descriptive stats via describe (psych) library(psych) describe(mt) # descriptive stats by group with aggregate aggregate(mt, by=list(am=mt$am), mean) aggregate(mt, by=list(am=mt$am), sd) # descriptive stats by group via by by(mt, mt$am, function(x)(c(mean=mean(x), sd=sd(x)))) # descriptive stats by group via summaryBy library(doBy) summaryBy(mpg+hp+wt~am, data=mt, FUN=descript) # descriptive stats by group via describe.by library(psych) describe.by(mt, mt$am) # summary statistics by group via the reshape package library(reshape) dstats <- function(x)(c(n=length(x), mean=mean(x), sd=sd(x))) dfm <- melt(mtcars, measure.vars=c("mpg", "hp", "wt"), id.vars=c("am", "cyl")) cast(dfm, am + cyl + variable ~ ., dstats) # frequency tables library(vcd) head(Arthritis) attach(Arthritis) # one way table mytable <- table(Improved) mytable # two way table mytable <- table(Treatment, Improved) mytable margin.table(mytable,1) prop.table(mytable) prop.table(mytable, 1) prop.table(mytable, 2) detach(Arthritis) # two way table using CrossTable library(gmodels) CrossTable(Arthritis$Treatment, Arthritis$Improved) # three way table attach(Arthritis) mytable <- table(Treatment, Improved, Sex) ftable(mytable) margin.table(mytable, 1) margin.table(mytable, 2) ftable(prop.table(mytable, c(1,2))) # tests of independence mytable <- table(Treatment, Improved) chisq.test(mytable) mytable <- table(Improved, Sex) chisq.test(mytable) # Fisher's exact test mytable <- table(Treatment, Improved) fisher.test(mytable) # Chochran-Mantel-Haenszel test mytable <- table(Treatment, Improved, Sex) mantelhaen.test(mytable) # measures of association for a two-way table mytable <- table(Treatment, Improved) assocstats(mytable) detach(Arthritis) # converting a table into a flat file via table2flat table2flat <- function(mytable) { df <- as.data.frame(mytable) rows <- dim(df)[1] cols <- dim(df)[2] x <- NULL for (i in 1:rows){ for (j in 1:df$Freq[i]){ row <- df[i,c(1:(cols-1))] x <- rbind(x,row) } } row.names(x)<-c(1:dim(x)[1]) return(x) } # Using table2flat treatment <- rep(c("Placebo", "Treated"), 3) improved <- rep(c("None", "Some", "Marked"), each=2) Freq <- c(29,13,7,7,7,21) mytable <- data.frame(treatment, improved, Freq) mydata <- table2flat(mytable) head(mydata) # correlations and covariances states<- state.x77[,1:6] cov(states) cor(states) x <- states[,c("Population", "Income", "Illiteracy", "HS Grad")] y <- states[,c("Life Exp", "Murder")] cor(x,y) # partial correlations library(ggm) # partial correlation of population and murder rate, controlling # for income, illiteracy rate, and HS graduation rate pcor(c(1,5,2,3,6), cov(states)) # testing correlations for significance cor.test(states[,3], states[,5]) # correlation matrix and tests of significance via corr.test library(psych) corr.test(states, use="complete") # t test library(MASS) attach(UScrime) t.test(Prob ~ So) # dependent t test sapply(UScrime[c("U1","U2")], function(x)(c(mean=mean(x),sd=sd(x)))) t.test(U1, U2, paired=TRUE) # Wilcoxon two group comparison by(Prob, So, median) wilcox.test(Prob ~ So) sapply(UScrime[c("U1", "U2")], median) wilcox.test(U1,U2,paired=TRUE) # Kruskal Wallis test states <- as.data.frame(cbind(state.region, state.x77)) attach(states) kruskal.test(Illiteracy ~ state.region) detach(states) # Nonparametric multiple comparisons source("http://www.statmethods.net/RiA/wmc.txt") states <- data.frame(state.region, state.x77) wmc(Illiteracy ~ state.region, data=states, method="holm")