# Source: www.openintro.org/stat/supplements.php # License: CC BY-SA # http://creativecommons.org/licenses/by-sa/3.0/ #_____ Data Download _____# download.file("http://www.openintro.org/stat/data/cc.RData", destfile = "cc.RData") load("cc.RData") cc <- countyComplete #_____ Libraries and Data _____# library(openintro) library(maps) data(COL) data(countyComplete) #_____ Initialize Variables _____# these <- c("FIPS", "pop2010", "pop2000", "age_under_5", "age_under_18", "age_over_65", "female", "black", "hispanic", "white_not_hispanic", "no_move_in_one_plus_year", "foreign_born", "foreign_spoken_at_home", "hs_grad", "bachelors", "mean_work_travel", "home_ownership", "housing_multi_unit", "median_val_owner_occupied", "persons_per_household", "per_capita_income", "poverty", "sales_per_capita", "density") d <- countyComplete[,these] d <- na.omit(d) dim(d) growth <- 100*(d$pop2010/d$pop2000-1) dd <- data.frame(growth, d) # dd is identical to downloaded data identical(dd, cc) row.names(dd) <- NULL #_____ Mapping Function _____# # Source: OpenIntro Statistics source # www.openintro.org/stat/textbook.php countyMap <- function(values, FIPS, col=c("red", "green", "blue"), varTrans=I, gtlt="", ...){ if(missing(FIPS)){ stop("Must provide the county FIPS") } #===> Drop NAs <===# FIPS <- FIPS[!is.na(values)] values <- values[!is.na(values)] #===> Scale Values <===# MI <- min(values) MA <- max(values) Leg <- seq(MI, MA, length.out=50) if(identical(varTrans, log)){ VAL <- varTrans(values+0.1) valCol <- varTrans(values+0.1) LegCol <- varTrans(Leg+0.1) } else { VAL <- varTrans(values) valCol <- varTrans(values) LegCol <- varTrans(Leg) } valCol <- 0.98*(valCol - MI)/(MA - MI) + 0.01 LegCol <- 0.98*(LegCol - MI)/(MA - MI) + 0.01 val.04 <- 0.4*(1 - valCol) val.06 <- 0.6*(1 - valCol) val.07 <- 0.7*(1 - valCol) val.08 <- 0.8*(1 - valCol) val.10 <- 1.0*(1 - valCol) Leg.04 <- 0.4*(1 - LegCol) Leg.06 <- 0.6*(1 - LegCol) Leg.07 <- 0.7*(1 - LegCol) Leg.08 <- 0.8*(1 - LegCol) Leg.10 <- 1.0*(1 - LegCol) if(col[1] == "red"){ col <- rgb(val.10, val.07, val.07) COL <- rgb(Leg.10, Leg.07, Leg.07) } else if(col[1] == "green"){ col <- rgb(val.07, val.10, val.07) COL <- rgb(Leg.07, Leg.10, Leg.07) } else if(col[1] == "bg"){ col <- rgb(val.06, val.08, val.10) COL <- rgb(Leg.06, Leg.08, Leg.10) } else if(col[1] == "ye"){ col <- rgb(val.10, val.10, val.04) COL <- rgb(Leg.10, Leg.10, Leg.04) } else { col <- rgb(val.06, val.06, val.10) COL <- rgb(Leg.06, Leg.06, Leg.10) } #===> Remove These <===# data(county.fips) col <- col[match(county.fips$fips, FIPS)] plot(0,0,type="n", axes=FALSE, xlab="", ylab="") par(mar=rep(0.1,4), usr=c(-0.385,0.41,0.44,0.91)) map("county", col=col, fill=TRUE, resolution=0, lty=0, projection="polyconic", mar=rep(0.1,4), add=TRUE, ...) x1 <- 0.33 x2 <- 0.36 for(i in 1:50){ y1 <- i/50 * 0.25 + 0.5 y2 <- (i-1)/50 * 0.25 + 0.5 rect(x1, y1, x2, y2, border="#00000000", col=COL[i]) } VR <- range(VAL) VR[3] <- VR[2] VR[2] <- mean(VR[c(1,3)]) VR1 <- c() VR1[1] <- values[which.min(abs(VAL - VR[1]))] VR1[2] <- values[which.min(abs(VAL - VR[2]))] VR1[2] <- values[which.min(abs(VAL - VR[3]))] VR <- round(VR) if(gtlt %in% c(">", "><")){ VR[3] <- paste(">", VR[3], sep="") } if(gtlt %in% c("<", "><")){ VR[1] <- paste("<", VR[1], sep="") } text(0.36, 0.51, VR[1], pos=4) text(0.36, 0.625, VR[2], pos=4) text(0.36, 0.74, VR[3], pos=4) } #_____ Growth Figures And Summaries _____# summary(growth) sd(growth) hist(growth, breaks = 100, xlab="Growth (percent)", main="", col=COL[1]) val <- (county$pop2010 - county$pop2000) / county$pop2000 val <- val*100 val[val > 30] <- 30 val[val < -30] <- -30 countyMap(val, countyComplete$FIPS, "green", gtlt="><") #_____ Population Summaries _____# summary(d$pop2010) sd(d$pop2010) hist(d$pop2000[d$pop2000 < 1e6], breaks = 100, xlim = c(0, 1000000), xlab="Population, Year 2000", col=COL[1], main="") temp <- paste(sum(d$pop2000 > 1e6), "counties with a population\ngreater than 1,000,000\nare not shown") text(7.5e5, 200, temp) hist(log(d$pop2000), breaks = 60, xlab="Natural Log of Population, Year 2000", col=COL[1], main="") th <- c(1e3, 1e4, 1e5, 1e6) thl <- c("1,000", "10,000", "100,000", "1,000,000") segments(log(th), rep(0, length(th)), log(th), rep(185, length(th)), col=COL[4], lwd=2.5) text(log(th)+c(-0.4,-0.6,0.65,0.75), rep(195, length(th)), thl, col=COL[4]) #_____ Collinearity _____# plot(d$age_under_5, d$age_under_18, xlab="Percent under Age 5", ylab="Percent under Age 18", col=COL[1,4], pch=19, cex=1.3) plot(d$black, d$white_not_hispanic, xlab="Percent Black", ylab="Percent White (not Hispanic)", col=COL[1,4], pch=19, cex=1.3) abline(100, -1, col="#FFFFFF", lty=2, lwd=2) abline(100, -1, col=COL[6], lty=2) plot(d$hs_grad, d$bachelors, xlab="Percent With High School Degree", ylab="Percent With Bachelors Degree", col=COL[1,4], pch=19, cex=1.3, xlim=c(40,100)) #_____ Models _____# m1 <- lm(growth ~ log(pop2000) + age_under_5 + age_under_18 + age_over_65 + female + black + hispanic + white_not_hispanic + no_move_in_one_plus_year + foreign_born + foreign_spoken_at_home + hs_grad + bachelors + mean_work_travel + home_ownership + housing_multi_unit + median_val_owner_occupied + persons_per_household + per_capita_income + poverty + sales_per_capita + density, dd) summary(m1) m2 <- lm(growth ~ log(pop2000) + age_under_5 + age_under_18 + age_over_65 + female + hispanic + white_not_hispanic + no_move_in_one_plus_year + foreign_born + foreign_spoken_at_home + bachelors + mean_work_travel + housing_multi_unit + median_val_owner_occupied + persons_per_household + per_capita_income + poverty + sales_per_capita, dd) summary(m2) #_____ Diagnostics _____# qqnorm(m2$res, main="", xlab="", pch=19, col=COL[1,4]) plot(m2$fit, m2$res, xlab="Predicted Values", ylab="Residuals", pch=19, col=COL[1,4]) abline(h=0, col=COL[6], lty=2) text(-45, 32, "Kalawao County, Hawaii") points(-59.4, 20.6, cex=3) val <- m2$res val <- val val[val > 18] <- 18 val[val < -18] <- -18 countyMap(val, dd$FIPS, "green", gtlt="><") #_____ IQR Summary _____# apply(m2$model[,-1], 2, IQR) * m2$coef[-1]