# Code assumes you have saved the relevant data locally; # it is generally too big to keep fetching over HTTP setwd("~/files/class_data/") dir() # A sample of the raw data x <- read.csv(bzfile("2007.csv.bz2"), as.is=TRUE, nrow=1000) str(x) # The cleaned version: x <- readRDS("airline2007_clean.Rds") rownames(x) <- NULL dim(x) str(x) # For now, going to take a 5% sample to speed up the # plots and exploration x <- x[sample(1:nrow(x),nrow(x)*0.05),] # Calculate the base linear model out <- lm(arrDelay ~ depDelay, data=x) out summary(out) # plot the residuals plot(out$fitted, out$resid, pch=".") plot(out$fitted/3600, out$resid/3600, pch=".", xlab="Py", ylab="My") abline(h=0,col="red", lty="dashed") plot(out$fitted/3600, out$resid/3600, pch=19, xlab="Py", ylab="My", cex=0.5, col=rgb(0,0,0,0.01)) # Let's explore relationship to other covariates tapply(out$resid, x$dest, mean) sort(round(tapply(out$resid, x$dest, mean) / 60)) sort(round(tapply(out$resid, x$dest, sd) / 60)) # make sense of this data: sdDest <- sort(round(tapply(out$resid, x$dest, sd) / 60), decreasing=TRUE) air <- read.csv("airports.csv", as.is=TRUE) index <- match(names(sdDest), air$iata) cat(sprintf("%40s - %d", air$airport[index], sdDest), sep="\n") library(maps) map("usa") map("state",add=TRUE,lwd=0.2) cols <- rgb(0,0,1,sdDest / max(sdDest)) points(air$long[index], air$lat[index], pch=19, cex=1.5, col=heat_hcl(100,alpha=0.7)) # Now, try to fit the linear model using airport as # a covariate out2 <- lm(arrDelay ~ depDelay + dest - 1, data=x) out2 summary(out2) out2 <- lm(arrDelay ~ depDelay + dest - 1, data=x) plot(out2$fitted/3600, out2$resid/3600, pch=".", xlab="Py", ylab="My") abline(h=0,col="red", lty="dashed") # How do the two fits align? anova(out,out2) summary(out)$r.squared summary(out2)$r.squared plot(out$resid,out2$res, pch=".")