################### ### Problem One ### ################### Temps <- read.table('http://math.uttyler.edu/nathan/classes/statistics/data/ustemp.data',header=TRUE) attach(Temps) plot(Temps) m1 <- lm(JanTemp ~ Lat + Long) summary(m1) plot(m1$fitted.values,m1$residuals) abline(h=0) lines(lowess(m1$fitted.values,m1$residuals),col='red') # maybe some heteroskedasticity: plot(m1$fitted.values,abs(m1$residuals)) plot(abs(rstudent(m1))) abline(h=qt(1-.05/(2*length(abs(rstudent(m1)))),df=summary(m1)$df[2]-1)) identify(abs(rstudent(m1))) #52 is above the line Temps[52,] # Seattle summary(m1)$df[2]-1 plot(Lat,m1$residuals) abline(h=0) lines(lowess(Lat,m1$residuals),col='red') # more evidence of heteroskedasticity plot(Long,m1$residuals) abline(h=0) lines(lowess(Long,m1$residuals),col='red') # whoa! big curvilinear pattern, if you want to see more # try added var or partial+resid plots: plot(lm(Long~Lat)$residuals, lm(JanTemp~Lat)$residuals, main='Added Variable Plot for Long') abline(h=0) lines(lowess(lm(Long~Lat)$residuals,lm(JanTemp~Lat)$residuals),col='red') plot(Long,m1$residuals + summary(m1)$coefficients[3,1]*Long, main='Partial Plus Residual Plot for Long') summary(m1)$coefficients[3,1] ## clearly the biggest problem here at the moment is the curved ## relationship with long, so we address that rather than the outliers ## or heteroskedasticity m2 <- lm(JanTemp ~ Lat + Long + I(Long^2)) summary(m2) plot(m2$fitted.values,m2$residuals) abline(h=0) lines(lowess(m2$fitted.values,m2$residuals),col='red') plot(Long,m2$residuals) abline(h=0) lines(lowess(Long,m2$residuals),col='red') # still doesn't look like we've fixed our problem m3 <- lm(JanTemp ~ Lat + Long + I(Long^2) + I(Long^3)) summary(m3) plot(m3$fitted.values,m3$residuals) abline(h=0) lines(lowess(m3$fitted.values,m3$residuals),col='red') # note that the heteroskedasticity seems a little better! plot(m3$fitted.values,abs(m3$residuals)) # here is a cheesy test for those so inclined: summary(lm(abs(m3$residuals) ~ m3$fitted.values)) # no apparent relationship between 'x' and 'y' here # take tests such as these with a few grains of salt, of course! plot(Long,m3$residuals) abline(h=0) lines(lowess(Long,m3$residuals),col='red') # looks much better plot(abs(rstudent(m3))) abline(h=qt(1-.05/(2*length(abs(rstudent(m3)))),df=summary(m3)$df[2]-1)) identify(abs(rstudent(m3))) # just under the legal limit: Temps[6,] # San Francisco # Tyler predict(m3,newdata = data.frame(Lat = 32,Long=95)) # Kansas City predict(m3,newdata = data.frame(Lat = 39,Long=95)) # interestingly these are a little low, and with a smallish data set # like we have, we may be a little overfit, though there are some # obvious problems with the first model. # it does predict more closely (for Tyler and Kansas City at least) predict(m1,newdata = data.frame(Lat = 32,Long=95)) predict(m1,newdata = data.frame(Lat = 39,Long=95)) # though m3 is closer: # Louisville, KY predict(m3,newdata = data.frame(Lat = 38,Long=85)) predict(m1,newdata = data.frame(Lat = 38,Long=85)) # Sacramento, CA predict(m3,newdata = data.frame(Lat = 38,Long=121)) predict(m1,newdata = data.frame(Lat = 38,Long=121)) # Pittsburgh, PA predict(m3,newdata = data.frame(Lat = 40,Long=80)) predict(m1,newdata = data.frame(Lat = 40,Long=80)) detach(Temps) ################### ### Problem Two ### ################### Wages <- read.table('http://math.uttyler.edu/nathan/classes/statistics/data/wages.data',header=TRUE) attach(Wages) plot(Wages) m0 <- lm(hours ~ wage.rate) summary(m0) m1 <- lm(hours ~ wage.rate + spouse.earnings + other.earnings + non.earned + asset + age + dependents + education) summary(m1) # note the change in sign of the slope for wage.rate! Which is it? # When assessing the effect of wage.rate on hours we need to get this # straightened out. # some diagnostic plots par(mfrow=c(3,3)) plot(m1$fitted.values,m1$residuals) abline(h=0) lines(lowess(m1$fitted.values,m1$residuals),col='red') for (i in 2:9){ plot(Wages[,i],m1$residuals,main=names(Wages)[i]) abline(h=0) lines(lowess(Wages[,i],m1$residuals),col='red')} par(mfrow=c(1,1)) ## looks like we might have some trouble with outliers plot(abs(rstudent(m1))) abline(h=qt(1-.05/(2*length(abs(rstudent(m1)))),df=summary(m1)$df[2]-1)) identify(abs(rstudent(m1))) # 4 and 19 plot(age,hours) identify(age,hours) # 4 and 5 plot(wage.rate,m1$residuals) identify(wage.rate,m1$residuals) # 19 plot(spouse.earnings,m1$residuals) identify(spouse.earnings,m1$residuals) # 16,19,30,32 plot(other.earnings,m1$residuals) identify(other.earnings,m1$residuals) # 4,5,19,28 # just to see what happens m2 <- lm(hours ~ wage.rate + spouse.earnings + other.earnings + non.earned + asset + age + dependents + education, subset=-c(4,19)) # 4 and 19 seem to show up the most above, # and were outliers in the rstudent test summary(m2) # now it's wage.rate age and education # some more diagnostic plots from m1 before we abandon it, # partial + residual plots par(mfrow=c(3,3)) for (col in 2:dim(m1$model)[2]){ plot(m1$model[,col],m1$residuals+m1$coefficients[col]*m1$model[,col], main=names(m1$coefficients)[col], xlab=names(m1$coefficients)[col], ylab='Partial+Residual') lines(lowess(m1$model[,col],m1$residuals+m1$coefficients[col]*m1$model[,col]), col='red')} par(mfrow=c(1,1)) # maybe a curve with non.earned, again maybe some issues with outliers m3 <- lm(hours ~ wage.rate + other.earnings + non.earned + dependents + education) summary(m3) anova(m1,m3) plot(m3$fitted.values,m3$residuals) abline(h=0) lines(lowess(m3$fitted.values,m3$residuals),col='red') plot(abs(rstudent(m3))) abline(h=qt(1-.05/(2*length(abs(rstudent(m3)))),df=summary(m3)$df[2]-1)) # nobody above line qt(1-.05/(2*length(abs(rstudent(m3)))),df=summary(m3)$df[2]-1) #nope names(m3) summary(m3) # some residual plots par(mfrow=c(2,3)) plot(m3$fitted.values,m3$residuals,main='residual plot') abline(h=0) lines(lowess(m3$fitted.values,m3$residuals),col='red') for (col in 2:dim(m3$model)[2]){ plot(m3$model[,col],m3$residuals, main=names(m3$coefficients)[col], xlab=names(m3$coefficients)[col], ylab='m3 residuals') abline(h=0) lines(lowess(m3$model[,col],m3$residuals),col='red')} par(mfrow=c(1,1)) # again some evidence of outliers plot(wage.rate,m3$residuals) identify(wage.rate,m3$residuals) # 19 is down in the bottom left corner plot(other.earnings,m3$residuals) identify(other.earnings,m3$residuals) # 4, 5, 28 par(mfrow=c(2,3)) for (col in 2:dim(m3$model)[2]){ plot(m3$model[,col],m3$residuals+m3$coefficients[col]*m3$model[,col], main=names(m3$coefficients)[col], xlab=names(m3$coefficients)[col], ylab='Partial+Residual') lines(lowess(m3$model[,col],m3$residuals+m3$coefficients[col]*m3$model[,col]), col='red')} par(mfrow=c(1,1)) plot(non.earned,m3$residuals + summary(m3)$coefficients[4,1]*non.earned, main='Partial+Residual for non.earned') identify(non.earned,m3$residuals + summary(m3)$coefficients[4,1]*non.earned) #5 is up in the top left corner plot(dfbetas(m3)[,2],main='wage rate dfbetas') abline(h=2/sqrt(36)) identify(dfbetas(m3)[,2])# 4,5,13,19 m3.5 <- lm(hours ~ wage.rate + other.earnings + non.earned + dependents + education,subset=-4) summary(m3) summary(m3.5) #4 does make a _big_ difference # in both of these models, keeping the other variables the same, and # increasing wage rate, the hours worked yearly decreases substantially # (84 is more than 2 work weeks and 147 is more than 4!) # handling the curve relationship, but each of these obscures the # effect of wage.rate m4 <- lm(hours ~ wage.rate +I(wage.rate^2) + other.earnings + non.earned + dependents + education) summary(m4) m4.1 <- lm(hours ~ wage.rate + wage.rate*education + other.earnings + non.earned + dependents + education) summary(m4.1) # lets see what happens if we get rid of more of the outliers m5 <- lm(hours ~ wage.rate + other.earnings + non.earned + dependents + education,subset=-c(4,5,13,19,28)) summary(m5) ### in most of these models we see that increasing wage.rate decreases ### hours ## you all don't know this yet, but we can do some 'automatic' selection m6 <- lm(hours ~ wage.rate + spouse.earnings + other.earnings + non.earned + asset + age + dependents + education) step(m6,direction="both") m7 <- lm(hours ~ wage.rate + other.earnings + asset + dependents + education) summary(m7) # again wage.rate increases, hours decreases