# fitting the model: Prestige <- read.table("http://math.uttyler.edu/nathan/classes/grad-statistics/data/prestige.data",header=TRUE) Prestige[1:5,] attach(Prestige) plot(education,prestige) model <- lm(prestige ~ education) summary(model) abline(model,col='red') confint(model) # null model is y = ybar + error # need resid sum sq for this model: ss.y <- sum((prestige - mean(prestige))^2) ss.resid <- sum(model$resid^2) f.val <- ((ss.y - ss.resid)/1)/(ss.resid/model$df.residual) f.val #see this in the summary output sqrt(f.val) # see this in the t column in the summary output 1 - pf(f.val,1,model$df.residual) # the p-value # this approach can compare models: big.model <- lm(prestige ~ education + income + women + census) summary(big.model) smaller.model <- lm(prestige ~ education + income) summary(smaller.model) # let's compare the two models: residsq.big <- sum(big.model$resid^2) residsq.smaller <- sum(smaller.model$resid^2) df.big <- big.model$df.residual df.small <- smaller.model$df.residual f.stat <- ((residsq.smaller - residsq.big)/(df.small - df.big))/ (residsq.big/df.big) f.stat 1-pf(f.stat,df1=(df.small-df.big),df2=df.big) # pf is the cumulative density # this gives us the p-value # h0: both models same # ha: big model preferable anova(big.model,smaller.model) # a shortcut to the above # a cautionary tale: another.model <- lm(prestige ~ women + census) summary(another.model) anova(big.model,another.model) # on to diagnostics # the number one diagnostic tool, the residual plot plot(smaller.model$fitted.values,smaller.model$resid) abline(h=0) # sometimes this helps plot(smaller.model$fitted.values,abs(smaller.model$resid)) # a cheesy test: summary(lm(abs(smaller.model$resid) ~ smaller.model$fitted.values)) plot(education,smaller.model$resid) abline(h=0) plot(income,smaller.model$resid) abline(h=0) #also try the variables you left out plot(women,smaller.model$resid) abline(h=0) plot(census,smaller.model$resid) lines(lowess(census,smaller.model$resid),col='red') summary(lm(smaller.model$resid ~ census)) summary(lm(smaller.model$resid ~ income)) # see page 55 in book par(mfrow=c(3,3)) # plot in 3x3 grid for(i in 1:9) plot(1:50,rnorm(50)) # ideal situation for(i in 1:9) plot(1:50,(1:50)*rnorm(50)) # strong heteroskedasticity for(i in 1:9) plot(1:50,sqrt(1:50)*rnorm(50)) # less strong heteroskedasticity for(i in 1:9) plot(1:50,cos((1:50)*pi/25)+rnorm(50)) # curved relationship par(mfrow=c(1,1)) model <- lm(prestige ~ income + education) summary(model) plot(model$fitted.values,model$resid) abline(h=0) plot(model$fitted.values,abs(model$resid)) plot(income,model$resid) plot(education,model$resid) new.model <- lm(prestige ~ log(income) + education) summary(new.model) plot(new.model$fitted.values,new.model$resid) abline(h=0) plot(new.model$fitted.values,abs(new.model$resid)) plot(log(income),new.model$resid) abline(h=0) plot(education,new.model$resid) another.model <- lm(prestige ~ log(income) + education + I(education^2)) summary(another.model) another.model <- lm(prestige ~ log(income) + education + I(education^2) + I(education^3)) summary(another.model) anova(another.model,new.model) plot(another.model$fitted.values,another.model$resid) abline(h=0) plot(education,another.model$resid) abline(h=0) plot(I(education^2),another.model$resid) abline(h=0) plot(I(education^3),another.model$resid) abline(h=0) plot(log(income),another.model$resid) abline(h=0) plot(census,another.model$resid) abline(h=0) lines(lowess(census,another.model$resid),col='red') # why not make it more absurd? yet.another.model <- lm(prestige ~ log(income) + education + I(education^2) + I(education^3) + census + I(census^2)) summary(yet.another.model) anova(another.model,yet.another.model) plot(yet.another.model$fitted.values, yet.another.model$residuals) plot(log(income),yet.another.model$residuals) plot(I(census^2),yet.another.model$residuals) qqnorm(another.model$resid) # see page 59 in book par(mfrow=c(3,3)) for(i in 1:9) qqnorm(rnorm(50)) for(i in 1:9) qqnorm(exp(rnorm(50))) #skewed for(i in 1:9) qqnorm(rcauchy(50)) #long-tailed for(i in 1:9) qqnorm(runif(50)) #short tailed