oPres <- na.omit(Prestige) attach(oPres) white.collar <- ifelse(type=='wc',1,0) prof <- ifelse(type=='prof',1,0) data.frame(income,education,white.collar,prof) #### kind of like problem 2 in the HW m1 <- lm(income ~ education) m2 <- lm(income ~ education + white.collar + prof) summary(m2) anova(m1,m2) #### kind of like problem 3 in the HW m3 <- lm(income ~ education + white.collar + prof + white.collar*education + prof*education) summary(m3) anova(m3,m2) summary(m3)$coeff income = -1865 + 866*education + 3647 white.collar -3068 prof -569 white.collar*education + 234 prof*education + error blue collar: income = -1865 + 866*education + error white.collar: income = -1865 + 866*education + 3647 white.collar -569 white.collar*education + error = 1782 + 297*education + error prof: income = -1865 + 866*education -3068 prof + 234 prof*education + error = -4933 + 1100*education + error summary(lm(income[white.collar==1]~education[white.collar==1])) m4 <- lm(income ~ education + census + white.collar + prof + white.collar*education + prof*education + white.collar*census + prof*census) m5 <- lm(income ~ education + census + white.collar + prof) anova(m4,m5) # added variable plots: for the sake of concreteness we'll do this for # x1 with the obvious modifications to be made when necessary: # # step 1: regress x1 on x2 ... xn and get the residuals # step 2: regress y on x2 ... xn and get the residuals # step 3: plot step 1 resids against step2 resids Prestige <- read.table("http://math.uttyler.edu/nathan/classes/statistics/data/prestige.data",header=TRUE) attach(Prestige) my.model <- lm(prestige ~ education + income + census) summary(my.model) step.1 <- lm(education ~ income + census) step.2 <- lm(prestige ~ income + census) # added variable plot: plot(step.1$resid,step.2$resid) # note: summary(lm(step.2$resid ~ step.1$resid))$coeff summary(my.model)$coeff identify(step.1$resid,step.2$resid) Prestige[2,] influence.measures(my.model) # notice row 2 is starred! # partial residual plot: plot x1 against resids + betahat x1 summary(my.model) plot(education,my.model$resid + 4.653*education) summary(lm(my.model$resid + 4.653*education ~ education))$coeff # again # let's do census: plot(census,my.model$resid + .0005762*census) lines(lowess(census,my.model$resid + .0005762*census),col='red') # the added variable plot step.1 <- lm(census ~ income + education) step.2 <- lm(prestige ~ income + education) plot(step.1$resid,step.2$resid) lines(lowess(step.1$resid,step.2$resid),col='red') # general consensus: # added variable plots -- better at identifying outliers, leverage, etc. # partial + residual plots -- better at identifying non-linearity another.model <- lm(prestige ~ income + education + census + I(census^2)) summary(another.model) plot(income,another.model$residuals) plot(income,prestige) plot(log(income),prestige) plot(sqrt(income),prestige) a.third.model <- lm(prestige ~ log(income) + education + census + I(census^2)) summary(a.third.model) plot(a.third.model$fitted.values,a.third.model$residuals) abline(h=0) lines(lowess(a.third.model$fitted.values,a.third.model$residuals)) plot(log(income),a.third.model$residuals) plot(education,a.third.model$residuals) plot(census,a.third.model$residuals) lines(lowess(census,a.third.model$residuals)) step.1 <- lm(log(income) ~ education + census + I(census^2)) step.2 <- lm(prestige ~ education + census + I(census^2)) plot(step.1$residuals,step.2$residuals) one.more <- lm(prestige ~ log(income)+ education + census + I(census^2) + I(census^3)) summary(one.more)