Hot.dogs <- read.table('http://math.uttyler.edu/nathan/classes/statistics/data/hotdogs.data',header=TRUE) detach() attach(Hot.dogs) Hot.dogs[1:10,] Type plot(Sodium,Calories,type='n') points(Sodium[Type=='Beef'],Calories[Type=='Beef'],pch='b') points(Sodium[Type=='Meat'],Calories[Type=='Meat'],pch='m') points(Sodium[Type=='Poultry'],Calories[Type=='Poultry'],pch='p') ### two new variables beef <- ifelse(Type=='Beef',1,0) meat <- ifelse(Type=='Meat',1,0) data.frame(Calories,Sodium,beef,meat,Type) hd.mod <- lm(Sodium ~ Calories + beef + meat + beef*Calories + meat*Calories) summary(hd.mod) # p = .068 H0: beta4=0, beta5 in model # p = .376 H0: beta5=0, beta4 in model # # want to test: # H0: beta4 = beta5 = 0 hd.mod.1 <- lm(Sodium ~ Calories + beef + meat) # here's how to test this null hypothesis anova(hd.mod,hd.mod.1) #p = .1853, fail to reject H0: beta4 = beta5 = 0 # I can go ahead and use hd.mod.1 summary(hd.mod.1) # don't really need this here, but: # H0: beta2 = beta3 = 0 hd.mod.2 <- lm(Sodium ~ Calories) anova(hd.mod.1,hd.mod.2) # reject h0, we need the difference in intercept terms included. ########################## Star <- read.table('http://math.uttyler.edu/nathan/classes/statistics/data/star.data',header=TRUE) Star[1:5,] attach(Star) plot(temp,light) m1 <- lm(light~temp) summary(m1) abline(m1) abline(v=3.7,lty=3) m2 <- lm(light~temp,subset=temp>3.7) summary(m2) abline(m2,col='blue') plot(m1$fitted.values,m1$resid) #### The number one diagnostic tool # the residual plot ########################################## ########################################## ######## outliers ######## ########################################## ########################################## # first a perfect situation x <- runif(30,-3,3) y <- 4 + 7*x + rnorm(30,sd=2) plot(x,y) first.model <- lm(y~x) summary(first.model) abline(first.model) # let's add some outliers: # first in the y-direction: new.x <- c(x,0) new.y <- c(y,-20) plot(new.x,new.y) second.model <- lm(new.y ~ new.x) abline(first.model) abline(second.model,col='red') # now let's put an outlier in the x direction instead: new.x[31] <- 10 new.y[31] <- 4+7*10 # putting it exactly in the pattern third.model <- lm(new.y ~ new.x) plot(new.x,new.y) abline(first.model) abline(third.model,col='red') new.y[31] <- -10 fourth.model <- lm(new.y ~ new.x) plot(new.x,new.y) abline(first.model) abline(fourth.model,col='red') # yikes! # the notion of being far away in the y direction is somehow just # `outlierness' but being far away in the x direction has the # somewhat clever name `leverage'. Note that third.model indicates # that # leverage by itself is not necessarily a bad thing, but couple # leverage and y-outlierness and you've got trouble. You might think # to # yourself, ``Self, I can just use a residual plot to get at all of # this business when doing multiple regression.'' See the next # (contrived -- though of course the others were too) example for # something that should worry you: x <- 5*(runif(15) - .5) y <- 8 - x + rnorm(15) x <- c(x,20) y <- c(y,median(y)) plot(x,y) mod <- lm(y~x) abline(mod) abline(lm(y[-16]~x[-16]),col='red') #notice how the residual is not even the largest: plot(mod$fitted.values,mod$resid) # how will we catch these points, the above suggests running the # regression without each point and finding the fitted value, then # comparing it to the y value. These are poetically called the # jacknife residuals (sometimes the externally studentized residuals). # In practice we scale these according to their variance, rstudent(mod) plot(abs(rstudent(mod))) # These follow a T-distribution with n-(# terms in model)-1 # degreees of freedom qt(.975,df=13) # of course we're implicitly testing all 16 when we test the largest: qt(1-(.05/16)/2,df=13) abline(h=qt(1-(.05/16)/2,df=13)) # along the same lines one can look at the dffits -- the difference in # fitted values when including the ith observation and excluding it # from the regression: dffits(mod) plot(dffits(mod)) # as a rule of thumb we are curious of any greater than 2*sqrt((k+1)/n) abline(h=2*sqrt(2/length(x))) # also along the same lines is the dfbetas -- the difference in the # slopes dfbetas(mod) # of course there is one of these for each observation and for each # element of the data set! plot(dfbetas(mod)[,1]) # the rule of thumb cutoff: abline(h=2/sqrt(length(x))) plot(dfbetas(mod)[,2]) # the rule of thumb cutoff: abline(h=2/sqrt(length(x))) # We are implicitly testing a lot of stuff here, so your mileage may # vary. One of my favorite lines from another book on regression is: # ``torture the data enough and they will confess'' # The general idea is valid, however, in that we are fitting the # model with and without the point in question and hoping to see not # too much difference, otherwise the point has been fairly influential. # These and more are available using: influence.measures(mod) # leverage: # we look at the hat matrix. Note that y.hat = X beta.hat # = X (X'X)^{-1}X' y # = H y # # H is the hat matrix, and the hat values are the values along the # diagonal summary(mod) X <- model.matrix(mod) diag(X %*% solve(t(X)%*%X)%*% t(X)) # or we can use the built in: hat(model.matrix(mod)) # var(ith residual) = (1-hat(x)[i])sigma^2 so big hats force the ith # residual to be small, and indicate points with high leverage. Recall # that high leverage need not mean a problem, but one should be careful # about such points. # rule of thumb -- high leverage = hat > 2(k+1)/n # added variable plots: for the sake of concreteness we'll do this for # x1 with the obvious modifications to be made when necessary: # # step 1: regress x1 on x2 ... xn and get the residuals # step 2: regress y on x2 ... xn and get the residuals # step 3: plot step 1 resids against step2 resids Prestige <- read.table("http://math.uttyler.edu/nathan/classes/statistics/data/prestige.data",header=TRUE) attach(Prestige) my.model <- lm(prestige ~ education + income + census) summary(my.model) step.1 <- lm(education ~ income + census) step.2 <- lm(prestige ~ income + census) # added variable plot: plot(step.1$resid,step.2$resid) # note: summary(lm(step.2$resid ~ step.1$resid))$coeff summary(my.model)$coeff identify(step.1$resid,step.2$resid) Prestige[2,] influence.measures(my.model) # notice row 2 is starred! # partial residual plot: plot x1 against resids + betahat x1 summary(my.model) plot(education,my.model$resid + 4.653*education) summary(lm(my.model$resid + 4.653*education ~ education))$coeff # again # let's do census: plot(census,my.model$resid + .0005762*census) lines(lowess(census,my.model$resid + .0005762*census),col='red') # the added variable plot step.1 <- lm(census ~ income + education) step.2 <- lm(prestige ~ income + education) plot(step.1$resid,step.2$resid) lines(lowess(step.1$resid,step.2$resid),col='red') # general consensus: # added variable plots -- better at identifying outliers, leverage, etc. # partial + residual plots -- better at identifying non-linearity another.model <- lm(prestige ~ income + education + census + I(census^2)) summary(another.model) plot(income,another.model$residuals) plot(income,prestige) plot(log(income),prestige) plot(sqrt(income),prestige) a.third.model <- lm(prestige ~ log(income) + education + census + I(census^2)) summary(a.third.model) plot(a.third.model$fitted.values,a.third.model$residuals) abline(h=0) lines(lowess(a.third.model$fitted.values,a.third.model$residuals)) plot(log(income),a.third.model$residuals) plot(education,a.third.model$residuals) plot(census,a.third.model$residuals) lines(lowess(census,a.third.model$residuals)) step.1 <- lm(log(income) ~ education + census + I(census^2)) step.2 <- lm(prestige ~ education + census + I(census^2)) plot(step.1$residuals,step.2$residuals) one.more <- lm(prestige ~ log(income)+ education + census + I(census^2) + I(census^3)) summary(one.more)