################################################################ prostate <- read.table("http://math.uttyler.edu/nathan/classes/grad-statistics/data/prostate.data",header=TRUE) f <- function (i) { summary(lm(prostate$lpsa~.,data=data.frame(prostate[1:i])))$r.squared} plot(sapply(1:8,f)) plot(sapply(1:8,function (i) { summary(lm(prostate$lpsa~.,data=data.frame(prostate[1:i])))$sigma})) ################################################################ # we're in Chapter 4, we'll postpone the last part of 4.1 on # autocorrelation and move on to 4.2: Finding unusual observations # let's set up a `perfect' situation first: x <- runif(30,-3,3) y <- 8 - 4*x + rnorm(30,sd=2) plot(x,y) normal.model <- lm(y~x) abline(normal.model) summary(normal.model) # let's add an 'outlier' in the y-direction newx <- c(x,0) newy <- c(y,-5) plot(newx,newy) model.1 <- lm(newy~newx) abline(normal.model) abline(model.1,col='red') summary(model.1) # let's make it more extreme newy[31] <- -15 plot(newx,newy) model.2 <- lm(newy~newx) abline(normal.model) abline(model.1,col='red') abline(model.2,col='green') summary(model.2) # you get the picture # now let's instead change the outlier to be in the x-direction newx[31] <- 15 newy[31] <- 8-4*15 plot(newx,newy) model.3 <- lm(newy~newx) abline(normal.model) abline(model.3,col='red') summary(model.3) # now let's see what happens: newy[31] <- 20 plot(newx,newy) model.4 <- lm(newy~newx) abline(normal.model) abline(model.3,col='red') abline(model.4,col='green') summary(model.4) #yikes! # the notion of being far away in the y direction is somehow just # `outlierness' but being far away in the x direction has the # somewhat clever name `leverage'. Note that model 3 indicates that # leverage by itself is not necessarily a bad thing, but couple # leverage and outlierness and you've got trouble. You might think to # yourself, ``Self, I can just use a residual plot to get at all of # this business when doing multiple regression.'' See the next # (contrived -- though of course the others were too) example for # something that should worry you: x <- 5*(runif(15) - .5) y <- 8 - x + rnorm(15) x <- c(x,20) y <- c(y,median(y)) plot(x,y) mod <- lm(y~x) abline(mod) abline(lm(y[-16]~x[-16]),col='red') #notice how the residual is not even the largest: plot(mod$fitted.values,mod$resid) # how will we catch these points, the above suggests running the # regression without each point and finding the fitted value, then # comparing it to the y value. These are poetically called the # jacknife residuals (sometimes the externally studentized residuals). # In practice we scale these according to their variance, rstudent(mod) plot(abs(rstudent(mod))) # These follow a T-distribution with n-(k+1)-1 (n-p-1 in the book) # degreees of freedom qt(.975,df=13) # of course we're implicitly testing all 16 when we test the largest: qt(1-(.05/16)/2,df=13) abline(h=qt(1-(.05/16)/2,df=13)) # along the same lines one can look at the dffits -- the difference in # fitted values when including the ith observation and excluding it # from the regression: dffits(mod) plot(dffits(mod)) # as a rule of thumb we are curious of any greater than 2*sqrt((k+1)/n) abline(h=2*sqrt(2/length(x))) # also along the same lines is the dfbetas -- the difference in the # slopes dfbetas(mod) # of course there is one of these for each observation and for each # element of the data set! plot(dfbetas(mod)[,1]) # the rule of thumb cutoff: abline(h=2/sqrt(length(x))) plot(dfbetas(mod)[,2]) # the rule of thumb cutoff: abline(h=2/sqrt(length(x))) # we are implicitly testing a lot of stuff here, so your mileage may # vary. The general idea is valid, however, in that we are fitting the # model with and without the point in question and hoping to see not # too much difference, otherwise the point has been fairly influential. # These and more are available using: influence.measures(mod) # 'outliers' are identified, but just how something gets starred is # opaque. I'll leave cov.r and cook.d for you to look up when you're # bored, the first looks for changes in the covariance matrix and the # other is sort of a summary of the dfbetas. The last, the hat value, # is the diagonal entries of the hat matrix: H = X(X'X)^(-1)X', so # called because yhat = Xbetahat = X(X'X)^(-1)X'y = Hy. capx <- model.matrix(mod) hat.mat <- capx %*% solve(t(capx) %*% capx) %*% t(capx) #t = transpose # %*% = matrix # mult. hat.mat diag(hat.mat) # or we can use the built in: hat(x) # var(ith residual) = (1-hat(x)[i])sigma^2 so big hats force the ith # residual to be small, and indicate points with high leverage. Recall # that high leverage need not mean a problem, but one should be careful # about such points. ################### ## 4.3 # added variable plots: for the sake of concreteness we'll do this for # x1 with the obvious modifications to be made when necessary: # # step 1: regress x1 on x2 ... xn and get the residuals # step 2: regress y on x2 ... xn and get the residuals # step 3: plot step 1 resids against step2 resids Prestige <- read.table("http://math.uttyler.edu/nathan/classes/grad-statistics/data/prestige.data",header=TRUE) attach(Prestige) my.model <- lm(prestige ~ education + income + census) summary(my.model) step.1 <- lm(education ~ income + census) step.2 <- lm(prestige ~ income + census) # added variable plot: plot(step.1$resid,step.2$resid) # note: summary(lm(step.2$resid ~ step.1$resid))$coeff summary(my.model)$coeff identify(step.1$resid,step.2$resid) Prestige[2,] influence.measures(my.model) # notice row 2 is starred! # partial residual plot: plot x1 against resids + betahat x1 summary(my.model) plot(education,my.model$resid + 4.653*education) summary(lm(my.model$resid + 4.653*education ~ education))$coeff # again # let's do census: plot(census,my.model$resid + .0005762*census) lines(lowess(census,my.model$resid + .0005762*census),col='red') # the added variable plot step.1 <- lm(census ~ income + education) step.2 <- lm(prestige ~ income + education) plot(step.1$resid,step.2$resid) lines(lowess(step.1$resid,step.2$resid),col='red') # perhaps this confirms the books assertion: # added variable plots -- better at identifying outliers, leverage, # etc. # partial + residual plots -- better at identifying non-linearity # Multicollinearity, what to do when your X variables are correlated: # "What's the problem?" you ask. # var(beta_j) = j-th diag element of (X'X)^(-1)*sigma^2 # = 1/(1-R_j^2) * sigma^2/((n-1)*SE(X_j)) # # R_j^2 is the R^2 from regressing X_j on all of the other Xs. # the 1/(1-R_j^2) term is called the variance inflation factor # note that as X_j is more linearly related to the other Xs, R_j^2 # gets up closer to 1, and 1-R_j^2 goes down closer to zero, so # 1/(1-R_j^2) trots off to infinity, and you can see what happens to # var(beta_j) :( # note that variance is in squared units, so: plot(seq(0,.99,.01),sqrt(1/(1-seq(0,.99,.01))),type='l',main='VIF Troubles', xlab='R_j^2',ylab='sqrt(VIF)') abline(h=2) # An example: census <- read.table("http://math.uttyler.edu/nathan/classes/grad-statistics/data/census.data",header=TRUE) attach(census) model.1 <- lm(undercount ~ minority + crime +poverty + language + highschool + housing + conventional) summary(model.1) minority.rsq <- summary(lm(minority ~ poverty + crime + language + highschool + housing + conventional))$r.squared minority.vif <- 1/(1-minority.rsq) minority.vif poverty.rsq <- summary(lm(poverty ~ minority + crime + language + highschool + housing + conventional))$r.squared poverty.vif <-1/(1-poverty.rsq) poverty.vif all.at.once <- function (mod) { dat <- as.matrix(mod$model[,-1]) rsq.omit <- function (i) { summary(lm(dat[,i] ~ dat[,-i]))$r.squared} rsqs <- sapply(1:dim(dat)[2],rsq.omit) vifs <- 1/(1-rsqs) names(vifs) <- names(mod$model[,-1]) vifs} all.at.once(model.1)