################################################################# # Multicollinearity, what to do when your X variables are correlated: ################################################################# # "What's the problem?" you ask. # var(beta_j) = j-th diag element of (X'X)^(-1)*sigma^2 # = 1/(1-R_j^2) * sigma^2/((n-1)*SE(X_j)) # # R_j^2 is the R^2 from regressing X_j on all of the other Xs. # the 1/(1-R_j^2) term is called the variance inflation factor # note that as X_j is more linearly related to the other Xs, R_j^2 # gets up closer to 1, and 1-R_j^2 goes down closer to zero, so # 1/(1-R_j^2) trots off to infinity, and you can see what happens to # var(beta_j) :( # note that variance is in squared units, so: plot(seq(0,.99,.01),sqrt(1/(1-seq(0,.99,.01))),type='l',main='VIF Troubles', xlab='R_j^2',ylab='sqrt(VIF)') abline(h=2) # An example: census <- read.table("http://math.uttyler.edu/nathan/classes/statistics/data/census.data",header=TRUE) attach(census) model.1 <- lm(undercount ~ minority + crime +poverty + language + highschool + housing + conventional) summary(model.1) minority.rsq <- summary(lm(minority ~ poverty + crime + language + highschool + housing + conventional))$r.squared minority.vif <- 1/(1-minority.rsq) minority.vif poverty.rsq <- summary(lm(poverty ~ minority + crime + language + highschool + housing + conventional))$r.squared poverty.vif <-1/(1-poverty.rsq) poverty.vif all.at.once <- function (mod) { dat <- as.matrix(mod$model[,-1]) rsq.omit <- function (i) { summary(lm(dat[,i] ~ dat[,-i]))$r.squared} rsqs <- sapply(1:dim(dat)[2],rsq.omit) vifs <- 1/(1-rsqs) names(vifs) <- names(mod$model[,-1]) vifs} all.at.once(model.1) plot(data.frame(minority,crime,poverty,highschool)) # the terms causing the problems are minority, poverty, highschool, and # to a slightly lesser degree crime. A first approach to solving the # multicollinearity issue, the problems with which we've already # alluded to, involves deletion of terms. Our first approach is ad-hoc. mod.2 <- lm(undercount ~ minority + crime + poverty + language + conventional) summary(mod.2) # multiple r^2 and rse about same anova(model.1,mod.2) # the vifs again, note highschool was the second highest vif before and # it has been removed due to non-significance in mod.1 all.at.once(mod.2) plot(data.frame(minority,crime,poverty)) mod.3 <- lm(undercount ~ crime + poverty + language + conventional) summary(mod.3) mod.4 <- lm(undercount ~ minority + language + conventional) summary(mod.4) # we prefer mod.4 to mod.3 using adjusted r^2 and rse as criteria # now we have no trouble with collinearity, but note that this is # artificial since we just removed all variables collinear with # minority all.at.once(mod.4) # note also, we've done no diagnostics, we might prefer mod.3 if # mod.4 violates many of the regression assumptions # # note further that using anova test to compare mod.3 and mod.4 is not # allowed # other criteria to choose models instead of using adj r^2 and rse: # Aikike information criterion: AIC(model.1,mod.2,mod.3,mod.4) # we prefer small AIC # Bayes information criterion: AIC(model.1,mod.2,mod.3,mod.4,k=log(length(undercount))) # again we prefer small BIC # there are automated ways to take one big model, say mod.1, and start # systematically dropping and re-adding terms in an effort to maximize # adj. r^2 (say) or minimize AIC or BIC. step(model.1) mod.5 <- lm(undercount ~ poverty + language + crime + conventional + minority) all.at.once(mod.5) # of course we now have problems with collinearity again :( # Alternatively, concerning model selection, we have only 7 variables # here, each of which can be included or excluded, think 1 or 0. So # there are 2^7 = 128 possible models, compute them all and pick the # one with the highest adj. r^2 or similar. This is left as an # exercise! Note that these automated methods don't directly address # the autocorrelation issue and the elimination of correlated variables # method doesn't necessarily produce models with high r^2 or small rse # or low AIC or ...