<br />
# 8 回归<br />
# 8.1 回归的多面性<br />
# 8.2 OLS回归<br />
# 8.3 回归诊断<br />
# 8.4 异常观测值<br />
# 8.5 改进措施<br />
# 8.6 选择“最佳”的回归模型<br />
# 8.7 深层次分析<br />
# 8.8 小结</p>
<p># 8.1 回归的多面性<br />
# 8.1.1 OLS回归的适用情境<br />
# 8.1.2 基础回顾<br />
# ---------------------------------------------------------------------------<br />
# 本章重点是普通最小二乘(OLS)回归法<br />
# 包括简单线性回归、多项式回归和多元线性回归<br />
# 困难:1 发现有趣的问题<br />
# 2 设计一个有用的、可以测量的响应变量<br />
# 3 收集合适的数据<br />
# 如何用R函数拟合OLS回归模型、评价拟合优度、检验假设条件以及选择模型</p>
<p># 8.2 OLS回归<br />
# 8.2.1 用lm()拟合回归模型<br />
# 8.2.2 简单线性回归<br />
# 8.2.3 多项式回归<br />
# 8.2.4 多元线性回归<br />
# 8.2.5 有交互项的多元线性回归<br />
# ---------------------------------------------------------------------------<br />
# 目标:通过减少响应变量的真实值与预测值的差值来获得模型参数(截距项和斜率)<br />
# 统计假设:<br />
# 正态性:对于固定的自变量值,因变量值成正态分布<br />
# 独立性:Yi值之间相互独立<br />
# 线性:因变量与自变量之间为线性相关<br />
# 同方差性:因变量的方差不随自变量的水平不同而变化<br />
# OLS回归还假定自变量是固定的且测量无误差,但在实践中通常都放松了这个假设。</p>
<p># 表8-2 R表达式中常用的符号<br />
# ~ + : * ^ . - -1 I() function<br />
# 具体用途略 ^__^</p>
<p># 表8-3 对拟合线性模型非常有用的其他函数<br />
# summary() coefficients() confint() fitted() residuals()<br />
# anova() vcov() AIC() plot() predict()</p>
# The following example exercises the functions from table 8-3.
# The `women` dataset gives height and weight for 15 women aged 30-39;
# we predict weight from height.
# Listing 8-1: simple linear regression
# lm(formula, data) is the basic model-fitting function
fit <- lm(weight ~ height, data = women)
summary(fit)     # detailed results for the fitted model
women$weight     # observed response values
fitted(fit)      # predicted values from the fitted model
residuals(fit)   # residuals of the fitted model
plot(women$height, women$weight,
     xlab = "Height (in inches)",
     ylab = "Weight (in pounds)")
abline(fit)      # add the fitted regression line
# Listing 8-2: polynomial regression
# I(height^2) adds a quadratic term; polynomials beyond cubic are
# rarely necessary.
fit2 <- lm(weight ~ height + I(height^2), data = women)
summary(fit2)
plot(women$height, women$weight,
     xlab = "Height (in inches)",
     ylab = "Weight (in lbs)")
lines(women$height, fitted(fit2))
residuals(fit2)
<p># 比三次更高的多项式几乎没有必要</p>
# scatterplot() from the car package draws a convenient bivariate plot
# with a regression line and a loess smooth.
install.packages("car")
library(car)
# NOTE(review): the original passed spread= and lty.smooth= as top-level
# arguments.  car >= 3.0 no longer accepts them there — they were
# forwarded to plot() and produced the "not a graphical parameter"
# warnings the author recorded.  In current car these options belong
# inside the smooth= list; confirm against the installed car version.
scatterplot(weight ~ height,
            data = women,
            smooth = list(spread = FALSE, lty.smooth = 2),
            pch = 19,
            main = "Women Age 30-39",
            xlab = "Height (inches)",
            ylab = "Weight (lbs.)")
# spread=FALSE suppresses the spread/asymmetry bands around the smooth
# lty.smooth=2 draws the loess fit as a dashed line
# pch=19 plots solid circles (the default is open circles)
# lm() expects a data frame, so convert the state.x77 matrix first.
# Listing 8-3: examining bivariate relationships
states <- as.data.frame(state.x77[, c("Murder", "Population", "Illiteracy",
                                      "Income", "Frost")])
cor(states)
library(car)
scatterplotMatrix(states, spread = FALSE, lty.smooth = 2,
                  main = "Scatter Plot Matrix")

# Listing 8-4: multiple linear regression
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
summary(fit)
# Listing 8-5: multiple linear regression with a significant interaction
fit <- lm(mpg ~ hp + wt + hp:wt, data = mtcars)
summary(fit)
install.packages("effects")
library(effects)
plot(effect("hp:wt", fit,
            list(wt = c(2.2, 3.2, 4.2))),
     multiline = TRUE)
# As car weight increases, the hp-mpg relationship weakens; at wt = 4.2
# the line is nearly flat, i.e. increasing hp no longer changes mpg.
<p># 8.3 回归诊断<br />
# 8.3.1 标准方法<br />
# 8.3.2 改进的方法<br />
# 8.3.3 线性模型假设的综合验证<br />
# 8.3.4 多重共线性<br />
# --------------------------------------------------------------------------<br />
states <- as.data.frame(state.x77[, c("Murder", "Population",
                                      "Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
confint(fit)
# A 1% change in illiteracy shifts the murder rate within the 95% CI
# [2.38, 5.90].  Frost's interval contains 0: holding the other
# variables fixed, temperature change is unrelated to the murder rate.
fit <- lm(weight ~ height, data = women)
par(mfrow = c(2, 2))
plot(fit)  # the four standard diagnostic plots (book figure differs)
# Normality: Q-Q plot points should fall on the 45-degree line.
# Independence: can only be judged from how the data were collected.
# Linearity: the residuals-vs-fitted plot shows a clear curved pattern.
# Homoscedasticity: scale-location points should scatter randomly
#   around a horizontal line.
# Residuals vs leverage: outliers, high-leverage points, influential
#   points (leverage deserves further study).
fit2 <- lm(weight ~ height + I(height^2), data = women)
par(mfrow = c(2, 2))
plot(fit2)
# Refit after dropping observation 13 (violates normality) and
# observation 15 (influential).
newfit2 <- lm(weight ~ height + I(height^2), data = women[-c(13, 15), ])
par(mfrow = c(2, 2))
plot(newfit2)
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
par(mfrow = c(2, 2))
plot(fit)
# Nevada stands out as an outlier.
<p># 表8-4<br />
# qqPlot() durbinWatsonTest() crPlots() ncvTest()<br />
# spreadLevelPlot() outlierTest() avPlots() influencePlot()<br />
# scatterplot() scatterplotMatrix() vif()</p>
# (The gvlma package is not used here.)
states <- as.data.frame(state.x77[, c("Murder", "Population",
                                      "Illiteracy", "Income", "Frost")])
library(car)
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
qqPlot(fit, labels = row.names(states), id.method = "identify",
       simulate = TRUE, main = "Q-Q Plot")
# id.method="identify" enables interactive point identification
# simulate=TRUE draws the 95% confidence envelope via a parametric
# bootstrap (bootstrapping is covered in chapter 12)
# Listing 8-6: residplot() draws a histogram of the studentized
# residuals of a fitted model and overlays a normal curve, a kernel
# density curve, and a rug plot.  Does not require the car package.
#
# fit     : an lm model object
# nbreaks : suggested number of histogram bins
residplot <- function(fit, nbreaks = 10) {
  z <- rstudent(fit)
  hist(z, breaks = nbreaks, freq = FALSE,
       xlab = "Studentized Residual",
       main = "Distribution of Errors")
  rug(jitter(z), col = "brown")
  curve(dnorm(x, mean = mean(z), sd = sd(z)),
        add = TRUE, col = "blue", lwd = 2)
  # Compute the density once; the original called density(z) twice
  # (separately for $x and $y).  lines() accepts the x/y list directly.
  dens <- density(z)
  lines(dens, col = "red", lwd = 2, lty = 2)
  legend("topright",
         legend = c("Normal Curve", "Kernel Density Curve"),
         lty = 1:2, col = c("blue", "red"), cex = .7)
}
residplot(fit)

# Test independence of errors
durbinWatsonTest(fit)
# p = 0.282 (non-significant): no autocorrelation, errors independent.
# NOTE(review): with the default simulate=TRUE the p-value is derived by
# bootstrap (see chapter 12) and so varies slightly between runs.

# Linearity
library(car)
crPlots(fit)

# Listing 8-7: testing homoscedasticity
library(car)
ncvTest(fit)
spreadLevelPlot(fit)

# Listing 8-8: global validation of linear model assumptions
install.packages("gvlma")
library(gvlma)
gvmodel <- gvlma(fit)
summary(gvmodel)

# Listing 8-9: detecting multicollinearity
library(car)
vif(fit)
sqrt(vif(fit)) > 2  # problem?
# Rule of thumb: sqrt(vif) > 2 indicates a multicollinearity problem.
<p># 8.4 异常观测值<br />
# 8.4.1 离群点<br />
# 8.4.2 高杠杆值点<br />
# 8.4.3 强影响点<br />
# ----------------------------------------------------------------------<br />
# Points outside the Q-Q plot's confidence envelope can be regarded as
# outliers; standardized residuals beyond +/-2 are also candidates.
install.packages("car")
library(car)
outlierTest(fit)
# High-leverage points: observations that are outliers with respect to
# the predictors.  A hat value greater than 2-3 times the average hat
# value (p/n) marks a high-leverage observation.
hat.plot <- function(fit) {
  n_par <- length(coefficients(fit))  # p: number of model parameters
  n_obs <- length(fitted(fit))        # n: number of observations
  plot(hatvalues(fit), main = "Index Plot of Hat Values")
  # reference lines at 2x and 3x the average hat value p/n
  abline(h = c(2, 3) * n_par / n_obs, col = "red", lty = 2)
  # interactive labelling of clicked points
  identify(seq_len(n_obs), hatvalues(fit), names(hatvalues(fit)))
}
hat.plot(fit)
# Rule of thumb: Cook's D > 4/(n-k-1) marks an influential observation,
# where n is the sample size and k the number of predictors.
# length(fit$coefficients) is k+1 (it includes the intercept), so
# 4/(n-k-1) equals 4/(nrow(states) - length(fit$coefficients)).
# NOTE(review): the original subtracted 2 more (giving 4/(n-k-3)),
# which does not match the stated formula.
cutoff <- 4 / (nrow(states) - length(fit$coefficients))
plot(fit, which = 4, cook.levels = cutoff)
abline(h = cutoff, lty = 2, col = "red")
library(car)
avPlots(fit, ask = FALSE, onepage = TRUE, id.method = "identify")
influencePlot(fit, id.method = "identify", main = "Influence Plot",
              sub = "Circle size is proportional to Cook's distance")
<p># 8.5 改进措施<br />
# 8.5.1 删除观测点<br />
# 8.5.2 变量转换<br />
# 8.5.3 增删变量<br />
# 8.5.4 尝试其他方法<br />
# ----------------------------------------------------------------<br />
# 删除观测点</p>
<p># 表8-5 常见的变换<br />
# -2 -1 -0.5 0 0.5 1 2<br />
# 若Y是比例数,通常使用logit变换[ln(Y/(1-Y))]</p>
# Listing 8-10: Box-Cox transformation to normality
library(car)
summary(powerTransform(states$Murder))
boxTidwell(Murder ~ Population + Illiteracy, data = states)
<p># 删除变量在处理多重共线性时是一种非常重要的方法</p>
<p># 8.6 选择“最佳”的回归模型<br />
# 8.6.1 模型比较<br />
# 8.6.2 变量选择<br />
# ---------------------------------------------------------------<br />
# Listing 8-11: comparing nested models with anova()
states <- as.data.frame(state.x77[, c("Murder", "Population",
                                      "Illiteracy", "Income", "Frost")])
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
fit2 <- lm(Murder ~ Population + Illiteracy, data = states)
anova(fit2, fit1)

# Listing 8-12: comparing models with AIC
# (anova() requires nested models; AIC does not)
AIC(fit1, fit2)

# Listing 8-13: backward stepwise selection
library(MASS)
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
stepAIC(fit1, direction = "backward")
# Listing 8-14: all-subsets regression
install.packages("leaps")
library(leaps)
leaps <- regsubsets(Murder ~ Population + Illiteracy + Income + Frost,
                    data = states, nbest = 4)
plot(leaps, scale = "adjr2")

library(car)
subsets(leaps, statistic = "cp",
        main = "Cp Plot for All Subsets Regression")
abline(1, 1, lty = 2, col = "red")
<p># 8.7 深层次分析<br />
# 8.7.1 交叉验证<br />
# 8.7.2 相对重要性<br />
# -----------------------------------------------------------------<br />
# 所谓交叉验证,即将一定比例的数据挑选出来作为训练样本<br />
# 另外的样本作保留样本,先在训练样本上获取回归<br />
# 然后在保留样本上做预测<br />
# 当n是观测总数目,k为n时,该方法又称作刀切法(jackknifing)<br />
# Listing 8-15: k-fold cross-validated R-square.
# Cross-validation holds out a portion of the data, fits on the
# training portion, and predicts the holdout; with k = n this is the
# jackknife (leave-one-out).  Reports how much R-square shrinks out of
# sample.
#
# fit : an lm object
# k   : number of folds
shrinkage <- function(fit, k = 10) {
  # Fail loudly if the bootstrap package is missing instead of the
  # silent FALSE that require() returns.
  if (!requireNamespace("bootstrap", quietly = TRUE)) {
    stop("shrinkage() needs the 'bootstrap' package", call. = FALSE)
  }

  theta.fit <- function(x, y) lsfit(x, y)
  theta.predict <- function(fit, x) cbind(1, x) %*% fit$coef

  # fit$model holds the response in column 1, predictors after it
  x <- fit$model[, 2:ncol(fit$model)]
  y <- fit$model[, 1]

  results <- bootstrap::crossval(x, y, theta.fit, theta.predict, ngroup = k)
  r2 <- cor(y, fit$fitted.values)^2     # in-sample R-square
  r2cv <- cor(y, results$cv.fit)^2      # cross-validated R-square
  cat("Original R-square =", r2, "\n")
  cat(k, "Fold Cross-Validated R-square =", r2cv, "\n")
  cat("Change =", r2 - r2cv, "\n")
}
fit1 <- lm(Murder ~ Population + Income + Illiteracy + Frost, data = states)
shrinkage(fit1)
fit2 <- lm(Murder ~ Population + Illiteracy, data = states)
shrinkage(fit2)

# Standardize all variables so the coefficients are directly comparable
zstates <- as.data.frame(scale(states))
zfit <- lm(Murder ~ Population + Income + Illiteracy + Frost, data = zstates)
coef(zfit)
# Listing 8-16: relweights() computes the relative importance of each
# predictor in a fitted lm model; the returned weights sum to 100
# (percent of R-square) and are plotted as a bar chart.
relweights <- function(fit, ...) {
  cor_mat <- cor(fit$model)
  nvar <- ncol(cor_mat)
  rxx <- cor_mat[2:nvar, 2:nvar]  # predictor intercorrelations
  rxy <- cor_mat[2:nvar, 1]       # predictor-response correlations
  # Eigen-decomposition of rxx (the original misleadingly named the
  # result 'svd' even though it comes from eigen()).
  eig <- eigen(rxx)
  lambda <- eig$vectors %*% diag(sqrt(eig$values)) %*% t(eig$vectors)
  lambdasq <- lambda^2
  beta <- solve(lambda) %*% rxy
  rsquare <- colSums(beta^2)
  rawwgt <- lambdasq %*% beta^2
  import <- (rawwgt / rsquare) * 100  # weights as % of R-square
  lbls <- names(fit$model[2:nvar])
  rownames(import) <- lbls
  colnames(import) <- "Weights"
  barplot(t(import), names.arg = lbls,
          ylab = "% of R-Square",
          xlab = "Predictor Variables",
          main = "Relative Importance of Predictor Variables",
          sub = paste("R-Square=", round(rsquare, digits = 3)),
          ...)
  return(import)
}
# Listing 8-17: applying relweights()
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
relweights(fit, col = "lightgrey")
<p># 8.8 小结<br />
# -------------------------------------------------------------<br />
</p>