install.packages(c("car", "gvlma", "MASS", "leaps"))
par(ask=F)
opar <- par(no.readonly=TRUE)

简单线性回归

\[ {y}=a+b\times x+\epsilon \]

?women
#- Simple linear regression
fit <- lm(weight ~ height, data=women)
summary(fit)
## 
## Call:
## lm(formula = weight ~ height, data = women)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7333 -1.1333 -0.3833  0.7417  3.1167 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -87.51667    5.93694  -14.74 1.71e-09 ***
## height        3.45000    0.09114   37.85 1.09e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared:  0.991,  Adjusted R-squared:  0.9903 
## F-statistic:  1433 on 1 and 13 DF,  p-value: 1.091e-14
women$weight
##  [1] 115 117 120 123 126 129 132 135 139 142 146 150 154 159 164
fitted(fit)
##        1        2        3        4        5        6        7        8 
## 112.5833 116.0333 119.4833 122.9333 126.3833 129.8333 133.2833 136.7333 
##        9       10       11       12       13       14       15 
## 140.1833 143.6333 147.0833 150.5333 153.9833 157.4333 160.8833
residuals(fit)
##           1           2           3           4           5           6 
##  2.41666667  0.96666667  0.51666667  0.06666667 -0.38333333 -0.83333333 
##           7           8           9          10          11          12 
## -1.28333333 -1.73333333 -1.18333333 -1.63333333 -1.08333333 -0.53333333 
##          13          14          15 
##  0.01666667  1.56666667  3.11666667
plot(women$height,women$weight,
     main="Women Age 30-39", 
     xlab="Height (in inches)", 
     ylab="Weight (in pounds)")
# add the line of best fit
abline(fit)

多项式回归

# Polynomial regression
fit2 <- lm(weight ~ height + I(height^2), data=women)
summary(fit2)
## 
## Call:
## lm(formula = weight ~ height + I(height^2), data = women)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50941 -0.29611 -0.00941  0.28615  0.59706 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 261.87818   25.19677  10.393 2.36e-07 ***
## height       -7.34832    0.77769  -9.449 6.58e-07 ***
## I(height^2)   0.08306    0.00598  13.891 9.32e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3841 on 12 degrees of freedom
## Multiple R-squared:  0.9995, Adjusted R-squared:  0.9994 
## F-statistic: 1.139e+04 on 2 and 12 DF,  p-value: < 2.2e-16
plot(women$height,women$weight,
     main="Women Age 30-39",
     xlab="Height (in inches)",
     ylab="Weight (in lbs)")
fitted(fit2)
##        1        2        3        4        5        6        7        8 
## 115.1029 117.4731 120.0094 122.7118 125.5804 128.6151 131.8159 135.1828 
##        9       10       11       12       13       14       15 
## 138.7159 142.4151 146.2804 150.3118 154.5094 158.8731 163.4029
lines(women$height,fitted(fit2))

# Enhanced scatterplot for women data
library(car)
scatterplot(weight ~ height, data=women,
            spread=FALSE, smoother.args=list(lty=2), pch=19,
            main="Women Age 30-39",
            xlab="Height (inches)",
            ylab="Weight (lbs.)")

?state.x77
#  Examining bivariate relationships
states <- as.data.frame(state.x77[,c("Murder", "Population", 
                                     "Illiteracy", "Income", "Frost")])
cor(states)
##                Murder Population Illiteracy     Income      Frost
## Murder      1.0000000  0.3436428  0.7029752 -0.2300776 -0.5388834
## Population  0.3436428  1.0000000  0.1076224  0.2082276 -0.3321525
## Illiteracy  0.7029752  0.1076224  1.0000000 -0.4370752 -0.6719470
## Income     -0.2300776  0.2082276 -0.4370752  1.0000000  0.2262822
## Frost      -0.5388834 -0.3321525 -0.6719470  0.2262822  1.0000000
library(car)
scatterplotMatrix(states, spread=FALSE, smoother.args=list(lty=2),
                  main="Scatter Plot Matrix")

多重线性回归

#  Multiple linear regression
states <- as.data.frame(state.x77[,c("Murder", "Population", 
                                     "Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
summary(fit)
## 
## Call:
## lm(formula = Murder ~ Population + Illiteracy + Income + Frost, 
##     data = states)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7960 -1.6495 -0.0811  1.4815  7.6210 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.235e+00  3.866e+00   0.319   0.7510    
## Population  2.237e-04  9.052e-05   2.471   0.0173 *  
## Illiteracy  4.143e+00  8.744e-01   4.738 2.19e-05 ***
## Income      6.442e-05  6.837e-04   0.094   0.9253    
## Frost       5.813e-04  1.005e-02   0.058   0.9541    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.535 on 45 degrees of freedom
## Multiple R-squared:  0.567,  Adjusted R-squared:  0.5285 
## F-statistic: 14.73 on 4 and 45 DF,  p-value: 9.133e-08
# - Mutiple linear regression with a significant interaction term
fit <- lm(mpg ~ hp + wt + hp:wt, data=mtcars)
summary(fit)
## 
## Call:
## lm(formula = mpg ~ hp + wt + hp:wt, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0632 -1.6491 -0.7362  1.4211  4.5513 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 49.80842    3.60516  13.816 5.01e-14 ***
## hp          -0.12010    0.02470  -4.863 4.04e-05 ***
## wt          -8.21662    1.26971  -6.471 5.20e-07 ***
## hp:wt        0.02785    0.00742   3.753 0.000811 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.153 on 28 degrees of freedom
## Multiple R-squared:  0.8848, Adjusted R-squared:  0.8724 
## F-statistic: 71.66 on 3 and 28 DF,  p-value: 2.981e-13
library(effects)
## 
## Attaching package: 'effects'
## The following object is masked from 'package:car':
## 
##     Prestige
plot(effect("hp:wt", fit,, list(wt=c(2.2, 3.2, 4.2))), multiline=TRUE)

回归模型的类型

回归模型的模型表达

回归常用的函数

回归诊断(图方法)

原始的诊断

# simple regression diagnostics
fit <- lm(weight ~ height, data=women)
par(mfrow=c(2,2))
plot(fit)

newfit <- lm(weight ~ height + I(height^2), data=women)
par(opar)
par(mfrow=c(2,2))
plot(newfit)

par(opar)

car

car包提供的方法:

模型拟合:

library(car)
states <- as.data.frame(state.x77[,c("Murder", "Population",
                                     "Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)

评估线性(一)

qqPlot(fit, labels=row.names(states), id.method="identify",
       simulate=TRUE, main="Q-Q Plot")

评估线性(二)

crPlots(fit)

评估正态性

residplot <- function(fit, nbreaks=10) {
  z <- rstudent(fit)
  hist(z, breaks=nbreaks, freq=FALSE,
       xlab="Studentized Residual",
       main="Distribution of Errors")
  rug(jitter(z), col="brown")
  curve(dnorm(x, mean=mean(z), sd=sd(z)),
        add=TRUE, col="blue", lwd=2)
  lines(density(z)$x, density(z)$y,
        col="red", lwd=2, lty=2)
  legend("topright",
         legend = c( "Normal Curve", "Kernel Density Curve"),
         lty=1:2, col=c("blue","red"), cex=.7)
}

residplot(fit)

评估方差齐性

ncvTest(fit)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 1.746514    Df = 1     p = 0.1863156
spreadLevelPlot(fit)

## 
## Suggested power transformation:  1.209626

共线性诊断

计算方差膨胀因子

vif(fit) 
## Population Illiteracy     Income      Frost 
##   1.245282   2.165848   1.345822   2.082547
sqrt(vif(fit)) > 2 
## Population Illiteracy     Income      Frost 
##      FALSE      FALSE      FALSE      FALSE

离群点诊断

outlierTest(fit)
##        rstudent unadjusted p-value Bonferonni p
## Nevada 3.542929         0.00095088     0.047544

高杠杆点诊断

hat.plot <- function(fit) {
  p <- length(coefficients(fit))
  n <- length(fitted(fit))
  plot(hatvalues(fit), main="Index Plot of Hat Values")
  abline(h=c(2,3)*p/n, col="red", lty=2)
  identify(1:n, hatvalues(fit), names(hatvalues(fit)))
}
hat.plot(fit)

## integer(0)

找出高杆杆值:

tail(sort(hatvalues(fit)))
##     Oregon Washington     Hawaii   New York California     Alaska 
##  0.1905763  0.2166290  0.2244359  0.2329864  0.3408763  0.4324732

模型改进

  • 删除观测点
  • 变量变换
  • 添加或删除变量
  • 使用其他回归方法

变量变换

Box-Cox变换:

summary(powerTransform(states$Murder))
## bcPower Transformation to Normality 
## 
##               Est.Power Std.Err. Wald Lower Bound Wald Upper Bound
## states$Murder    0.6055   0.2639           0.0884           1.1227
## 
## Likelihood ratio tests about transformation parameters
##                            LRT df       pval
## LR test, lambda = (0) 5.665991  1 0.01729694
## LR test, lambda = (1) 2.122763  1 0.14512456
boxTidwell(Murder~Population+Illiteracy,data=states)
##            Score Statistic   p-value MLE of lambda
## Population      -0.3228003 0.7468465     0.8693882
## Illiteracy       0.6193814 0.5356651     1.3581188
## 
## iterations =  19

变量添加

avPlots(fit, ask=FALSE, id.method="identify")

模型选择

方差分析做模型选择

states <- as.data.frame(state.x77[,c("Murder", "Population",
                                     "Illiteracy", "Income", "Frost")])
fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost,
           data=states)
fit2 <- lm(Murder ~ Population + Illiteracy, data=states)
anova(fit2, fit1)
## Analysis of Variance Table
## 
## Model 1: Murder ~ Population + Illiteracy
## Model 2: Murder ~ Population + Illiteracy + Income + Frost
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     47 289.25                           
## 2     45 289.17  2  0.078505 0.0061 0.9939

AIC做模型选择

fit1 <- lm(Murder ~ Population + Illiteracy + Income + Frost,
           data=states)
fit2 <- lm(Murder ~ Population + Illiteracy, data=states)
AIC(fit1,fit2)
##      df      AIC
## fit1  6 241.6429
## fit2  4 237.6565

AIC值较小的模型要优先选择.

返回课程主页