install.packages(c("rpart", "ISLR","MASS","class","nnet","party", "randomForest", "e1071", "rpart.plot","ROCR") 
install.packages(rattle, dependencies = c("Depends", "Suggests"))                 
setwd("E:\\mywork\\study_R\\R_sim_2016\\R_multistat_5\\")
rm(list = ls(all = TRUE)) 
options(digits = 5  )

分类问题

经常需要基于一组预测变量预测一个分类结果,如:

  • 根据个人信息和财务历史记录预测其是否会还贷;
  • 根据重症病人的症状和生命体征判断其是否为心脏病发作;
  • 根据关键词、图像、超文本、主题栏、来源等判别一封邮件是否为病毒邮件。

上述例子的共同点是根据一组预测变量(或特征)来预测相对应的二分类结果(无信用风 险/有信用风险,心脏病发作/心脏病未发作,是病毒邮件/不是病毒邮件),目的是通过某种方法 实现对新出现单元的准确分类。

数据准备

数据集1

数据是威斯康星州乳腺癌数据集, 可在UCI机器学习数据库。数据集包含699个细针抽吸活检的样本单元,其中458个(65.5%)为良性样本单元,241个(34.5%)为恶性样本单元。数据集中共有11个变量:

  • ID
  • 肿块厚度
  • 细胞大小的均匀性
  • 细胞形状的均匀性
  • 边际附着力
  • 单个上皮细胞大小
  • 裸核
  • 乏味染色体
  • 正常核
  • 有丝分裂
  • 类别

第一个变量ID不纳入数据分析,最后一个变量(类别)即输出变量(编码为良性=2,恶性=4)。另外九个变量是与判别恶性肿瘤相关的细胞特征,并且得到了记录。 这些细胞特征得分为1(最接近良性)至10(最接近病变)之间的整数。任一变量都不能单独作 为判别良性或恶性的标准,建模的目的是找到九个细胞特征的某种组合,从而实现对恶性肿瘤的 准确预测。Mangasarian和Wolberg在其1990年的文章中详细探讨了这个数据集。

这里下载,然后运行下列语句加载

load("cl_data1.Rda")
names(cl_data1) <- c("ID", "clumpThickness", "sizeUniformity", "shapeUniformity", "maginalAdhesion",  "singleEpithelialCellSize", "bareNuclei",  "blandChromatin", "normalNucleoli", "mitosis", "class")
dim(cl_data1)
## [1] 699  11
head(cl_data1)
##        ID clumpThickness sizeUniformity shapeUniformity maginalAdhesion
## 1 1000025              5              1               1               1
## 2 1002945              5              4               4               5
## 3 1015425              3              1               1               1
## 4 1016277              6              8               8               1
## 5 1017023              4              1               1               3
## 6 1017122              8             10              10               8
##   singleEpithelialCellSize bareNuclei blandChromatin normalNucleoli
## 1                        2          1              3              1
## 2                        7         10              3              2
## 3                        2          2              3              1
## 4                        3          4              3              7
## 5                        2          1              3              1
## 6                        7         10              9              7
##   mitosis class
## 1       1     2
## 2       1     2
## 3       1     2
## 4       1     2
## 5       1     2
## 6       1     4
sum(is.na(cl_data1))## 查看缺失的观测数目
## [1] 16
cl_data1<- na.omit(cl_data1)## 删除有缺失的观测
sum(is.na(cl_data1))
## [1] 0

数据集2

数据来自标普500在2001年和2005年的1250个观测,每天一个观测 利用前四天的涨跌情况来预测明天的涨跌。

这里下载,然后运行下列语句加载

load("cl_data2.Rda")
dim(cl_data2)
## [1] 1250    9
head(cl_data2)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
## 1 2001  0.381 -0.192 -2.624 -1.055  5.010 1.1913  0.959        Up
## 2 2001  0.959  0.381 -0.192 -2.624 -1.055 1.2965  1.032        Up
## 3 2001  1.032  0.959  0.381 -0.192 -2.624 1.4112 -0.623      Down
## 4 2001 -0.623  1.032  0.959  0.381 -0.192 1.2760  0.614        Up
## 5 2001  0.614 -0.623  1.032  0.959  0.381 1.2057  0.213        Up
## 6 2001  0.213  0.614 -0.623  1.032  0.959 1.3491  1.392        Up
sum(is.na(cl_data2))## 查看缺失的观测数目
## [1] 0

利用前四天的涨跌情况(Lag1-Lag4)来预测明天的涨跌(Direction)。

下面,第一个数据集用来讲解,第二个数据集用来自己练习。

训练集和测试集合

随机选择百分之七十的样本作为训练集合(df.train),其余作为测试集合(df.validate)。

df <- cl_data1[-1]
df$class <- factor(df$class, levels=c(2,4), labels=c("benign", "malignant"))

set.seed(12121)
train <- sample(nrow(df), 0.7*nrow(df))
df.train <- df[train,]
df.validate <- df[-train,]
table(df.train$class)
## 
##    benign malignant 
##       303       175
table(df.validate$class)
## 
##    benign malignant 
##       141        64
plot(df[,1:3],col=df[,10])

Logistic 回归

模型拟合

fit.logit <- glm(class~., data=df.train, family=binomial())
summary(fit.logit)
## 
## Call:
## glm(formula = class ~ ., family = binomial(), data = df.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9673  -0.1225  -0.0558   0.0201   2.3795  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -10.42011    1.52247   -6.84  7.7e-12 ***
## clumpThickness             0.63507    0.18646    3.41  0.00066 ***
## sizeUniformity            -0.08317    0.24688   -0.34  0.73621    
## shapeUniformity            0.28541    0.27182    1.05  0.29371    
## maginalAdhesion            0.41158    0.13795    2.98  0.00285 ** 
## singleEpithelialCellSize  -0.00689    0.18101   -0.04  0.96965    
## bareNuclei                 0.35183    0.11312    3.11  0.00187 ** 
## blandChromatin             0.50870    0.22781    2.23  0.02555 *  
## normalNucleoli             0.30147    0.14432    2.09  0.03672 *  
## mitosis                    0.60138    0.36760    1.64  0.10184    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 627.951  on 477  degrees of freedom
## Residual deviance:  76.752  on 468  degrees of freedom
## AIC: 96.75
## 
## Number of Fisher Scoring iterations: 8

系数的OR值以及95% CI:

exp(cbind(OR = coef(fit.logit), confint(fit.logit)))
##                                  OR      2.5 %     97.5 %
## (Intercept)              2.9827e-05 8.5072e-07 0.00037483
## clumpThickness           1.8871e+00 1.3547e+00 2.84407135
## sizeUniformity           9.2020e-01 5.8305e-01 1.55776520
## shapeUniformity          1.3303e+00 7.6003e-01 2.24849245
## maginalAdhesion          1.5092e+00 1.1671e+00 2.03089522
## singleEpithelialCellSize 9.9314e-01 6.8338e-01 1.40512974
## bareNuclei               1.4217e+00 1.1504e+00 1.80081484
## blandChromatin           1.6631e+00 1.0954e+00 2.70336142
## normalNucleoli           1.3518e+00 1.0311e+00 1.82702987
## mitosis                  1.8246e+00 9.7315e-01 3.40449059

代入模型到测试集合中

prob <- predict(fit.logit, df.validate, type="response")
logit.pred <- factor(prob > .5, levels=c(FALSE, TRUE), labels=c("benign", "malignant"))

结果:

table(df.validate$class, logit.pred, dnn=c("Actual", "Predicted"))
##            Predicted
## Actual      benign malignant
##   benign       139         2
##   malignant      1        63
mean(df.validate$class==logit.pred)
## [1] 0.98537

ROC 曲线分析

library(ROCR)
p <- predict(fit.logit, df.validate, type="response")
pr <- prediction(p, df.validate$class )
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.99579

线性判别分析

一次线性判别函数

library(MASS)
table(df.train[,10])
## 
##    benign malignant 
##       303       175
## Linear Discriminant Analysis
lda.fit=lda(class~., data=df.train)
lda.fit
## Call:
## lda(class ~ ., data = df.train)
## 
## Prior probabilities of groups:
##    benign malignant 
##   0.63389   0.36611 
## 
## Group means:
##           clumpThickness sizeUniformity shapeUniformity maginalAdhesion
## benign            2.9175         1.3432          1.4554          1.3531
## malignant         7.1029         6.6343          6.5429          5.5600
##           singleEpithelialCellSize bareNuclei blandChromatin
## benign                      2.1023     1.3993         2.0726
## malignant                   5.3771     7.6571         5.7829
##           normalNucleoli mitosis
## benign            1.2772  1.0726
## malignant         5.5829  2.7486
## 
## Coefficients of linear discriminants:
##                               LD1
## clumpThickness           0.178674
## sizeUniformity           0.143376
## shapeUniformity          0.048864
## maginalAdhesion          0.075902
## singleEpithelialCellSize 0.055419
## bareNuclei               0.248243
## blandChromatin           0.099061
## normalNucleoli           0.100785
## mitosis                  0.022052
plot(lda.fit)

t(t(lda.fit$scaling))
##                               LD1
## clumpThickness           0.178674
## sizeUniformity           0.143376
## shapeUniformity          0.048864
## maginalAdhesion          0.075902
## singleEpithelialCellSize 0.055419
## bareNuclei               0.248243
## blandChromatin           0.099061
## normalNucleoli           0.100785
## mitosis                  0.022052
#t(t(lda.fit$scaling)%*% t(df.train[1:10,1:9]))
#df.train[1:10,10]

lda.pred=predict(lda.fit,df.validate)

table(df.validate$class)
## 
##    benign malignant 
##       141        64
table(lda.pred$class)
## 
##    benign malignant 
##       144        61
table(lda.pred$class,df.validate$class,dnn=c("Predicted","Actual" ))
##            Actual
## Predicted   benign malignant
##   benign       140         4
##   malignant      1        60
mean(lda.pred$class==df.validate$class)
## [1] 0.97561

二次判别函数

## Linear Discriminant Analysis
qda.fit=qda(class~., data=df.train)
qda.fit
## Call:
## qda(class ~ ., data = df.train)
## 
## Prior probabilities of groups:
##    benign malignant 
##   0.63389   0.36611 
## 
## Group means:
##           clumpThickness sizeUniformity shapeUniformity maginalAdhesion
## benign            2.9175         1.3432          1.4554          1.3531
## malignant         7.1029         6.6343          6.5429          5.5600
##           singleEpithelialCellSize bareNuclei blandChromatin
## benign                      2.1023     1.3993         2.0726
## malignant                   5.3771     7.6571         5.7829
##           normalNucleoli mitosis
## benign            1.2772  1.0726
## malignant         5.5829  2.7486
qda.pred=predict(qda.fit,df.validate)

table(df.validate$class)
## 
##    benign malignant 
##       141        64
table(qda.pred$class)
## 
##    benign malignant 
##       133        72
table(qda.pred$class,df.validate$class)
##            
##             benign malignant
##   benign       133         0
##   malignant      8        64
mean(qda.pred$class==df.validate$class)
## [1] 0.96098

决策树

library(rpart)
set.seed(1234)
dtree <- rpart(class ~ ., data=df.train, method="class", parms=list(split="information"))
dtree$cptable
##         CP nsplit rel error  xerror     xstd
## 1 0.794286      0   1.00000 1.00000 0.060185
## 2 0.051429      1   0.20571 0.20571 0.032969
## 3 0.010000      3   0.10286 0.10286 0.023783
#plotcp(dtree)
## 剪枝
dtree.pruned <- prune(dtree, cp=.0125) 

##画图

library(rpart.plot)
prp(dtree.pruned, type = 2, extra = 104,  
    fallen.leaves = TRUE, main="Decision Tree")

##评估
dtree.pred <- predict(dtree.pruned, df.validate, type="class")
 table(df.validate$class, dtree.pred,  dnn=c("Actual", "Predicted"))
##            Predicted
## Actual      benign malignant
##   benign       136         5
##   malignant      5        59
mean(df.validate$class==dtree.pred)
## [1] 0.95122

随机森林

library(randomForest)
## Warning: package 'randomForest' was built under R version 3.3.2
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1234)
fit.forest <- randomForest(class~., data=df.train, na.action=na.roughfix,importance=TRUE)             
fit.forest
## 
## Call:
##  randomForest(formula = class ~ ., data = df.train, importance = TRUE,      na.action = na.roughfix) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 2.93%
## Confusion matrix:
##           benign malignant class.error
## benign       294         9    0.029703
## malignant      5       170    0.028571
importance(fit.forest, type=2)                          
##                          MeanDecreaseGini
## clumpThickness                    11.0089
## sizeUniformity                    62.5302
## shapeUniformity                   44.6239
## maginalAdhesion                    6.6720
## singleEpithelialCellSize          15.3383
## bareNuclei                        42.4534
## blandChromatin                    24.3937
## normalNucleoli                    12.2556
## mitosis                            1.7556
forest.pred <- predict(fit.forest, df.validate)         
table(df.validate$class, forest.pred, dnn=c("Actual", "Predicted"))
##            Predicted
## Actual      benign malignant
##   benign       138         3
##   malignant      1        63
mean(df.validate$class==forest.pred)
## [1] 0.98049

支持向量机

library(e1071)
## Warning: package 'e1071' was built under R version 3.3.2
set.seed(1234)
fit.svm <- svm(class~., data=df.train)
fit.svm
## 
## Call:
## svm(formula = class ~ ., data = df.train)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.11111 
## 
## Number of Support Vectors:  78
svm.pred <- predict(fit.svm, df.validate)
svm.perf <- table(na.omit(df.validate)$class, 
                  svm.pred, dnn=c("Actual", "Predicted"))
svm.perf
##            Predicted
## Actual      benign malignant
##   benign       137         4
##   malignant      0        64
mean(df.validate$class==svm.pred)
## [1] 0.98049

神经网络

使用一个中间层三个节点的神经网络

library(nnet)
set.seed(123)
model_nnet <- nnet(class~., data=df.train,size =3, rang = 0.3,decay = 5e-4, maxit = 200)
## # weights:  34
## initial  value 392.577083 
## iter  10 value 100.438587
## iter  20 value 41.006502
## iter  30 value 29.004519
## iter  40 value 24.210156
## iter  50 value 22.133921
## iter  60 value 21.671345
## iter  70 value 21.488664
## iter  80 value 21.258612
## iter  90 value 20.937896
## iter 100 value 20.715429
## iter 110 value 20.016485
## iter 120 value 19.742480
## iter 130 value 19.461509
## iter 140 value 19.382120
## iter 150 value 19.370388
## iter 160 value 18.519927
## iter 170 value 18.099577
## iter 180 value 17.891451
## iter 190 value 17.607152
## iter 200 value 17.359475
## final  value 17.359475 
## stopped after 200 iterations
summary(model_nnet)
## a 9-3-1 network with 34 weights
## options were - entropy fitting  decay=5e-04
##  b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 i8->h1 i9->h1 
##  33.01  -3.68   5.50  -1.72  -3.02  -1.48   0.81  -6.38  -2.20  -2.76 
##  b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 i8->h2 i9->h2 
##  10.17  -0.99  -3.73  -1.10   2.94   6.71  -1.95  -4.59  -2.97  -3.99 
##  b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3 i7->h3 i8->h3 i9->h3 
##  25.38  -0.90  -8.14   1.13  -3.40   2.89  -6.49   5.95  -2.24  -0.12 
##   b->o  h1->o  h2->o  h3->o 
##   4.61 -17.35  -3.88 -15.66
pred <- predict(model_nnet,df.validate)
prednn <- ifelse(pred<0.5,"benign", "malignant")
table(prednn)
## prednn
##    benign malignant 
##       141        64
table(df.validate$class)
## 
##    benign malignant 
##       141        64
table(df.validate$class,prednn)
##            prednn
##             benign malignant
##   benign       138         3
##   malignant      3        61
mean(prednn==df.validate$class)
## [1] 0.97073