install.packages(c("rpart", "ISLR","MASS","class","nnet","party", "randomForest", "e1071", "rpart.plot","ROCR")
install.packages(rattle, dependencies = c("Depends", "Suggests"))
setwd("E:\\mywork\\study_R\\R_sim_2016\\R_multistat_5\\")
rm(list = ls(all = TRUE))
options(digits = 5 )
经常需要基于一组预测变量预测一个分类结果,如:
上述例子的共同点是根据一组预测变量(或特征)来预测相对应的二分类结果(无信用风 险/有信用风险,心脏病发作/心脏病未发作,是病毒邮件/不是病毒邮件),目的是通过某种方法 实现对新出现单元的准确分类。
数据是威斯康星州乳腺癌数据集, 可在UCI机器学习数据库。数据集包含699个细针抽吸活检的样本单元,其中458个(65.5%)为良性样本单元,241个(34.5%)为恶性样本单元。数据集中共有11个变量:
第一个变量ID不纳入数据分析,最后一个变量(类别)即输出变量(编码为良性=2,恶性=4)。另外九个变量是与判别恶性肿瘤相关的细胞特征,并且得到了记录。 这些细胞特征得分为1(最接近良性)至10(最接近病变)之间的整数。任一变量都不能单独作 为判别良性或恶性的标准,建模的目的是找到九个细胞特征的某种组合,从而实现对恶性肿瘤的 准确预测。Mangasarian和Wolberg在其1990年的文章中详细探讨了这个数据集。
从这里下载,然后运行下列语句加载
load("cl_data1.Rda")
names(cl_data1) <- c("ID", "clumpThickness", "sizeUniformity", "shapeUniformity", "maginalAdhesion", "singleEpithelialCellSize", "bareNuclei", "blandChromatin", "normalNucleoli", "mitosis", "class")
dim(cl_data1)
## [1] 699 11
head(cl_data1)
## ID clumpThickness sizeUniformity shapeUniformity maginalAdhesion
## 1 1000025 5 1 1 1
## 2 1002945 5 4 4 5
## 3 1015425 3 1 1 1
## 4 1016277 6 8 8 1
## 5 1017023 4 1 1 3
## 6 1017122 8 10 10 8
## singleEpithelialCellSize bareNuclei blandChromatin normalNucleoli
## 1 2 1 3 1
## 2 7 10 3 2
## 3 2 2 3 1
## 4 3 4 3 7
## 5 2 1 3 1
## 6 7 10 9 7
## mitosis class
## 1 1 2
## 2 1 2
## 3 1 2
## 4 1 2
## 5 1 2
## 6 1 4
sum(is.na(cl_data1))## 查看缺失的观测数目
## [1] 16
cl_data1<- na.omit(cl_data1)## 删除有缺失的观测
sum(is.na(cl_data1))
## [1] 0
数据来自标普500在2001年和2005年的1250个观测,每天一个观测 利用前四天的涨跌情况来预测明天的涨跌。
从这里下载,然后运行下列语句加载
load("cl_data2.Rda")
dim(cl_data2)
## [1] 1250 9
head(cl_data2)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 2001 0.381 -0.192 -2.624 -1.055 5.010 1.1913 0.959 Up
## 2 2001 0.959 0.381 -0.192 -2.624 -1.055 1.2965 1.032 Up
## 3 2001 1.032 0.959 0.381 -0.192 -2.624 1.4112 -0.623 Down
## 4 2001 -0.623 1.032 0.959 0.381 -0.192 1.2760 0.614 Up
## 5 2001 0.614 -0.623 1.032 0.959 0.381 1.2057 0.213 Up
## 6 2001 0.213 0.614 -0.623 1.032 0.959 1.3491 1.392 Up
sum(is.na(cl_data2))## 查看缺失的观测数目
## [1] 0
利用前四天的涨跌情况(Lag1-Lag4)来预测明天的涨跌(Direction)。
下面,第一个数据集用来讲解,第二个数据集用来自己练习。
随机选择百分之七十的样本作为训练集合(df.train),其余作为测试集合(df.validate)。
df <- cl_data1[-1]
df$class <- factor(df$class, levels=c(2,4), labels=c("benign", "malignant"))
set.seed(12121)
train <- sample(nrow(df), 0.7*nrow(df))
df.train <- df[train,]
df.validate <- df[-train,]
table(df.train$class)
##
## benign malignant
## 303 175
table(df.validate$class)
##
## benign malignant
## 141 64
plot(df[,1:3],col=df[,10])
fit.logit <- glm(class~., data=df.train, family=binomial())
summary(fit.logit)
##
## Call:
## glm(formula = class ~ ., family = binomial(), data = df.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9673 -0.1225 -0.0558 0.0201 2.3795
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.42011 1.52247 -6.84 7.7e-12 ***
## clumpThickness 0.63507 0.18646 3.41 0.00066 ***
## sizeUniformity -0.08317 0.24688 -0.34 0.73621
## shapeUniformity 0.28541 0.27182 1.05 0.29371
## maginalAdhesion 0.41158 0.13795 2.98 0.00285 **
## singleEpithelialCellSize -0.00689 0.18101 -0.04 0.96965
## bareNuclei 0.35183 0.11312 3.11 0.00187 **
## blandChromatin 0.50870 0.22781 2.23 0.02555 *
## normalNucleoli 0.30147 0.14432 2.09 0.03672 *
## mitosis 0.60138 0.36760 1.64 0.10184
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 627.951 on 477 degrees of freedom
## Residual deviance: 76.752 on 468 degrees of freedom
## AIC: 96.75
##
## Number of Fisher Scoring iterations: 8
系数的OR值以及95% CI:
exp(cbind(OR = coef(fit.logit), confint(fit.logit)))
## OR 2.5 % 97.5 %
## (Intercept) 2.9827e-05 8.5072e-07 0.00037483
## clumpThickness 1.8871e+00 1.3547e+00 2.84407135
## sizeUniformity 9.2020e-01 5.8305e-01 1.55776520
## shapeUniformity 1.3303e+00 7.6003e-01 2.24849245
## maginalAdhesion 1.5092e+00 1.1671e+00 2.03089522
## singleEpithelialCellSize 9.9314e-01 6.8338e-01 1.40512974
## bareNuclei 1.4217e+00 1.1504e+00 1.80081484
## blandChromatin 1.6631e+00 1.0954e+00 2.70336142
## normalNucleoli 1.3518e+00 1.0311e+00 1.82702987
## mitosis 1.8246e+00 9.7315e-01 3.40449059
代入模型到测试集合中
prob <- predict(fit.logit, df.validate, type="response")
logit.pred <- factor(prob > .5, levels=c(FALSE, TRUE), labels=c("benign", "malignant"))
结果:
table(df.validate$class, logit.pred, dnn=c("Actual", "Predicted"))
## Predicted
## Actual benign malignant
## benign 139 2
## malignant 1 63
mean(df.validate$class==logit.pred)
## [1] 0.98537
library(ROCR)
p <- predict(fit.logit, df.validate, type="response")
pr <- prediction(p, df.validate$class )
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.99579
library(MASS)
table(df.train[,10])
##
## benign malignant
## 303 175
## Linear Discriminant Analysis
lda.fit=lda(class~., data=df.train)
lda.fit
## Call:
## lda(class ~ ., data = df.train)
##
## Prior probabilities of groups:
## benign malignant
## 0.63389 0.36611
##
## Group means:
## clumpThickness sizeUniformity shapeUniformity maginalAdhesion
## benign 2.9175 1.3432 1.4554 1.3531
## malignant 7.1029 6.6343 6.5429 5.5600
## singleEpithelialCellSize bareNuclei blandChromatin
## benign 2.1023 1.3993 2.0726
## malignant 5.3771 7.6571 5.7829
## normalNucleoli mitosis
## benign 1.2772 1.0726
## malignant 5.5829 2.7486
##
## Coefficients of linear discriminants:
## LD1
## clumpThickness 0.178674
## sizeUniformity 0.143376
## shapeUniformity 0.048864
## maginalAdhesion 0.075902
## singleEpithelialCellSize 0.055419
## bareNuclei 0.248243
## blandChromatin 0.099061
## normalNucleoli 0.100785
## mitosis 0.022052
plot(lda.fit)
t(t(lda.fit$scaling))
## LD1
## clumpThickness 0.178674
## sizeUniformity 0.143376
## shapeUniformity 0.048864
## maginalAdhesion 0.075902
## singleEpithelialCellSize 0.055419
## bareNuclei 0.248243
## blandChromatin 0.099061
## normalNucleoli 0.100785
## mitosis 0.022052
#t(t(lda.fit$scaling)%*% t(df.train[1:10,1:9]))
#df.train[1:10,10]
lda.pred=predict(lda.fit,df.validate)
table(df.validate$class)
##
## benign malignant
## 141 64
table(lda.pred$class)
##
## benign malignant
## 144 61
table(lda.pred$class,df.validate$class,dnn=c("Predicted","Actual" ))
## Actual
## Predicted benign malignant
## benign 140 4
## malignant 1 60
mean(lda.pred$class==df.validate$class)
## [1] 0.97561
## Linear Discriminant Analysis
qda.fit=qda(class~., data=df.train)
qda.fit
## Call:
## qda(class ~ ., data = df.train)
##
## Prior probabilities of groups:
## benign malignant
## 0.63389 0.36611
##
## Group means:
## clumpThickness sizeUniformity shapeUniformity maginalAdhesion
## benign 2.9175 1.3432 1.4554 1.3531
## malignant 7.1029 6.6343 6.5429 5.5600
## singleEpithelialCellSize bareNuclei blandChromatin
## benign 2.1023 1.3993 2.0726
## malignant 5.3771 7.6571 5.7829
## normalNucleoli mitosis
## benign 1.2772 1.0726
## malignant 5.5829 2.7486
qda.pred=predict(qda.fit,df.validate)
table(df.validate$class)
##
## benign malignant
## 141 64
table(qda.pred$class)
##
## benign malignant
## 133 72
table(qda.pred$class,df.validate$class)
##
## benign malignant
## benign 133 0
## malignant 8 64
mean(qda.pred$class==df.validate$class)
## [1] 0.96098
library(rpart)
set.seed(1234)
dtree <- rpart(class ~ ., data=df.train, method="class", parms=list(split="information"))
dtree$cptable
## CP nsplit rel error xerror xstd
## 1 0.794286 0 1.00000 1.00000 0.060185
## 2 0.051429 1 0.20571 0.20571 0.032969
## 3 0.010000 3 0.10286 0.10286 0.023783
#plotcp(dtree)
## 剪枝
dtree.pruned <- prune(dtree, cp=.0125)
##画图
library(rpart.plot)
prp(dtree.pruned, type = 2, extra = 104,
fallen.leaves = TRUE, main="Decision Tree")
##评估
dtree.pred <- predict(dtree.pruned, df.validate, type="class")
table(df.validate$class, dtree.pred, dnn=c("Actual", "Predicted"))
## Predicted
## Actual benign malignant
## benign 136 5
## malignant 5 59
mean(df.validate$class==dtree.pred)
## [1] 0.95122
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.3.2
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
set.seed(1234)
fit.forest <- randomForest(class~., data=df.train, na.action=na.roughfix,importance=TRUE)
fit.forest
##
## Call:
## randomForest(formula = class ~ ., data = df.train, importance = TRUE, na.action = na.roughfix)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 2.93%
## Confusion matrix:
## benign malignant class.error
## benign 294 9 0.029703
## malignant 5 170 0.028571
importance(fit.forest, type=2)
## MeanDecreaseGini
## clumpThickness 11.0089
## sizeUniformity 62.5302
## shapeUniformity 44.6239
## maginalAdhesion 6.6720
## singleEpithelialCellSize 15.3383
## bareNuclei 42.4534
## blandChromatin 24.3937
## normalNucleoli 12.2556
## mitosis 1.7556
forest.pred <- predict(fit.forest, df.validate)
table(df.validate$class, forest.pred, dnn=c("Actual", "Predicted"))
## Predicted
## Actual benign malignant
## benign 138 3
## malignant 1 63
mean(df.validate$class==forest.pred)
## [1] 0.98049
library(e1071)
## Warning: package 'e1071' was built under R version 3.3.2
set.seed(1234)
fit.svm <- svm(class~., data=df.train)
fit.svm
##
## Call:
## svm(formula = class ~ ., data = df.train)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.11111
##
## Number of Support Vectors: 78
svm.pred <- predict(fit.svm, df.validate)
svm.perf <- table(na.omit(df.validate)$class,
svm.pred, dnn=c("Actual", "Predicted"))
svm.perf
## Predicted
## Actual benign malignant
## benign 137 4
## malignant 0 64
mean(df.validate$class==svm.pred)
## [1] 0.98049
使用一个中间层三个节点的神经网络
library(nnet)
set.seed(123)
model_nnet <- nnet(class~., data=df.train,size =3, rang = 0.3,decay = 5e-4, maxit = 200)
## # weights: 34
## initial value 392.577083
## iter 10 value 100.438587
## iter 20 value 41.006502
## iter 30 value 29.004519
## iter 40 value 24.210156
## iter 50 value 22.133921
## iter 60 value 21.671345
## iter 70 value 21.488664
## iter 80 value 21.258612
## iter 90 value 20.937896
## iter 100 value 20.715429
## iter 110 value 20.016485
## iter 120 value 19.742480
## iter 130 value 19.461509
## iter 140 value 19.382120
## iter 150 value 19.370388
## iter 160 value 18.519927
## iter 170 value 18.099577
## iter 180 value 17.891451
## iter 190 value 17.607152
## iter 200 value 17.359475
## final value 17.359475
## stopped after 200 iterations
summary(model_nnet)
## a 9-3-1 network with 34 weights
## options were - entropy fitting decay=5e-04
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 i8->h1 i9->h1
## 33.01 -3.68 5.50 -1.72 -3.02 -1.48 0.81 -6.38 -2.20 -2.76
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 i8->h2 i9->h2
## 10.17 -0.99 -3.73 -1.10 2.94 6.71 -1.95 -4.59 -2.97 -3.99
## b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3 i7->h3 i8->h3 i9->h3
## 25.38 -0.90 -8.14 1.13 -3.40 2.89 -6.49 5.95 -2.24 -0.12
## b->o h1->o h2->o h3->o
## 4.61 -17.35 -3.88 -15.66
pred <- predict(model_nnet,df.validate)
prednn <- ifelse(pred<0.5,"benign", "malignant")
table(prednn)
## prednn
## benign malignant
## 141 64
table(df.validate$class)
##
## benign malignant
## 141 64
table(df.validate$class,prednn)
## prednn
## benign malignant
## benign 138 3
## malignant 3 61
mean(prednn==df.validate$class)
## [1] 0.97073