R3: Graphics and Visualization



Classification with Random Forests

Random forests are an extension of classification and regression trees. They show great promise for accurate high-dimensional predictions, at the cost of some ease of interpretation. The R package randomForest must be installed.

1. South African Heart Disease Data – CART

# Single classification tree (CART) for the South African heart disease data.
library(rpart)

# Read the data file from the current working directory.
heart <- read.table("SAheart.txt")

# Classification tree for chd, using every other column as a predictor.
heart.cart <- rpart(chd ~ ., data = heart, method = "class")

# Draw the tree, then add the text labels to it.
plot(heart.cart)
text(heart.cart, use.n = FALSE, pretty = 0)

[pic]

# Predicted class for each row of the data the tree was fitted to.
# Column 10 is dropped from the new data - presumably the response chd
# (TODO confirm against the column order of SAheart.txt).
heart.cart.pred <- predict(heart.cart, heart[, -10], type = "class")

# Cross-tabulate actual chd (rows) against the predicted class (columns).
table(Actual = heart$chd, Classified = heart.cart.pred)

Classified

Actual 0 1

0 275 27

1 71 89

# Number of rows in heart; used as the denominator of the error rate below.
nrow(heart)

[1] 462

# Overall Error Rate: the two off-diagonal counts of the table above
# divided by the number of rows.
(27 + 71) / 462

[1] 0.2121212

2. Bootstrap Samples

# Draw one bootstrap sample of row indices: as many draws as heart has rows,
# taken with replacement from 1..nrow(heart). The size comes from the data
# instead of a hard-coded 462, and TRUE replaces the reassignable T.
boot.samp <- sample(nrow(heart), nrow(heart), replace = TRUE)

# Show the 50 smallest sampled indices. Repeated values are rows drawn more
# than once; absent values are rows left out of this bootstrap sample.
# head() never pads with NA if there are fewer than 50 values.
head(sort(boot.samp), 50)

[1] 1 1 2 2 3 4 4 4 4 5 8 9 9 10 12 12 14 15 17 18 18 18 18 18 19

[26] 19 19 20 20 20 21 21 22 23 23 26 28 29 29 31 32 33 38 40 40 40 40 41 41 42

# Build the bootstrap data set by pulling the sampled rows out of heart
# (rows may appear more than once).
heart.boot <- heart[boot.samp, ]

# Fit the same classification tree to the bootstrap data.
heart.b.cart <- rpart(chd ~ ., data = heart.boot, method = "class")

# Draw the tree and label it with slightly smaller text.
plot(heart.b.cart)
text(heart.b.cart, use.n = FALSE, pretty = 0, cex = 0.8)

# Repeat (from drawing boot.samp onwards) to see how the tree changes
# from one bootstrap sample to the next.

[pic][pic]

[pic][pic]

3. Random Forests

library(randomForest)

# Response MUST be a factor for classification random forests, so recode
# chd with the labels "absent" and "present" before fitting.
heart$chd <- factor(heart$chd, labels = c("absent", "present"))

# Fit the forest on all predictors, requesting the importance measures and
# the proximity matrix.
# `T` is kept exactly as written so the call still matches the `Call:`
# echoed in the printed output that follows.
heart.rf <- randomForest(
  chd ~ .,
  data = heart,
  importance = T,
  proximity = T
)

# Print the fitted forest (call, error estimate, confusion matrix).
heart.rf

Call:

randomForest(formula = chd ~ ., data = heart, importance = T, proximity = T)

Type of random forest: classification

Number of trees: 500

No. of variables tried at each split: 3

OOB estimate of error rate: 30.52%

Confusion matrix:

absent present class.error

absent 253 49 0.1622517

present 92 68 0.5750000

# Plot cumulative error rates: black = overall,
# red = class 0 (absent), green = class 1 (present).
plot(heart.rf)

[pic]

3.1 Variable Importance

# Importance table: one row per predictor, with a column for each class
# (absent, present) plus MeanDecreaseAccuracy and MeanDecreaseGini.
importance(heart.rf)

absent present MeanDecreaseAccuracy MeanDecreaseGini

sbp 0.4031204 0.07389546 0.2955140 21.804440

tobacco 0.8521080 1.17672417 0.8929525 31.767707

ldl -0.2827476 1.32792125 0.4071038 28.266116

adiposity 0.7789223 -0.32097287 0.4299989 24.276759

famhist 0.0939252 1.47785213 0.6525743 9.551946

typea 0.4624707 0.16915688 0.3456935 22.790787

obesity 0.1233059 -0.56854442 -0.1568843 20.417390

alcohol -0.3414010 0.03055015 -0.2078866 17.141331

age 0.9833459 1.14711009 0.9907443 32.805161

# Plot the variable importance measures of the forest.
varImpPlot(heart.rf)

# Scatterplot matrix of columns 2-5 and 9 of heart; both the plotting
# symbol and the colour are set by the integer code of the chd class.
pairs(
  heart[, c(2:5, 9)],
  pch = unclass(heart$chd),
  col = unclass(heart$chd)
)

[pic][pic]

3.2 MDS using Random Forest Proximities

# Multidimensional scaling plot from the forest's proximities; points are
# grouped by chd, with the plotting symbol set by the chd class code.
MDSplot(heart.rf, heart$chd, pch=unclass(heart$chd))

[pic]

3.3 Balance the Sampling Rates – Balance the Error Rates?

# Refit the forest with sampsize = c(100, 100) - presumably 100 cases drawn
# from each chd class per tree, to balance the two classes (see the
# randomForest documentation for `sampsize`).
# `T` is kept exactly as written so the call still matches the `Call:`
# echoed in the printed output that follows.
heart.rf.bal <- randomForest(
  chd ~ .,
  data = heart,
  sampsize = c(100, 100),
  importance = T,
  proximity = T
)

# Print the fitted forest to compare the per-class error rates.
heart.rf.bal

Call:

randomForest(formula = chd ~ ., data = heart, sampsize = c(100, 100), importance = T, proximity = T)

Type of random forest: classification

Number of trees: 500

No. of variables tried at each split: 3

OOB estimate of error rate: 33.55%

Confusion matrix:

absent present class.error

absent 204 98 0.3245033

present 57 103 0.3562500

# Error-rate plot for the forest fitted with balanced sample sizes.
plot(heart.rf.bal)

# Importance table for the balanced forest (same layout as for heart.rf).
importance(heart.rf.bal)

absent present MeanDecreaseAccuracy MeanDecreaseGini

sbp 0.20755907 0.16430217 0.1955865 9.879473

tobacco 0.60047269 1.18497452 0.6950574 15.013303

ldl -0.33206458 1.28848239 0.2348731 13.236184

adiposity 0.24590664 0.36650685 0.3076801 11.772124

famhist 0.03857619 1.25033284 0.4987571 4.153150

typea 0.23369742 0.01995233 0.1666102 10.205099

obesity -0.19863342 -0.36435995 -0.2746245 9.850312

alcohol -0.04821884 -0.03108856 -0.0413231 8.208580

age 0.70939584 1.40828691 0.8500487 17.679775

# Plot the variable importance measures of the balanced forest.
varImpPlot(heart.rf.bal)

[pic][pic]

4. Crabs Data

library(MASS)

# Load the crabs data set shipped with MASS.
data(crabs)

# Keep the species label and the five body measurements. These are the same
# columns as positions 1 and 4:8 of the original data, but selecting them by
# name means re-running this line on the already-reduced data frame still
# works (positional selection would fail with "undefined columns selected").
crabs <- crabs[, c("sp", "FL", "RW", "CL", "CW", "BD")]

# Classification forest for species (sp) using all remaining columns of
# crabs, requesting the importance measures and the proximity matrix.
# `T` is kept exactly as written so the call still matches the `Call:`
# echoed in the printed output that follows.
crabs.rf <- randomForest(
  sp ~ .,
  data = crabs,
  importance = T,
  proximity = T
)

# Print the fitted forest (call, error estimate, confusion matrix).
crabs.rf

Call:

randomForest(formula = sp ~ ., data = crabs, importance = T, proximity = T)

Type of random forest: classification

Number of trees: 500

No. of variables tried at each split: 2

OOB estimate of error rate: 10.5%

Confusion matrix:

B O class.error

B 89 11 0.11

O 10 90 0.10

# Error-rate plot for the crabs forest.
plot(crabs.rf)

# Importance table: one row per measurement, with a column for each species
# (B, O) plus MeanDecreaseAccuracy and MeanDecreaseGini.
importance(crabs.rf)

B O MeanDecreaseAccuracy MeanDecreaseGini

FL 3.129921 3.401738 2.439871 29.30713

RW 0.507408 1.866268 1.361985 9.14394

CL 2.051226 2.072838 1.969847 16.28632

CW 3.057862 3.088256 2.321066 21.74244

BD 3.292988 2.691395 2.356361 23.05314

# Plot the variable importance measures of the crabs forest.
varImpPlot(crabs.rf)

[pic][pic]

5. Forensic Glass Data

# Load the forensic glass data (fgl); presumably from MASS, which is attached
# earlier in this handout - TODO confirm.
data(fgl)

# Show the first six rows: nine numeric measurements and the glass type.
head(fgl)

RI Na Mg Al Si K Ca Ba Fe type

1 3.01 13.64 4.49 1.10 71.78 0.06 8.75 0 0.00 WinF

2 -0.39 13.89 3.60 1.36 72.73 0.48 7.83 0 0.00 WinF

3 -1.82 13.53 3.55 1.54 72.99 0.39 7.78 0 0.00 WinF

4 -0.34 13.21 3.69 1.29 72.61 0.57 8.22 0 0.00 WinF

5 -0.58 13.27 3.62 1.24 73.08 0.55 8.07 0 0.00 WinF

6 -2.04 12.79 3.61 1.62 72.97 0.64 8.07 0 0.26 WinF

# Classification forest for glass type using all other columns of fgl,
# requesting the importance measures and the proximity matrix.
# `T` is kept exactly as written so the call still matches the `Call:`
# echoed in the printed output that follows.
fgl.rf <- randomForest(
  type ~ .,
  data = fgl,
  importance = T,
  proximity = T
)

# Print the fitted forest (call, error estimate, confusion matrix).
fgl.rf

Call:

randomForest(formula = type ~ ., data = fgl, importance = T, proximity = T)

Type of random forest: classification

Number of trees: 500

No. of variables tried at each split: 3

OOB estimate of error rate: 21.03%

Confusion matrix:

WinF WinNF Veh Con Tabl Head class.error

WinF 61 7 2 0 0 0 0.1285714

WinNF 11 58 1 3 1 2 0.2368421

Veh 6 4 7 0 0 0 0.5882353

Con 0 2 0 10 0 1 0.2307692

Tabl 0 2 0 0 7 0 0.2222222

Head 1 2 0 0 0 26 0.1034483

# Error-rate plot for the glass forest.
plot(fgl.rf)

# Importance table: one row per measurement, with a column for each of the
# six glass types plus MeanDecreaseAccuracy and MeanDecreaseGini.
importance(fgl.rf)

WinF WinNF Veh Con Tabl Head

RI 3.6233041 2.9380901 3.2965225 2.9819727 2.95693090 1.500972

Na 1.9236424 1.7896380 1.2226327 4.7595074 7.13756492 3.491016

Mg 3.5014573 3.2330539 4.8330343 8.2824143 7.83981237 4.269236

Al 3.4661497 2.9960250 3.0014803 6.6885812 -0.02554500 3.820693

Si 2.7260564 1.5905822 1.3066588 1.7698685 2.21048334 1.135240

K 2.5614385 2.2209139 1.5738212 4.4253447 8.49489844 2.024626

Ca 2.5950833 3.0974151 2.7923134 6.2926096 1.41524366 2.274620

Ba 1.6202291 2.1016312 1.8920367 2.6347329 4.92029571 5.607658

Fe 0.6985066 0.2933668 0.8169259 -0.3860016 2.96900244 1.107556

MeanDecreaseAccuracy MeanDecreaseGini

RI 2.119120 23.179466

Na 1.809645 15.850663

Mg 2.170110 25.302383

Al 2.131897 25.226390

Si 1.710365 13.550757

K 1.804234 14.212838

Ca 2.020784 20.340626

Ba 1.909135 14.049170

Fe 0.572051 6.616441

# Plot the variable importance measures of the glass forest.
varImpPlot(fgl.rf)

[pic][pic]

# Random Forest Classifiers

# Heart Attack Data

# Classification Tree (from R16)

library(rpart)

heart ................
................

In order to avoid copyright disputes, this page is only a partial summary.

Google Online Preview   Download