Rpart Package In R Download
Decision Tree (CART, via rpart) Classifier Tutorial
1.Install the packages
# Install every package this tutorial loads, in a single call.
# - mlbench is included because it is loaded below with library(mlbench) and
#   supplies the example data set.
# - The repository is addressed over HTTPS so the downloaded packages cannot
#   be altered in transit.
install.packages(
  c("rpart", "rpart.plot", "caret", "mlbench"),
  repos = "https://cran.rstudio.com/"
)
##
## The downloaded binary packages are in
##  /var/folders/3z/jqczpc_95yq_sbgl2665kg2c0000gq/T//Rtmpaaw3sP/downloaded_packages
# Packages for preprocessing
library(lattice)
library(ggplot2)
library(caret)
# Packages for the decision tree
library(rpart)
library(rpart.plot)
# Package with the sample data
library(mlbench)
2.Read data
# Load the PimaIndiansDiabetes2 data set from the mlbench package
data("PimaIndiansDiabetes2", package = "mlbench")
# Refer to it by a shorter name and preview the first rows
data <- PimaIndiansDiabetes2
head(data)
##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35      NA 33.6    0.627  50      pos
## 2        1      85       66      29      NA 26.6    0.351  31      neg
## 3        8     183       64      NA      NA 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74      NA      NA 25.6    0.201  30      neg
# Print per-column summary statistics of the raw data; the output that follows
# also reports how many NA values each column contains.
summary(data)
##     pregnant         glucose        pressure         triceps
##  Min.   : 0.000   Min.   : 44.0   Min.   : 24.00   Min.   : 7.00
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.00   1st Qu.:22.00
##  Median : 3.000   Median :117.0   Median : 72.00   Median :29.00
##  Mean   : 3.845   Mean   :121.7   Mean   : 72.41   Mean   :29.15
##  3rd Qu.: 6.000   3rd Qu.:141.0   3rd Qu.: 80.00   3rd Qu.:36.00
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00
##                   NA's   :5       NA's   :35       NA's   :227
##     insulin            mass          pedigree           age
##  Min.   : 14.00   Min.   :18.20   Min.   :0.0780   Min.   :21.00
##  1st Qu.: 76.25   1st Qu.:27.50   1st Qu.:0.2437   1st Qu.:24.00
##  Median :125.00   Median :32.30   Median :0.3725   Median :29.00
##  Mean   :155.55   Mean   :32.46   Mean   :0.4719   Mean   :33.24
##  3rd Qu.:190.00   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00
##  Max.   :846.00   Max.   :67.10   Max.   :2.4200   Max.   :81.00
##  NA's   :374      NA's   :11
##  diabetes
##  neg:500
##  pos:268
##
##
##
##
##
3.Preprocess Data
# Center and scale every predictor (column 9, the diabetes label, is left out)
preProValues <- preProcess(data[, -9], method = c("center", "scale"))
scaleddata <- predict(preProValues, data[, -9])

# Yeo-Johnson transformation (towards a normal distribution)
preProcbox <- preProcess(scaleddata, method = c("YeoJohnson"))
boxdata <- predict(preProcbox, scaleddata)

# Fill in the missing values (caret method "bagImpute")
preProcimp <- preProcess(boxdata, method = "bagImpute")
procdata <- predict(preProcimp, boxdata)

# Put the class label back next to the processed predictors and preview
procdata$class <- data[, 9]
head(procdata)
##     pregnant    glucose    pressure     triceps     insulin       mass
## 1  0.5284016  0.7595155 -0.03275471  0.52823166  0.12783441  0.1613449
## 2 -1.0902050 -1.4250227 -0.52420088 -0.01466832 -1.20274425 -0.9327976
## 3  0.8956985  1.5845945 -0.69028150 -0.78108015  0.03740067 -1.5205042
## 4 -1.0902050 -1.2509263 -0.52420088 -0.62258605 -0.67266546 -0.6791257
## 5 -1.5823833  0.4627602 -2.74133178  0.52823166  0.09906180  1.3218244
## 6  0.3065886 -0.1924995  0.12832774 -0.83700141 -0.56409029 -1.1067781
##     pedigree         age class
## 1  0.3807017  0.90154325   pos
## 2 -0.4347451 -0.20794297   neg
## 3  0.4674736 -0.11085497   pos
## 4 -1.3678689 -1.55542743   neg
## 5  1.7918753 -0.02068449   pos
## 6 -1.1705718 -0.31192824   neg
# Print per-column summary statistics of the preprocessed data.
summary(procdata)
##     pregnant          glucose           pressure           triceps
##  Min.   :-1.5824   Min.   :-3.4150   Min.   :-4.14946   Min.   :-2.4794
##  1st Qu.:-1.0902   1st Qu.:-0.8339   1st Qu.:-0.69028   1st Qu.:-0.7462
##  Median :-0.2734   Median :-0.1578   Median :-0.03275   Median :-0.1115
##  Mean   :-0.2452   Mean   :-0.1315   Mean   :-0.02782   Mean   :-0.1174
##  3rd Qu.: 0.5284   3rd Qu.: 0.5530   3rd Qu.: 0.60405   3rd Qu.: 0.5282
##  Max.   : 1.9793   Max.   : 1.9208   Max.   : 3.77277   Max.   : 4.9183
##     insulin            mass             pedigree            age
##  Min.   :-2.0260   Min.   :-2.49746   Min.   :-1.9381   Min.   :-1.5554
##  1st Qu.:-0.8153   1st Qu.:-0.77951   1st Qu.:-0.9385   1st Qu.:-1.0797
##  Median :-0.2642   Median :-0.02281   Median :-0.3472   Median :-0.4228
##  Mean   :-0.3049   Mean   :-0.09971   Mean   :-0.2983   Mean   :-0.3063
##  3rd Qu.: 0.1426   3rd Qu.: 0.55689   3rd Qu.: 0.3792   3rd Qu.: 0.5118
##  Max.   : 1.7013   Max.   : 3.63423   Max.   : 1.8466   Max.   : 1.6746
##  class
##  neg:500
##  pos:268
##
##
##
##
# Box plots of the scaled predictors against the diabetes label (column 9)
featurePlot(x = scaleddata, y = data[, 9], plot = "box")
4.Decision Tree
# Grow the tree with the complexity parameter set to 0, i.e. without a cp
# limit on tree growth
rpartModel <- rpart(
  class ~ .,
  data = procdata,
  control = rpart.control(cp = 0)
)
# Plot the full tree
rpart.plot(rpartModel)
# Plot the CP values against tree size
plotcp(rpartModel)
# Print the CP table
rpartModel$cptable
##             CP nsplit rel error    xerror       xstd
## 1  0.250000000      0 1.0000000 1.0000000 0.04928752
## 2  0.100746269      1 0.7500000 0.8134328 0.04662235
## 3  0.017723881      2 0.6492537 0.6902985 0.04421857
## 4  0.016169154      6 0.5783582 0.6791045 0.04397128
## 5  0.011194030      9 0.5298507 0.6828358 0.04405428
## 6  0.009328358     11 0.5074627 0.7126866 0.04469811
## 7  0.007462687     17 0.4440299 0.7201493 0.04485359
## 8  0.005597015     19 0.4291045 0.7126866 0.04469811
## 9  0.003731343     22 0.4104478 0.7238806 0.04493052
## 10 0.002487562     23 0.4067164 0.7014925 0.04446083
## 11 0.001865672     26 0.3992537 0.6902985 0.04421857
## 12 0.000000000     28 0.3955224 0.6940299 0.04429988
# Find the best CP value for the tree: score each row of the CP table by its
# cross-validated error plus one standard error and take the CP of the
# lowest-scoring row.
cptable <- as.data.frame(rpartModel$cptable)
cptable$errsd <- cptable$xerror + cptable$xstd
cpvalue <- cptable[which.min(cptable$errsd), "CP"]

# Prune the tree.
# NOTE(review): cpvalue is computed above but not used here; the tree is pruned
# with the literal 0.007462687 (row 7 of the CP table printed above). The
# literal is kept so the printed results below still match the code; consider
# prune(rpartModel, cp = cpvalue) and re-running the outputs.
pruneModel <- prune(rpartModel, cp = 0.007462687)

# Plot the tree after pruning
rpart.plot(pruneModel)

# Predict the class of every row of procdata. This is the same data the tree
# was fitted on, so the figures below describe training performance.
pre <- predict(pruneModel, procdata, type = "class")

# Show the confusion matrix (rows: predicted class, columns: actual class)
pretable <- table(pre, procdata$class)
pretable
##
## pre   neg pos
##   neg 432  51
##   pos  68 217
# The table printed above is the confusion matrix.
# Calculate the accuracy: correctly classified rows (the diagonal of the
# confusion matrix) divided by all rows.
accuracy <- sum(diag(pretable)) / sum(pretable)
accuracy
## [1] 0.8450521
# (The value printed above is the accuracy.)
# Show the importance of each feature in the pruned tree
varImp(pruneModel)
##            Overall
## age       94.58974
## glucose  106.90657
## insulin  100.42287
## mass      93.06814
## pedigree  34.17301
## pregnant  18.21234
## pressure  14.25917
## triceps   58.80070
#Show the importance of each feature
Source: https://rstudio-pubs-static.s3.amazonaws.com/94101_c23179ee360c43e0a63a791e410e1f3a.html
Posted by: brown127.blogspot.com