Widget HTML Atas

Rpart Package In R Download

Decision Tree (CART, via rpart) Classifier Tutorial

  1. Install the package
          # Install the packages used by this tutorial from CRAN. The `##` lines are
          # the console output recorded when the tutorial was originally run.
          install.packages("rpart", repos = "http://cran.rstudio.com/")
          ##
          ## The downloaded binary packages are in
          ##  /var/folders/3z/jqczpc_95yq_sbgl2665kg2c0000gq/T//Rtmpaaw3sP/downloaded_packages
          install.packages("rpart.plot", repos = "http://cran.rstudio.com/")
          ##
          ## The downloaded binary packages are in
          ##  /var/folders/3z/jqczpc_95yq_sbgl2665kg2c0000gq/T//Rtmpaaw3sP/downloaded_packages
          install.packages("caret", repos = "http://cran.rstudio.com/")
          ##
          ## The downloaded binary packages are in
          ##  /var/folders/3z/jqczpc_95yq_sbgl2665kg2c0000gq/T//Rtmpaaw3sP/downloaded_packages

          # Load the packages, one call per line so that each statement parses and
          # the inline comments do not swallow the calls that follow them.
          library(lattice)
          library(ggplot2)
          library(caret) # packages for preprocessing
          library(rpart)
          library(rpart.plot) # packages for decision tree
          # NOTE: mlbench is not installed by the calls above; run
          # install.packages("mlbench") first if it is missing.
          library(mlbench) # package with sample data

2. Read data

          # Load the PimaIndiansDiabetes2 data set from mlbench and keep a working
          # copy in `data`. Column 9 (diabetes) is the class label; several
          # predictors contain NA values (see the summary below).
          data("PimaIndiansDiabetes2", package = "mlbench")
          data <- PimaIndiansDiabetes2
          head(data)
          ##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
          ## 1        6     148       72      35      NA 33.6    0.627  50      pos
          ## 2        1      85       66      29      NA 26.6    0.351  31      neg
          ## 3        8     183       64      NA      NA 23.3    0.672  32      pos
          ## 4        1      89       66      23      94 28.1    0.167  21      neg
          ## 5        0     137       40      35     168 43.1    2.288  33      pos
          ## 6        5     116       74      NA      NA 25.6    0.201  30      neg
          summary(data)
          ##     pregnant         glucose         pressure         triceps
          ##  Min.   : 0.000   Min.   : 44.0   Min.   : 24.00   Min.   : 7.00
          ##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.00   1st Qu.:22.00
          ##  Median : 3.000   Median :117.0   Median : 72.00   Median :29.00
          ##  Mean   : 3.845   Mean   :121.7   Mean   : 72.41   Mean   :29.15
          ##  3rd Qu.: 6.000   3rd Qu.:141.0   3rd Qu.: 80.00   3rd Qu.:36.00
          ##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00
          ##                   NA's   :5       NA's   :35       NA's   :227
          ##     insulin            mass          pedigree           age
          ##  Min.   : 14.00   Min.   :18.20   Min.   :0.0780   Min.   :21.00
          ##  1st Qu.: 76.25   1st Qu.:27.50   1st Qu.:0.2437   1st Qu.:24.00
          ##  Median :125.00   Median :32.30   Median :0.3725   Median :29.00
          ##  Mean   :155.55   Mean   :32.46   Mean   :0.4719   Mean   :33.24
          ##  3rd Qu.:190.00   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00
          ##  Max.   :846.00   Max.   :67.10   Max.   :2.4200   Max.   :81.00
          ##  NA's   :374      NA's   :11
          ##  diabetes
          ##  neg:500
          ##  pos:268

3. Preprocess Data

          # Preprocess the eight predictors (every column except column 9, the class
          # label) in three caret::preProcess steps, then re-attach the label as
          # `class`.

          # Step 1: standardize (center and scale) each predictor.
          preProValues <- preProcess(data[, -9], method = c("center", "scale"))
          scaleddata <- predict(preProValues, data[, -9]) # Normalization

          # Step 2: Yeo-Johnson transformation (toward a normal distribution).
          preProcbox <- preProcess(scaleddata, method = c("YeoJohnson"))
          boxdata <- predict(preProcbox, scaleddata)

          # Step 3: fill in the missing values with bagged-tree imputation.
          preProcimp <- preProcess(boxdata, method = "bagImpute")
          procdata <- predict(preProcimp, boxdata) # Missing values

          procdata$class <- data[, 9]
          head(procdata)
          ##     pregnant    glucose    pressure     triceps     insulin       mass
          ## 1  0.5284016  0.7595155 -0.03275471  0.52823166  0.12783441  0.1613449
          ## 2 -1.0902050 -1.4250227 -0.52420088 -0.01466832 -1.20274425 -0.9327976
          ## 3  0.8956985  1.5845945 -0.69028150 -0.78108015  0.03740067 -1.5205042
          ## 4 -1.0902050 -1.2509263 -0.52420088 -0.62258605 -0.67266546 -0.6791257
          ## 5 -1.5823833  0.4627602 -2.74133178  0.52823166  0.09906180  1.3218244
          ## 6  0.3065886 -0.1924995  0.12832774 -0.83700141 -0.56409029 -1.1067781
          ##     pedigree         age class
          ## 1  0.3807017  0.90154325   pos
          ## 2 -0.4347451 -0.20794297   neg
          ## 3  0.4674736 -0.11085497   pos
          ## 4 -1.3678689 -1.55542743   neg
          ## 5  1.7918753 -0.02068449   pos
          ## 6 -1.1705718 -0.31192824   neg
          summary(procdata)
          ##     pregnant          glucose           pressure           triceps
          ##  Min.   :-1.5824   Min.   :-3.4150   Min.   :-4.14946   Min.   :-2.4794
          ##  1st Qu.:-1.0902   1st Qu.:-0.8339   1st Qu.:-0.69028   1st Qu.:-0.7462
          ##  Median :-0.2734   Median :-0.1578   Median :-0.03275   Median :-0.1115
          ##  Mean   :-0.2452   Mean   :-0.1315   Mean   :-0.02782   Mean   :-0.1174
          ##  3rd Qu.: 0.5284   3rd Qu.: 0.5530   3rd Qu.: 0.60405   3rd Qu.: 0.5282
          ##  Max.   : 1.9793   Max.   : 1.9208   Max.   : 3.77277   Max.   : 4.9183
          ##     insulin             mass             pedigree            age
          ##  Min.   :-2.0260   Min.   :-2.49746   Min.   :-1.9381   Min.   :-1.5554
          ##  1st Qu.:-0.8153   1st Qu.:-0.77951   1st Qu.:-0.9385   1st Qu.:-1.0797
          ##  Median :-0.2642   Median :-0.02281   Median :-0.3472   Median :-0.4228
          ##  Mean   :-0.3049   Mean   :-0.09971   Mean   :-0.2983   Mean   :-0.3063
          ##  3rd Qu.: 0.1426   3rd Qu.: 0.55689   3rd Qu.: 0.3792   3rd Qu.: 0.5118
          ##  Max.   : 1.7013   Max.   : 3.63423   Max.   : 1.8466   Max.   : 1.6746
          ##  class
          ##  neg:500
          ##  pos:268

          # Box plots of the standardized predictors, split by the class label.
          featurePlot(scaleddata, data[, 9], plot = "box")

4. Decision Tree

          # Grow, inspect, prune and evaluate a classification tree on `procdata`.
          # The `##` lines are the outputs recorded when the tutorial was run.

          # cp = 0: tree growth without limitation.
          rpartModel <- rpart(
            class ~ .,
            data = procdata,
            control = rpart.control(cp = 0)
          )
          rpart.plot(rpartModel) # Print tree plot

          plotcp(rpartModel) # Print CP value vs. tree levels

          rpartModel$cptable
          ##             CP nsplit rel error    xerror       xstd
          ## 1  0.250000000      0 1.0000000 1.0000000 0.04928752
          ## 2  0.100746269      1 0.7500000 0.8134328 0.04662235
          ## 3  0.017723881      2 0.6492537 0.6902985 0.04421857
          ## 4  0.016169154      6 0.5783582 0.6791045 0.04397128
          ## 5  0.011194030      9 0.5298507 0.6828358 0.04405428
          ## 6  0.009328358     11 0.5074627 0.7126866 0.04469811
          ## 7  0.007462687     17 0.4440299 0.7201493 0.04485359
          ## 8  0.005597015     19 0.4291045 0.7126866 0.04469811
          ## 9  0.003731343     22 0.4104478 0.7238806 0.04493052
          ## 10 0.002487562     23 0.4067164 0.7014925 0.04446083
          ## 11 0.001865672     26 0.3992537 0.6902985 0.04421857
          ## 12 0.000000000     28 0.3955224 0.6940299 0.04429988

          # Find the best CP value for the tree: the row with the smallest
          # xerror + xstd.
          cptable <- as.data.frame(rpartModel$cptable)
          cptable$errsd <- cptable$xerror + cptable$xstd
          cpvalue <- cptable[which.min(cptable$errsd), "CP"]

          # NOTE(review): `cpvalue` is computed above but never used; the prune call
          # hard-codes 0.007462687 (row 7 of the table printed above), whereas
          # which.min() on that printed table selects row 4 (CP 0.016169154). The
          # hard-coded value is kept because the recorded outputs below were
          # produced with it. Use `cp = cpvalue` to prune at the computed value
          # instead; xerror comes from cross-validation, so it can differ per run.
          pruneModel <- prune(rpartModel, cp = 0.007462687) # Prune the tree
          rpart.plot(pruneModel) # Print the tree after pruning

          # Predictions are made on `procdata`, the same data the tree was fitted
          # to, so the accuracy below is training accuracy, not held-out accuracy.
          pre <- predict(pruneModel, procdata, type = "class")
          pretable <- table(pre, procdata$class)
          pretable # Show the confusion matrix
          ##
          ## pre   neg pos
          ##   neg 432  51
          ##   pos  68 217

          # Calculate the accuracy: correctly classified rows (the diagonal of the
          # confusion matrix) divided by all rows.
          accurary <- sum(diag(pretable)) / sum(pretable)
          accurary
          ## [1] 0.8450521

          varImp(pruneModel) # Show the importance of each feature
          ##            Overall
          ## age       94.58974
          ## glucose  106.90657
          ## insulin  100.42287
          ## mass      93.06814
          ## pedigree  34.17301
          ## pregnant  18.21234
          ## pressure  14.25917
          ## triceps   58.80070

Source: https://rstudio-pubs-static.s3.amazonaws.com/94101_c23179ee360c43e0a63a791e410e1f3a.html

Posted by: brown127.blogspot.com