Data source: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
Reference: Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for the predictive accuracy of probability of default of credit card clients. Expert Systems with Applications, 36(2), 2473-2480.
# Load the "default of credit card clients" data from a local CSV export.
dcc <- read.csv(file = "C:/Course19/BDE/data/UCI_Credit_Card.csv")
# Show rows x columns; the recorded run reports 30000 obs. of 25 variables.
dim(dcc)
## [1] 30000 25
# Turn the integer-coded categorical columns of `dcc` into factors.
#
# Coding of the columns handled here:
#   SEX:        1=male, 2=female
#   EDUCATION:  1=graduate school, 2=university, 3=high school, 0,4,5,6=others
#   MARRIAGE:   1=married, 2=single, 3=divorce, 0=others
#   PAY_0, PAY_2, ..., PAY_6: payment status as of 9/2005, 8/2005, ..., 4/2005
#               (-2=no consumption, -1=pay on due, #=delayed months of payment)
#   default.payment.next.month: the column later used as the response (yvar)

# Fold the EDUCATION codes above 3 (4, 5, 6) into 0 so that every "others"
# code ends up in a single factor level.
dcc[["EDUCATION"]] <- ifelse(dcc[["EDUCATION"]] > 3, 0, dcc[["EDUCATION"]])

# Convert all categorical columns in one pass instead of one line per column.
factor_cols <- c(
  "SEX", "EDUCATION", "MARRIAGE",
  "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6",
  "default.payment.next.month"
)
dcc[factor_cols] <- lapply(dcc[factor_cols], as.factor)
# numeric data: LIMIT_BAL, BILL_AMT1, ..., BILL_AMT6; PAY_AMT1, ..., PAY_AMT6, AGE
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Start a local H2O cluster (or connect to a running one), limited to
# 2 CPU threads.
h2o.init(nthreads = 2)
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\link_000\AppData\Local\Temp\RtmpKqrk50/h2o_link_000_started_from_r.out
## C:\Users\link_000\AppData\Local\Temp\RtmpKqrk50/h2o_link_000_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 4 seconds 234 milliseconds
## H2O cluster timezone: America/Los_Angeles
## H2O data parsing timezone: UTC
## H2O cluster version: 3.24.0.1
## H2O cluster version age: 21 days, 22 hours and 51 minutes
## H2O cluster name: H2O_started_from_R_link_000_ueb378
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.75 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 2
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.3 (2019-03-11)
# Upload the prepared R data frame to the H2O cluster as an H2OFrame.
h2o.dcc1 <- as.h2o(x = dcc)
##
|
| | 0%
|
|=================================================================| 100%
# Build pairwise interaction factors between the three demographic columns.
# The recorded summary shows the resulting columns SEX_EDUCATION,
# SEX_MARRIAGE and EDUCATION_MARRIAGE.
#   max_factors    - cap on the number of levels kept per interaction column
#   min_occurrence - minimum count a level combination needs to be kept
# (parameter meanings per the h2o.interaction documentation)
h2o.dcc2 <- h2o.interaction(
  h2o.dcc1,
  factors = c("SEX", "EDUCATION", "MARRIAGE"),
  # Use the literal TRUE: `T` is an ordinary variable that can be reassigned.
  pairwise = TRUE,
  max_factors = 10,
  min_occurrence = 3
)
##
|
| | 0%
|
|=================================================================| 100%
# Append the interaction columns to the uploaded frame so that the original
# variables and the derived factors live in a single H2OFrame.
h2o.dcc <- h2o.cbind(h2o.dcc1, h2o.dcc2)
# Print per-column summaries, requesting exact rather than approximate quantiles.
summary(h2o.dcc, exact_quantiles = TRUE)
## ID LIMIT_BAL SEX EDUCATION MARRIAGE
## Min. : 1 Min. : 10000 2:18112 2:14030 2:15964
## 1st Qu.: 7501 1st Qu.: 50000 1:11888 1:10585 1:13659
## Median :15000 Median : 140000 3: 4917 3: 323
## Mean :15000 Mean : 167484 0: 468 0: 54
## 3rd Qu.:22500 3rd Qu.: 240000
## Max. :30000 Max. :1000000
## AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## Min. :21.00 0 :14737 0 :15730 0 :15764 0 :16455 0 :16947
## 1st Qu.:28.00 -1: 5686 -1: 6050 -1: 5938 -1: 5687 -1: 5539
## Median :34.00 1 : 3688 2 : 3927 -2: 4085 -2: 4348 -2: 4546
## Mean :35.49 -2: 2759 -2: 3782 2 : 3819 2 : 3159 2 : 2626
## 3rd Qu.:41.00 2 : 2667 3 : 326 3 : 240 3 : 180 3 : 178
## Max. :79.00 3 : 322 4 : 99 4 : 76 4 : 69 4 : 84
## PAY_6 BILL_AMT1 BILL_AMT2 BILL_AMT3
## 0 :16286 Min. :-165580 Min. :-69777 Min. :-157264
## -1: 5740 1st Qu.: 3559 1st Qu.: 2985 1st Qu.: 2666
## -2: 4895 Median : 22382 Median : 21200 Median : 20089
## 2 : 2766 Mean : 51223 Mean : 49179 Mean : 47013
## 3 : 184 3rd Qu.: 67091 3rd Qu.: 64006 3rd Qu.: 60165
## 4 : 49 Max. : 964511 Max. :983931 Max. :1664089
## BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1
## Min. :-170000 Min. :-81334 Min. :-339603 Min. : 0
## 1st Qu.: 2327 1st Qu.: 1763 1st Qu.: 1256 1st Qu.: 1000
## Median : 19052 Median : 18105 Median : 17071 Median : 2100
## Mean : 43263 Mean : 40311 Mean : 38872 Mean : 5664
## 3rd Qu.: 54506 3rd Qu.: 50191 3rd Qu.: 49198 3rd Qu.: 5006
## Max. : 891586 Max. :927171 Max. : 961664 Max. :873552
## PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5
## Min. : 0 Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 833 1st Qu.: 390 1st Qu.: 296 1st Qu.: 252.5
## Median : 2009 Median : 1800 Median : 1500 Median : 1500.0
## Mean : 5921 Mean : 5226 Mean : 4826 Mean : 4799.4
## 3rd Qu.: 5000 3rd Qu.: 4505 3rd Qu.: 4013 3rd Qu.: 4031.5
## Max. :1684259 Max. :896040 Max. :621000 Max. :426529.0
## PAY_AMT6 default.payment.next.month SEX_EDUCATION SEX_MARRIAGE
## Min. : 0.0 0:23364 2_2:8656 2_2:9411
## 1st Qu.: 117.8 1: 6636 2_1:6231 2_1:8469
## Median : 1500.0 1_2:5374 1_2:6553
## Mean : 5215.5 1_1:4354 1_1:5190
## 3rd Qu.: 4000.0 2_3:2927 2_3: 192
## Max. :528666.0 1_3:1990 1_3: 131
## EDUCATION_MARRIAGE
## 2_2:7020
## 2_1:6842
## 1_2:6809
## 1_1:3722
## 3_1:2861
## 3_2:1909
# Split the combined frame into two parts of roughly equal size.
# A fixed seed makes the split reproducible; without it every run produces a
# different train/test partition, so results cannot be repeated even though
# the model grid itself is seeded. H2O's split is probabilistic, so the two
# parts are only approximately 50/50.
dcc.split <- h2o.splitFrame(h2o.dcc, ratios = 0.5, seed = 2019)
train <- dcc.split[[1]]
test <- dcc.split[[2]]

# Response column and predictor columns used for modelling.
# ID is deliberately not among the predictors.
yvar <- "default.payment.next.month"
xvar <- c(
  "LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE",
  "SEX_EDUCATION", "SEX_MARRIAGE", "EDUCATION_MARRIAGE",
  "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6",
  "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6",
  "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"
)
Default random forest settings: ntrees=50, max_depth=20, min_rows=1, sample_rate=0.632, col_sample_rate_per_tree=1, mtries=-1; the default grid search strategy (search_criteria) is "Cartesian". The grid below varies ntrees, max_depth and mtries.
# Hyper-parameter values to search: 3 x 3 x 2 = 18 random forest models.
parameters <- list(
  ntrees = c(25, 50, 100),
  max_depth = c(20, 30, 40),
  mtries = c(10, 25)
)

# Train one random forest per hyper-parameter combination.
# Early stopping: stop adding trees once misclassification has failed to
# improve by at least 0.001 over 5 consecutive scoring rounds.
# NOTE(review): `test` is passed as the validation frame here and is also the
# frame the selected model is scored on later, so the reported performance is
# likely optimistic - consider a separate validation split or cross-validation.
rfm0 <- h2o.grid(
  algorithm = "randomForest",
  grid_id = "grid-rfm0",
  x = xvar,
  y = yvar,
  seed = 2019,
  hyper_params = parameters,
  training_frame = train,
  validation_frame = test,
  stopping_metric = "misclassification",
  stopping_rounds = 5,
  stopping_tolerance = 0.001
)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======= | 11%
|
|======= | 12%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========= | 15%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|============ | 19%
|
|============= | 20%
|
|============== | 21%
|
|================ | 25%
|
|================= | 26%
|
|================= | 27%
|
|================== | 28%
|
|=================== | 29%
|
|==================== | 30%
|
|==================== | 31%
|
|====================== | 34%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 37%
|
|======================== | 38%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 40%
|
|========================== | 41%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 43%
|
|============================ | 44%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 46%
|
|=============================== | 48%
|
|================================ | 49%
|
|================================= | 51%
|
|================================== | 52%
|
|=================================== | 54%
|
|==================================== | 55%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 65%
|
|=========================================== | 67%
|
|============================================ | 67%
|
|============================================= | 69%
|
|============================================== | 70%
|
|============================================== | 71%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 73%
|
|================================================ | 74%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 85%
|
|======================================================== | 86%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 91%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 93%
|
|============================================================= | 94%
|
|=================================================================| 100%
# Print the grid summary: one row per trained model with its hyper-parameter
# values; the recorded output is ordered by increasing logloss.
rfm0
## H2O Grid Details
## ================
##
## Grid ID: grid-rfm0
## Used hyper parameters:
## - max_depth
## - mtries
## - ntrees
## Number of models: 18
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by increasing logloss
## max_depth mtries ntrees model_ids logloss
## 1 20 10 100 grid-rfm0_model_13 0.4411423060060494
## 2 20 10 50 grid-rfm0_model_7 0.4451022493273394
## 3 20 10 25 grid-rfm0_model_1 0.45297528008542437
## 4 30 25 100 grid-rfm0_model_17 0.45497000587010356
## 5 40 10 100 grid-rfm0_model_15 0.46025910979557516
## 6 30 25 50 grid-rfm0_model_11 0.4854970671571419
## 7 20 25 50 grid-rfm0_model_10 0.4894655622125987
## 8 20 25 25 grid-rfm0_model_4 0.4894655622125987
## 9 20 25 100 grid-rfm0_model_16 0.4894655622125987
## 10 40 10 50 grid-rfm0_model_9 0.5075271599094806
## 11 40 25 50 grid-rfm0_model_12 0.5190431963519798
## 12 30 25 25 grid-rfm0_model_5 0.5764874872118687
## 13 40 10 25 grid-rfm0_model_3 0.6095771373853845
## 14 40 25 25 grid-rfm0_model_6 0.6654217548761822
## 15 30 10 100 grid-rfm0_model_14 0.7639572959545795
## 16 30 10 50 grid-rfm0_model_8 0.7639572959545795
## 17 30 10 25 grid-rfm0_model_2 0.7639572959545795
## 18 40 25 100 grid-rfm0_model_18 0.8140266825611048
# The grid lists its models by increasing logloss, so the first model id
# belongs to the lowest-logloss model.
rfm0_best <- h2o.getModel(model_id = rfm0@model_ids[[1]])
# Score the selected model on the `test` frame and print its binomial metrics.
h2o.performance(model = rfm0_best, newdata = test)
## H2OBinomialMetrics: drf
##
## MSE: 0.1390589
## RMSE: 0.3729061
## LogLoss: 0.4411423
## Mean Per-Class Error: 0.304081
## AUC: 0.7680122
## pr_auc: 0.532665
## Gini: 0.5360244
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 10071 1585 0.135981 =1585/11656
## 1 1570 1755 0.472180 =1570/3325
## Totals 11641 3340 0.210600 =3155/14981
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.341319 0.526632 186
## 2 max f2 0.122528 0.631588 312
## 3 max f0point5 0.475159 0.561334 133
## 4 max accuracy 0.586919 0.815767 98
## 5 max precision 0.988142 1.000000 0
## 6 max recall 0.011557 1.000000 396
## 7 max specificity 0.988142 1.000000 0
## 8 max absolute_mcc 0.389739 0.396222 166
## 9 max min_per_class_accuracy 0.201229 0.693548 254
## 10 max mean_per_class_accuracy 0.222066 0.698709 243
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Variable importance of the selected model: first as a table ...
h2o.varimp(rfm0_best)
# ... then as a bar chart.
h2o.varimp_plot(model = rfm0_best)
Make sure to shut down the H2O cluster when you are finished.
# Stop the local H2O cluster without asking for interactive confirmation.
# Use the literal FALSE: `F` is an ordinary variable that can be reassigned.
h2o.shutdown(prompt = FALSE)
## [1] TRUE