Data source: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
Reference: Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for the predictive accuracy of probability of default of credit card clients. Expert Systems with Applications, 36(2), 2473-2480.
# Read the credit-card default data set from its local CSV copy.
dcc <- read.csv(file = "C:/Course19/BDE/data/UCI_Credit_Card.csv")
dim(dcc) # 30000 obs. of 25 variables
## [1] 30000 25
# Recode the categorical predictors and the response as factors.
#
# SEX: 1=male, 2=female
# EDUCATION: 1=graduate school, 2=university, 3=high school, 0,4,5,6=others
#   -> every code above 3 is folded into 0 before the factor conversion
# MARRIAGE: 1=married, 2=single, 3=divorce, 0=others
# payment status (PAY_*): -2=no consumption, -1=pay on due,
#   #=delayed months of payment
#   PAY_0 = status as of 9/2005, PAY_2 = 8/2005, PAY_3 = 7/2005,
#   PAY_4 = 6/2005, PAY_5 = 5/2005, PAY_6 = 4/2005
dcc$EDUCATION <- replace(dcc$EDUCATION, dcc$EDUCATION > 3, 0)

dcc_factor_cols <- c(
  "SEX", "EDUCATION", "MARRIAGE",
  "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6",
  "default.payment.next.month"
)
dcc[dcc_factor_cols] <- lapply(dcc[dcc_factor_cols], as.factor)
# numeric data: LIMIT_BAL, BILL_AMT1, ..., BILL_AMT6; PAY_AMT1, ..., PAY_AMT6, AGE
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Start (or connect to) the local H2O cluster, restricted to 2 threads.
h2o.init(nthreads = 2)
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\link_000\AppData\Local\Temp\RtmpoTVv4Q/h2o_link_000_started_from_r.out
## C:\Users\link_000\AppData\Local\Temp\RtmpoTVv4Q/h2o_link_000_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 4 seconds 3 milliseconds
## H2O cluster timezone: America/Los_Angeles
## H2O data parsing timezone: UTC
## H2O cluster version: 3.24.0.1
## H2O cluster version age: 22 days
## H2O cluster name: H2O_started_from_R_link_000_uch551
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.75 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 2
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.3 (2019-03-11)
# Upload the prepared R data frame to the H2O cluster as an H2OFrame.
h2o.dcc1 <- as.h2o(x = dcc)
##
|
| | 0%
|
|=================================================================| 100%
# Build pairwise interaction columns among the three demographic factors
# (SEX_EDUCATION, SEX_MARRIAGE, EDUCATION_MARRIAGE).
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
h2o.dcc2 <- h2o.interaction(
  data = h2o.dcc1,
  factors = c("SEX", "EDUCATION", "MARRIAGE"),
  pairwise = TRUE,
  max_factors = 10,
  min_occurrence = 3
)
##
|
| | 0%
|
|=================================================================| 100%
# Append the interaction columns to the base frame, then summarise every
# column of the combined frame.
h2o.dcc <- h2o.cbind(h2o.dcc1, h2o.dcc2)
summary(h2o.dcc, exact_quantiles = TRUE)
## ID LIMIT_BAL SEX EDUCATION MARRIAGE
## Min. : 1 Min. : 10000 2:18112 2:14030 2:15964
## 1st Qu.: 7501 1st Qu.: 50000 1:11888 1:10585 1:13659
## Median :15000 Median : 140000 3: 4917 3: 323
## Mean :15000 Mean : 167484 0: 468 0: 54
## 3rd Qu.:22500 3rd Qu.: 240000
## Max. :30000 Max. :1000000
## AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## Min. :21.00 0 :14737 0 :15730 0 :15764 0 :16455 0 :16947
## 1st Qu.:28.00 -1: 5686 -1: 6050 -1: 5938 -1: 5687 -1: 5539
## Median :34.00 1 : 3688 2 : 3927 -2: 4085 -2: 4348 -2: 4546
## Mean :35.49 -2: 2759 -2: 3782 2 : 3819 2 : 3159 2 : 2626
## 3rd Qu.:41.00 2 : 2667 3 : 326 3 : 240 3 : 180 3 : 178
## Max. :79.00 3 : 322 4 : 99 4 : 76 4 : 69 4 : 84
## PAY_6 BILL_AMT1 BILL_AMT2 BILL_AMT3
## 0 :16286 Min. :-165580 Min. :-69777 Min. :-157264
## -1: 5740 1st Qu.: 3559 1st Qu.: 2985 1st Qu.: 2666
## -2: 4895 Median : 22382 Median : 21200 Median : 20089
## 2 : 2766 Mean : 51223 Mean : 49179 Mean : 47013
## 3 : 184 3rd Qu.: 67091 3rd Qu.: 64006 3rd Qu.: 60165
## 4 : 49 Max. : 964511 Max. :983931 Max. :1664089
## BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1
## Min. :-170000 Min. :-81334 Min. :-339603 Min. : 0
## 1st Qu.: 2327 1st Qu.: 1763 1st Qu.: 1256 1st Qu.: 1000
## Median : 19052 Median : 18105 Median : 17071 Median : 2100
## Mean : 43263 Mean : 40311 Mean : 38872 Mean : 5664
## 3rd Qu.: 54506 3rd Qu.: 50191 3rd Qu.: 49198 3rd Qu.: 5006
## Max. : 891586 Max. :927171 Max. : 961664 Max. :873552
## PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5
## Min. : 0 Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 833 1st Qu.: 390 1st Qu.: 296 1st Qu.: 252.5
## Median : 2009 Median : 1800 Median : 1500 Median : 1500.0
## Mean : 5921 Mean : 5226 Mean : 4826 Mean : 4799.4
## 3rd Qu.: 5000 3rd Qu.: 4505 3rd Qu.: 4013 3rd Qu.: 4031.5
## Max. :1684259 Max. :896040 Max. :621000 Max. :426529.0
## PAY_AMT6 default.payment.next.month SEX_EDUCATION SEX_MARRIAGE
## Min. : 0.0 0:23364 2_2:8656 2_2:9411
## 1st Qu.: 117.8 1: 6636 2_1:6231 2_1:8469
## Median : 1500.0 1_2:5374 1_2:6553
## Mean : 5215.5 1_1:4354 1_1:5190
## 3rd Qu.: 4000.0 2_3:2927 2_3: 192
## Max. :528666.0 1_3:1990 1_3: 131
## EDUCATION_MARRIAGE
## 2_2:7020
## 2_1:6842
## 1_2:6809
## 1_1:3722
## 3_1:2861
## 3_2:1909
# Split the combined frame 50/50 into a training and a test frame.
# A fixed seed makes the partition, and every metric derived from it,
# reproducible from run to run (the unseeded split was random each time).
dcc.split <- h2o.splitFrame(h2o.dcc, ratios = 0.5, seed = 2019)
train <- dcc.split[[1]]
test <- dcc.split[[2]]

# Response column and predictor set: demographics, their pairwise
# interactions, payment status, bill amounts and payment amounts.
yvar <- "default.payment.next.month"
xvar <- c(
  "LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE",
  "SEX_EDUCATION", "SEX_MARRIAGE", "EDUCATION_MARRIAGE",
  "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6",
  "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6",
  "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"
)

# Hyper-parameter grid: 3 x 2 x 2 = 12 GBM configurations.
parameters <- list(
  ntrees = c(25, 50, 100),
  max_depth = c(5, 10),
  min_rows = c(10, 20)
)

# NOTE(review): `test` doubles as the validation frame, so it drives both
# early stopping and the grid ranking. Metrics later computed on `test`
# are therefore optimistic; consider a separate validation split.
gbm1 <- h2o.grid(
  "gbm",
  grid_id = "grid-gbm1",
  x = xvar,
  y = yvar,
  hyper_params = parameters,
  training_frame = train,
  validation_frame = test,
  learn_rate = 0.05,
  learn_rate_annealing = 0.99,
  sample_rate = 0.8,
  col_sample_rate = 0.8,
  # Early stopping: halt a model once misclassification has not improved
  # by at least 0.001 over 5 scoring rounds.
  stopping_metric = "misclassification",
  stopping_rounds = 5,
  stopping_tolerance = 0.001,
  seed = 2019
)
##
|
| | 0%
|
|= | 1%
|
|== | 3%
|
|=== | 5%
|
|==== | 6%
|
|====== | 9%
|
|======== | 12%
|
|========= | 14%
|
|========== | 16%
|
|=============== | 23%
|
|================ | 24%
|
|==================== | 31%
|
|========================= | 38%
|
|========================== | 40%
|
|============================== | 46%
|
|====================================== | 59%
|
|======================================== | 61%
|
|================================================ | 74%
|
|========================================================== | 89%
|
|=================================================================| 100%
# Print the grid summary (hyper-parameters, model ids and the ranking metric).
gbm1
## H2O Grid Details
## ================
##
## Grid ID: grid-gbm1
## Used hyper parameters:
## - max_depth
## - min_rows
## - ntrees
## Number of models: 12
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by increasing logloss
## max_depth min_rows ntrees model_ids logloss
## 1 10 20.0 50 grid-gbm1_model_8 0.44740853405827563
## 2 10 20.0 25 grid-gbm1_model_4 0.44740853405827563
## 3 10 20.0 100 grid-gbm1_model_12 0.44740853405827563
## 4 5 10.0 25 grid-gbm1_model_1 0.448063732958489
## 5 10 10.0 50 grid-gbm1_model_6 0.4511820433768311
## 6 10 10.0 100 grid-gbm1_model_10 0.4511820433768311
## 7 10 10.0 25 grid-gbm1_model_2 0.4511820433768311
## 8 5 10.0 50 grid-gbm1_model_5 0.4558220202480993
## 9 5 10.0 100 grid-gbm1_model_9 0.4558220202480993
## 10 5 20.0 50 grid-gbm1_model_7 0.46980285486604145
## 11 5 20.0 25 grid-gbm1_model_3 0.46980285486604145
## 12 5 20.0 100 grid-gbm1_model_11 0.46980285486604145
# The grid lists its models by increasing logloss, so the first id is the
# lowest-logloss model. Fetch it and score it on the test frame.
# NOTE(review): `test` was also the grid's validation frame, so these numbers
# are not from an independent hold-out.
gbm1_best <- h2o.getModel(model_id = gbm1@model_ids[[1]])
h2o.performance(model = gbm1_best, newdata = test)
## H2OBinomialMetrics: gbm
##
## MSE: 0.1412706
## RMSE: 0.3758598
## LogLoss: 0.4474085
## Mean Per-Class Error: 0.3001775
## AUC: 0.7719399
## pr_auc: 0.5389481
## Gini: 0.5438799
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## 0 1 Error Rate
## 0 9789 1804 0.155611 =1804/11593
## 1 1485 1854 0.444744 =1485/3339
## Totals 11274 3658 0.220265 =3289/14932
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.245548 0.529941 216
## 2 max f2 0.146804 0.634098 323
## 3 max f0point5 0.371545 0.566401 144
## 4 max accuracy 0.443304 0.815497 110
## 5 max precision 0.724799 1.000000 0
## 6 max recall 0.090331 1.000000 399
## 7 max specificity 0.724799 1.000000 0
## 8 max absolute_mcc 0.338908 0.400061 162
## 9 max min_per_class_accuracy 0.192398 0.698697 267
## 10 max mean_per_class_accuracy 0.210093 0.705793 246
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Variable importance of the selected model: table first, then bar plot.
h2o.varimp(object = gbm1_best)
h2o.varimp_plot(model = gbm1_best)
Make sure to shut down H2O when you are finished.
# Stop the H2O cluster without asking for interactive confirmation.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
h2o.shutdown(prompt = FALSE)
## [1] TRUE