https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for the predictive accuracy of probability of default of credit card clients. Expert Systems with Applications, 36(2), 2473-2480.
# Load the UCI "default of credit card clients" data into a data frame.
dcc <- read.csv("C:/Course19/BDE/data/UCI_Credit_Card.csv")
dim(dcc) # 30000 obs. of 25 variables
## [1] 30000 25

# EDUCATION: 1=graduate school, 2=university, 3=high school, 0,4,5,6=others.
# Fold codes 4, 5 and 6 into 0 so that every "others" code shares one level.
# This must happen while the column is still numeric, i.e. before the
# factor conversion below.
dcc$EDUCATION <- ifelse(dcc$EDUCATION > 3, 0, dcc$EDUCATION)

# Columns holding category codes rather than quantities:
#   SEX:      1=male, 2=female
#   MARRIAGE: 1=married, 2=single, 3=divorce, 0=others
#   PAY_0, PAY_2, ..., PAY_6: payment status as of 9/2005, 8/2005, 7/2005,
#     6/2005, 5/2005 and 4/2005 respectively
#     (-2=no consumption, -1=paid duly, #=number of months payment is delayed)
#   default.payment.next.month: the response variable
# Everything else (LIMIT_BAL, AGE, BILL_AMT1..6, PAY_AMT1..6) stays numeric.
to_factor <- c(
  "SEX", "EDUCATION", "MARRIAGE",
  "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6",
  "default.payment.next.month"
)
dcc[to_factor] <- lapply(dcc[to_factor], as.factor)
# numeric data: LIMIT_BAL, BILL_AMT1, ..., BILL_AMT6; PAY_AMT1, ..., PAY_AMT6, AGE
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Start (or connect to) a local H2O cluster, limited to two CPU threads.
h2o.init(nthreads = 2)
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\link_000\AppData\Local\Temp\Rtmpyu8nrY/h2o_link_000_started_from_r.out
## C:\Users\link_000\AppData\Local\Temp\Rtmpyu8nrY/h2o_link_000_started_from_r.err
##
##
## Starting H2O JVM and connecting: Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 3 seconds 920 milliseconds
## H2O cluster timezone: America/Los_Angeles
## H2O data parsing timezone: UTC
## H2O cluster version: 3.24.0.1
## H2O cluster version age: 20 days
## H2O cluster name: H2O_started_from_R_link_000_sfn818
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.75 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 2
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.3 (2019-03-11)
# Upload the prepared R data frame to the H2O cluster as an H2OFrame.
h2o.dcc1 <- as.h2o(dcc)
##
|
| | 0%
|
|=================================================================| 100%
# Build pairwise interaction factors from the three demographic columns,
# yielding SEX_EDUCATION, SEX_MARRIAGE and EDUCATION_MARRIAGE.
# Per the H2O documentation, max_factors caps the number of levels kept per
# interaction column and min_occurrence is the minimum count a level needs
# to be kept.
# `TRUE` is spelled out because the shorthand `T` is an ordinary variable
# that can be reassigned.
h2o.dcc2 <- h2o.interaction(
  h2o.dcc1,
  factors = c("SEX", "EDUCATION", "MARRIAGE"),
  pairwise = TRUE,
  max_factors = 10,
  min_occurrence = 3
)
##
|
| | 0%
|
|=================================================================| 100%
# Append the interaction columns to the original frame, then print a
# per-column summary using exact quantiles.
h2o.dcc <- h2o.cbind(h2o.dcc1, h2o.dcc2)
summary(h2o.dcc, exact_quantiles = TRUE)
## ID LIMIT_BAL SEX EDUCATION MARRIAGE
## Min. : 1 Min. : 10000 2:18112 2:14030 2:15964
## 1st Qu.: 7501 1st Qu.: 50000 1:11888 1:10585 1:13659
## Median :15000 Median : 140000 3: 4917 3: 323
## Mean :15000 Mean : 167484 0: 468 0: 54
## 3rd Qu.:22500 3rd Qu.: 240000
## Max. :30000 Max. :1000000
## AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## Min. :21.00 0 :14737 0 :15730 0 :15764 0 :16455 0 :16947
## 1st Qu.:28.00 -1: 5686 -1: 6050 -1: 5938 -1: 5687 -1: 5539
## Median :34.00 1 : 3688 2 : 3927 -2: 4085 -2: 4348 -2: 4546
## Mean :35.49 -2: 2759 -2: 3782 2 : 3819 2 : 3159 2 : 2626
## 3rd Qu.:41.00 2 : 2667 3 : 326 3 : 240 3 : 180 3 : 178
## Max. :79.00 3 : 322 4 : 99 4 : 76 4 : 69 4 : 84
## PAY_6 BILL_AMT1 BILL_AMT2 BILL_AMT3
## 0 :16286 Min. :-165580 Min. :-69777 Min. :-157264
## -1: 5740 1st Qu.: 3559 1st Qu.: 2985 1st Qu.: 2666
## -2: 4895 Median : 22382 Median : 21200 Median : 20089
## 2 : 2766 Mean : 51223 Mean : 49179 Mean : 47013
## 3 : 184 3rd Qu.: 67091 3rd Qu.: 64006 3rd Qu.: 60165
## 4 : 49 Max. : 964511 Max. :983931 Max. :1664089
## BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1
## Min. :-170000 Min. :-81334 Min. :-339603 Min. : 0
## 1st Qu.: 2327 1st Qu.: 1763 1st Qu.: 1256 1st Qu.: 1000
## Median : 19052 Median : 18105 Median : 17071 Median : 2100
## Mean : 43263 Mean : 40311 Mean : 38872 Mean : 5664
## 3rd Qu.: 54506 3rd Qu.: 50191 3rd Qu.: 49198 3rd Qu.: 5006
## Max. : 891586 Max. :927171 Max. : 961664 Max. :873552
## PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5
## Min. : 0 Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 833 1st Qu.: 390 1st Qu.: 296 1st Qu.: 252.5
## Median : 2009 Median : 1800 Median : 1500 Median : 1500.0
## Mean : 5921 Mean : 5226 Mean : 4826 Mean : 4799.4
## 3rd Qu.: 5000 3rd Qu.: 4505 3rd Qu.: 4013 3rd Qu.: 4031.5
## Max. :1684259 Max. :896040 Max. :621000 Max. :426529.0
## PAY_AMT6 default.payment.next.month SEX_EDUCATION SEX_MARRIAGE
## Min. : 0.0 0:23364 2_2:8656 2_2:9411
## 1st Qu.: 117.8 1: 6636 2_1:6231 2_1:8469
## Median : 1500.0 1_2:5374 1_2:6553
## Mean : 5215.5 1_1:4354 1_1:5190
## 3rd Qu.: 4000.0 2_3:2927 2_3: 192
## Max. :528666.0 1_3:1990 1_3: 131
## EDUCATION_MARRIAGE
## 2_2:7020
## 2_1:6842
## 1_2:6809
## 1_1:3722
## 3_1:2861
## 3_2:1909
# Split the combined frame roughly 50/50 into training and test partitions.
# A fixed seed makes the random split repeatable; without it every run
# produces a different partition, so results cannot be reproduced even
# though the grid search itself is seeded. 2019 matches the grid search seed.
dcc.split <- h2o.splitFrame(h2o.dcc, ratios = 0.5, seed = 2019)
train <- dcc.split[[1]]
test <- dcc.split[[2]]

# Response column.
yvar <- "default.payment.next.month"

# Predictor columns: demographics, their pairwise interactions, the six
# payment-status factors, and the six bill / payment amounts. The row
# identifier ID is deliberately not listed.
xvar <- c(
  "LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE",
  "SEX_EDUCATION", "SEX_MARRIAGE", "EDUCATION_MARRIAGE",
  paste0("PAY_", c(0, 2:6)),
  paste0("BILL_AMT", 1:6),
  paste0("PAY_AMT", 1:6)
)
Default neural network settings: hidden=c(200,200), epochs=10, Bernoulli distribution, logloss criterion, Rectifier activation.
# Hyper-parameter grid: three hidden-layer layouts (one, two or three layers)
# crossed with two activation functions, i.e. 3 x 2 = 6 candidate networks.
parameters <- list(
  hidden = list(100, c(100, 50), c(100, 50, 10)),
  activation = c("Tanh", "Rectifier")
)

# Train one deep-learning model per grid cell.
# epochs = 1000 is only an upper bound: the stopping_* arguments request
# early stopping on the misclassification metric (5 rounds, tolerance 0.001).
# NOTE(review): validation_frame is the `test` split, so early stopping (and
# presumably the grid's metric ranking) is driven by the same data one would
# use for a final hold-out estimate -- confirm this is intended.
dlm0 <- h2o.grid(
  algorithm = "deeplearning",
  grid_id = "grid-dlm0",
  x = xvar,
  y = yvar,
  training_frame = train,
  validation_frame = test,
  hyper_params = parameters,
  epochs = 1000,
  seed = 2019,
  stopping_metric = "misclassification",
  stopping_rounds = 5,
  stopping_tolerance = 0.001
)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|============================================== | 70%
|
|============================================== | 71%
|
|========================================================= | 87%
|
|=================================================================| 100%
# Auto-print the grid object to show the hyper-parameter search summary
# (one row per trained model).
dlm0
## H2O Grid Details
## ================
##
## Grid ID: grid-dlm0
## Used hyper parameters:
## - activation
## - hidden
## Number of models: 6
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by increasing logloss
## activation hidden model_ids logloss
## 1 Tanh [100, 50, 10] grid-dlm0_model_5 0.4462544620940534
## 2 Tanh [100] grid-dlm0_model_1 0.45122373954294015
## 3 Rectifier [100, 50] grid-dlm0_model_4 0.460287994396815
## 4 Tanh [100, 50] grid-dlm0_model_3 0.4609623645324703
## 5 Rectifier [100] grid-dlm0_model_2 0.46241674445137326
## 6 Rectifier [100, 50, 10] grid-dlm0_model_6 0.47193839353311295
Make sure to shut down H2O when finished, so the local cluster does not keep running in the background.
# Stop the H2O cluster without asking for interactive confirmation.
# `FALSE` is spelled out because the shorthand `F` is an ordinary variable
# that can be reassigned.
h2o.shutdown(prompt = FALSE)
## [1] TRUE