Default of Credit Card Clients (data from UCI data archive)

https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for the predictive accuracy of probability of default of credit card clients. Expert Systems with Applications, 36(2), 2473-2480.

dcc<-read.csv("C:/Course19/BDE/data/UCI_Credit_Card.csv")
dim(dcc)  # 30000 obs. of 25 variables
## [1] 30000    25

Redefine factor variables

# SEX: 1=male, 2=female
dcc$SEX<-as.factor(dcc$SEX) 
# EDUCATION: 1=graduate school, 2=university, 3=high school, 0,4,5,6=others
dcc$EDUCATION<-ifelse(dcc$EDUCATION>3,0,dcc$EDUCATION)
dcc$EDUCATION<-as.factor(dcc$EDUCATION) 
# MARRIAGE: 1=married, 2=single, 3=divorce, 0=others
dcc$MARRIAGE<-as.factor(dcc$MARRIAGE)
# payment status: -2=no consumption, -1=pay on due, #=delayed months of payment
dcc$PAY_0<-as.factor(dcc$PAY_0) # payment status as of 9/2005
dcc$PAY_2<-as.factor(dcc$PAY_2) # payment status as of 8/2005
dcc$PAY_3<-as.factor(dcc$PAY_3) # payment status as of 7/2005
dcc$PAY_4<-as.factor(dcc$PAY_4) # payment status as of 6/2005
dcc$PAY_5<-as.factor(dcc$PAY_5) # payment status as of 5/2005
dcc$PAY_6<-as.factor(dcc$PAY_6) # payment status as of 4/2005
dcc$default.payment.next.month<-as.factor(dcc$default.payment.next.month)
# numeric data: LIMIT_BAL, BILL_AMT1, ..., BILL_AMT6; PAY_AMT1, ..., PAY_AMT6, AGE

Using h2o package with 2 CPU only

library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
h2o.init(nthreads=2)
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\link_000\AppData\Local\Temp\RtmpKqrk50/h2o_link_000_started_from_r.out
##     C:\Users\link_000\AppData\Local\Temp\RtmpKqrk50/h2o_link_000_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         4 seconds 234 milliseconds 
##     H2O cluster timezone:       America/Los_Angeles 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.24.0.1 
##     H2O cluster version age:    21 days, 22 hours and 51 minutes  
##     H2O cluster name:           H2O_started_from_R_link_000_ueb378 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.75 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  2 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.3 (2019-03-11)

Load base data frame, convert to h2oFrame

h2o.dcc1<-as.h2o(dcc)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

Include interaction variables

h2o.dcc2<-h2o.interaction(h2o.dcc1,
                factors=c("SEX","EDUCATION","MARRIAGE"),
                pairwise=T,max_factors=10,min_occurrence=3)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
h2o.dcc<-h2o.cbind(h2o.dcc1,h2o.dcc2)
summary(h2o.dcc,exact_quantiles=TRUE)
##  ID              LIMIT_BAL         SEX      EDUCATION MARRIAGE
##  Min.   :    1   Min.   :  10000   2:18112  2:14030   2:15964 
##  1st Qu.: 7501   1st Qu.:  50000   1:11888  1:10585   1:13659 
##  Median :15000   Median : 140000            3: 4917   3:  323 
##  Mean   :15000   Mean   : 167484            0:  468   0:   54 
##  3rd Qu.:22500   3rd Qu.: 240000                              
##  Max.   :30000   Max.   :1000000                              
##  AGE             PAY_0     PAY_2     PAY_3     PAY_4     PAY_5    
##  Min.   :21.00   0 :14737  0 :15730  0 :15764  0 :16455  0 :16947 
##  1st Qu.:28.00   -1: 5686  -1: 6050  -1: 5938  -1: 5687  -1: 5539 
##  Median :34.00   1 : 3688  2 : 3927  -2: 4085  -2: 4348  -2: 4546 
##  Mean   :35.49   -2: 2759  -2: 3782  2 : 3819  2 : 3159  2 : 2626 
##  3rd Qu.:41.00   2 : 2667  3 :  326  3 :  240  3 :  180  3 :  178 
##  Max.   :79.00   3 :  322  4 :   99  4 :   76  4 :   69  4 :   84 
##  PAY_6     BILL_AMT1         BILL_AMT2        BILL_AMT3        
##  0 :16286  Min.   :-165580   Min.   :-69777   Min.   :-157264  
##  -1: 5740  1st Qu.:   3559   1st Qu.:  2985   1st Qu.:   2666  
##  -2: 4895  Median :  22382   Median : 21200   Median :  20089  
##  2 : 2766  Mean   :  51223   Mean   : 49179   Mean   :  47013  
##  3 :  184  3rd Qu.:  67091   3rd Qu.: 64006   3rd Qu.:  60165  
##  4 :   49  Max.   : 964511   Max.   :983931   Max.   :1664089  
##  BILL_AMT4         BILL_AMT5        BILL_AMT6         PAY_AMT1        
##  Min.   :-170000   Min.   :-81334   Min.   :-339603   Min.   :     0  
##  1st Qu.:   2327   1st Qu.:  1763   1st Qu.:   1256   1st Qu.:  1000  
##  Median :  19052   Median : 18105   Median :  17071   Median :  2100  
##  Mean   :  43263   Mean   : 40311   Mean   :  38872   Mean   :  5664  
##  3rd Qu.:  54506   3rd Qu.: 50191   3rd Qu.:  49198   3rd Qu.:  5006  
##  Max.   : 891586   Max.   :927171   Max.   : 961664   Max.   :873552  
##  PAY_AMT2          PAY_AMT3         PAY_AMT4         PAY_AMT5          
##  Min.   :      0   Min.   :     0   Min.   :     0   Min.   :     0.0  
##  1st Qu.:    833   1st Qu.:   390   1st Qu.:   296   1st Qu.:   252.5  
##  Median :   2009   Median :  1800   Median :  1500   Median :  1500.0  
##  Mean   :   5921   Mean   :  5226   Mean   :  4826   Mean   :  4799.4  
##  3rd Qu.:   5000   3rd Qu.:  4505   3rd Qu.:  4013   3rd Qu.:  4031.5  
##  Max.   :1684259   Max.   :896040   Max.   :621000   Max.   :426529.0  
##  PAY_AMT6           default.payment.next.month SEX_EDUCATION SEX_MARRIAGE
##  Min.   :     0.0   0:23364                    2_2:8656      2_2:9411    
##  1st Qu.:   117.8   1: 6636                    2_1:6231      2_1:8469    
##  Median :  1500.0                              1_2:5374      1_2:6553    
##  Mean   :  5215.5                              1_1:4354      1_1:5190    
##  3rd Qu.:  4000.0                              2_3:2927      2_3: 192    
##  Max.   :528666.0                              1_3:1990      1_3: 131    
##  EDUCATION_MARRIAGE
##  2_2:7020          
##  2_1:6842          
##  1_2:6809          
##  1_1:3722          
##  3_1:2861          
##  3_2:1909

Split data for training and testing

dcc.split<-h2o.splitFrame(h2o.dcc,0.5)
train<-dcc.split[[1]]
test<-dcc.split[[2]]

Setup x and y variables

yvar<-"default.payment.next.month"
xvar<-c("LIMIT_BAL","SEX","EDUCATION","MARRIAGE","AGE",
        "SEX_EDUCATION","SEX_MARRIAGE","EDUCATION_MARRIAGE",
        "PAY_0","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6",
        "BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6",
        "PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6")

Grid search of random forest models

default random forest: ntrees=50, max_depth=20, min_rows=1, sample rate=0.632, col_sample_rate_per_tree = 1, mtries=-1 search_criteria=“Cartesian”

parameters<-list(ntrees=c(25,50,100),
                 max_depth=c(20,30,40),mtries=c(10,25))
rfm0<-h2o.grid("randomForest",
               grid_id="grid-rfm0",
               x=xvar,y=yvar,seed=2019,
               hyper_params=parameters,
               training_frame=train,
               validation_frame=test,
               stopping_metric="misclassification",
               stopping_rounds=5,
               stopping_tolerance=0.001
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |                                                                 |   1%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |==                                                               |   4%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |=======                                                          |  10%
  |                                                                       
  |=======                                                          |  11%
  |                                                                       
  |=======                                                          |  12%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |========                                                         |  13%
  |                                                                       
  |=========                                                        |  13%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |=========                                                        |  15%
  |                                                                       
  |==========                                                       |  15%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |===========                                                      |  16%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |============                                                     |  19%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |=================                                                |  27%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |=======================                                          |  35%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |========================                                         |  37%
  |                                                                       
  |========================                                         |  38%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |==========================                                       |  41%
  |                                                                       
  |===========================                                      |  41%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |============================                                     |  44%
  |                                                                       
  |=============================                                    |  44%
  |                                                                       
  |=============================                                    |  45%
  |                                                                       
  |==============================                                   |  46%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |================================                                 |  49%
  |                                                                       
  |=================================                                |  51%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |====================================                             |  55%
  |                                                                       
  |=========================================                        |  64%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==========================================                       |  65%
  |                                                                       
  |===========================================                      |  65%
  |                                                                       
  |===========================================                      |  67%
  |                                                                       
  |============================================                     |  67%
  |                                                                       
  |=============================================                    |  69%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |===============================================                  |  73%
  |                                                                       
  |================================================                 |  73%
  |                                                                       
  |================================================                 |  74%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |======================================================           |  84%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |========================================================         |  85%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |=========================================================        |  87%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |==========================================================       |  89%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |===========================================================      |  91%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=============================================================    |  93%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |=================================================================| 100%
rfm0
## H2O Grid Details
## ================
## 
## Grid ID: grid-rfm0 
## Used hyper parameters: 
##   -  max_depth 
##   -  mtries 
##   -  ntrees 
## Number of models: 18 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by increasing logloss
##    max_depth mtries ntrees          model_ids             logloss
## 1         20     10    100 grid-rfm0_model_13  0.4411423060060494
## 2         20     10     50  grid-rfm0_model_7  0.4451022493273394
## 3         20     10     25  grid-rfm0_model_1 0.45297528008542437
## 4         30     25    100 grid-rfm0_model_17 0.45497000587010356
## 5         40     10    100 grid-rfm0_model_15 0.46025910979557516
## 6         30     25     50 grid-rfm0_model_11  0.4854970671571419
## 7         20     25     50 grid-rfm0_model_10  0.4894655622125987
## 8         20     25     25  grid-rfm0_model_4  0.4894655622125987
## 9         20     25    100 grid-rfm0_model_16  0.4894655622125987
## 10        40     10     50  grid-rfm0_model_9  0.5075271599094806
## 11        40     25     50 grid-rfm0_model_12  0.5190431963519798
## 12        30     25     25  grid-rfm0_model_5  0.5764874872118687
## 13        40     10     25  grid-rfm0_model_3  0.6095771373853845
## 14        40     25     25  grid-rfm0_model_6  0.6654217548761822
## 15        30     10    100 grid-rfm0_model_14  0.7639572959545795
## 16        30     10     50  grid-rfm0_model_8  0.7639572959545795
## 17        30     10     25  grid-rfm0_model_2  0.7639572959545795
## 18        40     25    100 grid-rfm0_model_18  0.8140266825611048

Pick up the best model

rfm0_best<-h2o.getModel(rfm0@model_ids[[1]]) 
h2o.performance(rfm0_best,test)
## H2OBinomialMetrics: drf
## 
## MSE:  0.1390589
## RMSE:  0.3729061
## LogLoss:  0.4411423
## Mean Per-Class Error:  0.304081
## AUC:  0.7680122
## pr_auc:  0.532665
## Gini:  0.5360244
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##            0    1    Error         Rate
## 0      10071 1585 0.135981  =1585/11656
## 1       1570 1755 0.472180   =1570/3325
## Totals 11641 3340 0.210600  =3155/14981
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.341319 0.526632 186
## 2                       max f2  0.122528 0.631588 312
## 3                 max f0point5  0.475159 0.561334 133
## 4                 max accuracy  0.586919 0.815767  98
## 5                max precision  0.988142 1.000000   0
## 6                   max recall  0.011557 1.000000 396
## 7              max specificity  0.988142 1.000000   0
## 8             max absolute_mcc  0.389739 0.396222 166
## 9   max min_per_class_accuracy  0.201229 0.693548 254
## 10 max mean_per_class_accuracy  0.222066 0.698709 243
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
h2o.varimp(rfm0_best)
h2o.varimp_plot(rfm0_best)

End of H2O

Make sure to shutdown h2o

h2o.shutdown(prompt=F)
## [1] TRUE