Default of Credit Card Clients (data from UCI data archive)

https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for the predictive accuracy of probability of default of credit card clients. Expert Systems with Applications, 36(2), 2473-2480.

dcc<-read.csv("C:/Course19/BDE/data/UCI_Credit_Card.csv")
dim(dcc)  # 30000 obs. of 25 variables

## [1] 30000    25

Redefine factor variables

# SEX: 1=male, 2=female
dcc$SEX<-as.factor(dcc$SEX) 
# EDUCATION: 1=graduate school, 2=university, 3=high school, 0,4,5,6=others
dcc$EDUCATION<-ifelse(dcc$EDUCATION>3,0,dcc$EDUCATION)
dcc$EDUCATION<-as.factor(dcc$EDUCATION) 
# MARRIAGE: 1=married, 2=single, 3=divorce, 0=others
dcc$MARRIAGE<-as.factor(dcc$MARRIAGE)
# payment status: -2=no consumption, -1=pay on due, #=delayed months of payment
dcc$PAY_0<-as.factor(dcc$PAY_0) # payment status as of 9/2005
dcc$PAY_2<-as.factor(dcc$PAY_2) # payment status as of 8/2005
dcc$PAY_3<-as.factor(dcc$PAY_3) # payment status as of 7/2005
dcc$PAY_4<-as.factor(dcc$PAY_4) # payment status as of 6/2005
dcc$PAY_5<-as.factor(dcc$PAY_5) # payment status as of 5/2005
dcc$PAY_6<-as.factor(dcc$PAY_6) # payment status as of 4/2005
dcc$default.payment.next.month<-as.factor(dcc$default.payment.next.month)
# numeric data: LIMIT_BAL, BILL_AMT1, ..., BILL_AMT6; PAY_AMT1, ..., PAY_AMT6, AGE

Using h2o package with 2 CPU only

library(h2o)

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## Attaching package: 'h2o'

## The following objects are masked from 'package:stats':
## 
##     cor, sd, var

## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

h2o.init(nthreads=2)

## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\link_000\AppData\Local\Temp\RtmpoTVv4Q/h2o_link_000_started_from_r.out
##     C:\Users\link_000\AppData\Local\Temp\RtmpoTVv4Q/h2o_link_000_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         4 seconds 3 milliseconds 
##     H2O cluster timezone:       America/Los_Angeles 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.24.0.1 
##     H2O cluster version age:    22 days  
##     H2O cluster name:           H2O_started_from_R_link_000_uch551 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.75 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  2 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.3 (2019-03-11)

Load base data frame, convert to h2oFrame

h2o.dcc1<-as.h2o(dcc)

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

Include interaction variables

h2o.dcc2<-h2o.interaction(h2o.dcc1,
                factors=c("SEX","EDUCATION","MARRIAGE"),
                pairwise=T,max_factors=10,min_occurrence=3)

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

h2o.dcc<-h2o.cbind(h2o.dcc1,h2o.dcc2)
summary(h2o.dcc,exact_quantiles=TRUE)

##  ID              LIMIT_BAL         SEX      EDUCATION MARRIAGE
##  Min.   :    1   Min.   :  10000   2:18112  2:14030   2:15964 
##  1st Qu.: 7501   1st Qu.:  50000   1:11888  1:10585   1:13659 
##  Median :15000   Median : 140000            3: 4917   3:  323 
##  Mean   :15000   Mean   : 167484            0:  468   0:   54 
##  3rd Qu.:22500   3rd Qu.: 240000                              
##  Max.   :30000   Max.   :1000000                              
##  AGE             PAY_0     PAY_2     PAY_3     PAY_4     PAY_5    
##  Min.   :21.00   0 :14737  0 :15730  0 :15764  0 :16455  0 :16947 
##  1st Qu.:28.00   -1: 5686  -1: 6050  -1: 5938  -1: 5687  -1: 5539 
##  Median :34.00   1 : 3688  2 : 3927  -2: 4085  -2: 4348  -2: 4546 
##  Mean   :35.49   -2: 2759  -2: 3782  2 : 3819  2 : 3159  2 : 2626 
##  3rd Qu.:41.00   2 : 2667  3 :  326  3 :  240  3 :  180  3 :  178 
##  Max.   :79.00   3 :  322  4 :   99  4 :   76  4 :   69  4 :   84 
##  PAY_6     BILL_AMT1         BILL_AMT2        BILL_AMT3        
##  0 :16286  Min.   :-165580   Min.   :-69777   Min.   :-157264  
##  -1: 5740  1st Qu.:   3559   1st Qu.:  2985   1st Qu.:   2666  
##  -2: 4895  Median :  22382   Median : 21200   Median :  20089  
##  2 : 2766  Mean   :  51223   Mean   : 49179   Mean   :  47013  
##  3 :  184  3rd Qu.:  67091   3rd Qu.: 64006   3rd Qu.:  60165  
##  4 :   49  Max.   : 964511   Max.   :983931   Max.   :1664089  
##  BILL_AMT4         BILL_AMT5        BILL_AMT6         PAY_AMT1        
##  Min.   :-170000   Min.   :-81334   Min.   :-339603   Min.   :     0  
##  1st Qu.:   2327   1st Qu.:  1763   1st Qu.:   1256   1st Qu.:  1000  
##  Median :  19052   Median : 18105   Median :  17071   Median :  2100  
##  Mean   :  43263   Mean   : 40311   Mean   :  38872   Mean   :  5664  
##  3rd Qu.:  54506   3rd Qu.: 50191   3rd Qu.:  49198   3rd Qu.:  5006  
##  Max.   : 891586   Max.   :927171   Max.   : 961664   Max.   :873552  
##  PAY_AMT2          PAY_AMT3         PAY_AMT4         PAY_AMT5          
##  Min.   :      0   Min.   :     0   Min.   :     0   Min.   :     0.0  
##  1st Qu.:    833   1st Qu.:   390   1st Qu.:   296   1st Qu.:   252.5  
##  Median :   2009   Median :  1800   Median :  1500   Median :  1500.0  
##  Mean   :   5921   Mean   :  5226   Mean   :  4826   Mean   :  4799.4  
##  3rd Qu.:   5000   3rd Qu.:  4505   3rd Qu.:  4013   3rd Qu.:  4031.5  
##  Max.   :1684259   Max.   :896040   Max.   :621000   Max.   :426529.0  
##  PAY_AMT6           default.payment.next.month SEX_EDUCATION SEX_MARRIAGE
##  Min.   :     0.0   0:23364                    2_2:8656      2_2:9411    
##  1st Qu.:   117.8   1: 6636                    2_1:6231      2_1:8469    
##  Median :  1500.0                              1_2:5374      1_2:6553    
##  Mean   :  5215.5                              1_1:4354      1_1:5190    
##  3rd Qu.:  4000.0                              2_3:2927      2_3: 192    
##  Max.   :528666.0                              1_3:1990      1_3: 131    
##  EDUCATION_MARRIAGE
##  2_2:7020          
##  2_1:6842          
##  1_2:6809          
##  1_1:3722          
##  3_1:2861          
##  3_2:1909

Split data for training and testing

dcc.split<-h2o.splitFrame(h2o.dcc,0.5)
train<-dcc.split[[1]]
test<-dcc.split[[2]]

Setup x and y variables

yvar<-"default.payment.next.month"
xvar<-c("LIMIT_BAL","SEX","EDUCATION","MARRIAGE","AGE",
        "SEX_EDUCATION","SEX_MARRIAGE","EDUCATION_MARRIAGE",
        "PAY_0","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6",
        "BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6",
        "PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6")

Grid search of several gradient boosted models

parameters<-list(ntrees=c(25,50,100),
                 max_depth=c(5,10),min_rows=c(10,20))
gbm1<-h2o.grid("gbm",
               grid_id="grid-gbm1",
               x=xvar,y=yvar,
               hyper_params=parameters,
               training_frame=train,
               validation_frame=test,
               learn_rate=0.05,
               learn_rate_annealing=0.99,
               sample_rate=0.8,
               col_sample_rate=0.8,
               stopping_metric="misclassification",
               stopping_rounds=5,
               stopping_tolerance=0.001,
               seed=2019)

## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |===============                                                  |  23%
  |                                                                       
  |================                                                 |  24%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |==============================                                   |  46%
  |                                                                       
  |======================================                           |  59%
  |                                                                       
  |========================================                         |  61%
  |                                                                       
  |================================================                 |  74%
  |                                                                       
  |==========================================================       |  89%
  |                                                                       
  |=================================================================| 100%

gbm1

## H2O Grid Details
## ================
## 
## Grid ID: grid-gbm1 
## Used hyper parameters: 
##   -  max_depth 
##   -  min_rows 
##   -  ntrees 
## Number of models: 12 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by increasing logloss
##    max_depth min_rows ntrees          model_ids             logloss
## 1         10     20.0     50  grid-gbm1_model_8 0.44740853405827563
## 2         10     20.0     25  grid-gbm1_model_4 0.44740853405827563
## 3         10     20.0    100 grid-gbm1_model_12 0.44740853405827563
## 4          5     10.0     25  grid-gbm1_model_1   0.448063732958489
## 5         10     10.0     50  grid-gbm1_model_6  0.4511820433768311
## 6         10     10.0    100 grid-gbm1_model_10  0.4511820433768311
## 7         10     10.0     25  grid-gbm1_model_2  0.4511820433768311
## 8          5     10.0     50  grid-gbm1_model_5  0.4558220202480993
## 9          5     10.0    100  grid-gbm1_model_9  0.4558220202480993
## 10         5     20.0     50  grid-gbm1_model_7 0.46980285486604145
## 11         5     20.0     25  grid-gbm1_model_3 0.46980285486604145
## 12         5     20.0    100 grid-gbm1_model_11 0.46980285486604145

Pick up the best model

gbm1_best<-h2o.getModel(gbm1@model_ids[[1]]) 
h2o.performance(gbm1_best,test)

## H2OBinomialMetrics: gbm
## 
## MSE:  0.1412706
## RMSE:  0.3758598
## LogLoss:  0.4474085
## Mean Per-Class Error:  0.3001775
## AUC:  0.7719399
## pr_auc:  0.5389481
## Gini:  0.5438799
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##            0    1    Error         Rate
## 0       9789 1804 0.155611  =1804/11593
## 1       1485 1854 0.444744   =1485/3339
## Totals 11274 3658 0.220265  =3289/14932
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.245548 0.529941 216
## 2                       max f2  0.146804 0.634098 323
## 3                 max f0point5  0.371545 0.566401 144
## 4                 max accuracy  0.443304 0.815497 110
## 5                max precision  0.724799 1.000000   0
## 6                   max recall  0.090331 1.000000 399
## 7              max specificity  0.724799 1.000000   0
## 8             max absolute_mcc  0.338908 0.400061 162
## 9   max min_per_class_accuracy  0.192398 0.698697 267
## 10 max mean_per_class_accuracy  0.210093 0.705793 246
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`

h2o.varimp(gbm1_best)

h2o.varimp_plot(gbm1_best)

End of H2O

Make sure to shutdown h2o

h2o.shutdown(prompt=F)

## [1] TRUE

case_dcc_gbm_2