Default of Credit Card Clients (data from UCI data archive)

Data source: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients

Reference: Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for the predictive accuracy of probability of default of credit card clients. Expert Systems with Applications, 36(2), 2473-2480.

# Read the credit-card default data set into a base data frame and
# confirm its size.
dcc <- read.csv(file = "C:/Course19/BDE/data/UCI_Credit_Card.csv")
dim(dcc) # expected: 30000 obs. of 25 variables
## [1] 30000    25

Redefine factor variables

# Recode the integer-coded categorical columns as factors.
#   SEX:       1 = male, 2 = female
#   EDUCATION: 1 = graduate school, 2 = university, 3 = high school;
#              codes above 3 (4, 5, 6) are folded into 0 = others
#   MARRIAGE:  1 = married, 2 = single, 3 = divorce, 0 = others
#   PAY_0, PAY_2, ..., PAY_6: payment status as of 9/2005, 8/2005, ...,
#              4/2005 (-2 = no consumption, -1 = paid on time,
#              positive value = number of months the payment is delayed)
#   default.payment.next.month: default indicator
dcc$EDUCATION[dcc$EDUCATION > 3] <- 0
factor_cols <- c("SEX", "EDUCATION", "MARRIAGE",
                 paste0("PAY_", c(0, 2:6)),
                 "default.payment.next.month")
dcc[factor_cols] <- lapply(dcc[factor_cols], as.factor)
# Left as numeric: LIMIT_BAL, AGE, BILL_AMT1, ..., BILL_AMT6,
# PAY_AMT1, ..., PAY_AMT6

Using the h2o package with only 2 CPU threads

library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
# Start (or connect to) a local H2O cluster, limited to 2 threads.
h2o.init(nthreads = 2)
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\link_000\AppData\Local\Temp\Rtmpyu8nrY/h2o_link_000_started_from_r.out
##     C:\Users\link_000\AppData\Local\Temp\Rtmpyu8nrY/h2o_link_000_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         3 seconds 920 milliseconds 
##     H2O cluster timezone:       America/Los_Angeles 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.24.0.1 
##     H2O cluster version age:    20 days  
##     H2O cluster name:           H2O_started_from_R_link_000_sfn818 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.75 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  2 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.3 (2019-03-11)

Load the base data frame and convert it to an H2OFrame

# Upload the prepared data frame to the H2O cluster.
h2o.dcc1 <- as.h2o(dcc)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

Include interaction variables

# Build pairwise interaction factors among SEX, EDUCATION and MARRIAGE
# (the resulting columns are SEX_EDUCATION, SEX_MARRIAGE and
# EDUCATION_MARRIAGE). max_factors and min_occurrence limit which level
# combinations are kept -- see the h2o.interaction documentation.
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
h2o.dcc2 <- h2o.interaction(h2o.dcc1,
                            factors = c("SEX", "EDUCATION", "MARRIAGE"),
                            pairwise = TRUE,
                            max_factors = 10,
                            min_occurrence = 3)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Append the interaction columns to the original frame, then summarise
# every column of the combined frame.
h2o.dcc <- h2o.cbind(h2o.dcc1, h2o.dcc2)
summary(h2o.dcc, exact_quantiles = TRUE)
##  ID              LIMIT_BAL         SEX      EDUCATION MARRIAGE
##  Min.   :    1   Min.   :  10000   2:18112  2:14030   2:15964 
##  1st Qu.: 7501   1st Qu.:  50000   1:11888  1:10585   1:13659 
##  Median :15000   Median : 140000            3: 4917   3:  323 
##  Mean   :15000   Mean   : 167484            0:  468   0:   54 
##  3rd Qu.:22500   3rd Qu.: 240000                              
##  Max.   :30000   Max.   :1000000                              
##  AGE             PAY_0     PAY_2     PAY_3     PAY_4     PAY_5    
##  Min.   :21.00   0 :14737  0 :15730  0 :15764  0 :16455  0 :16947 
##  1st Qu.:28.00   -1: 5686  -1: 6050  -1: 5938  -1: 5687  -1: 5539 
##  Median :34.00   1 : 3688  2 : 3927  -2: 4085  -2: 4348  -2: 4546 
##  Mean   :35.49   -2: 2759  -2: 3782  2 : 3819  2 : 3159  2 : 2626 
##  3rd Qu.:41.00   2 : 2667  3 :  326  3 :  240  3 :  180  3 :  178 
##  Max.   :79.00   3 :  322  4 :   99  4 :   76  4 :   69  4 :   84 
##  PAY_6     BILL_AMT1         BILL_AMT2        BILL_AMT3        
##  0 :16286  Min.   :-165580   Min.   :-69777   Min.   :-157264  
##  -1: 5740  1st Qu.:   3559   1st Qu.:  2985   1st Qu.:   2666  
##  -2: 4895  Median :  22382   Median : 21200   Median :  20089  
##  2 : 2766  Mean   :  51223   Mean   : 49179   Mean   :  47013  
##  3 :  184  3rd Qu.:  67091   3rd Qu.: 64006   3rd Qu.:  60165  
##  4 :   49  Max.   : 964511   Max.   :983931   Max.   :1664089  
##  BILL_AMT4         BILL_AMT5        BILL_AMT6         PAY_AMT1        
##  Min.   :-170000   Min.   :-81334   Min.   :-339603   Min.   :     0  
##  1st Qu.:   2327   1st Qu.:  1763   1st Qu.:   1256   1st Qu.:  1000  
##  Median :  19052   Median : 18105   Median :  17071   Median :  2100  
##  Mean   :  43263   Mean   : 40311   Mean   :  38872   Mean   :  5664  
##  3rd Qu.:  54506   3rd Qu.: 50191   3rd Qu.:  49198   3rd Qu.:  5006  
##  Max.   : 891586   Max.   :927171   Max.   : 961664   Max.   :873552  
##  PAY_AMT2          PAY_AMT3         PAY_AMT4         PAY_AMT5          
##  Min.   :      0   Min.   :     0   Min.   :     0   Min.   :     0.0  
##  1st Qu.:    833   1st Qu.:   390   1st Qu.:   296   1st Qu.:   252.5  
##  Median :   2009   Median :  1800   Median :  1500   Median :  1500.0  
##  Mean   :   5921   Mean   :  5226   Mean   :  4826   Mean   :  4799.4  
##  3rd Qu.:   5000   3rd Qu.:  4505   3rd Qu.:  4013   3rd Qu.:  4031.5  
##  Max.   :1684259   Max.   :896040   Max.   :621000   Max.   :426529.0  
##  PAY_AMT6           default.payment.next.month SEX_EDUCATION SEX_MARRIAGE
##  Min.   :     0.0   0:23364                    2_2:8656      2_2:9411    
##  1st Qu.:   117.8   1: 6636                    2_1:6231      2_1:8469    
##  Median :  1500.0                              1_2:5374      1_2:6553    
##  Mean   :  5215.5                              1_1:4354      1_1:5190    
##  3rd Qu.:  4000.0                              2_3:2927      2_3: 192    
##  Max.   :528666.0                              1_3:1990      1_3: 131    
##  EDUCATION_MARRIAGE
##  2_2:7020          
##  2_1:6842          
##  1_2:6809          
##  1_1:3722          
##  3_1:2861          
##  3_2:1909

Split data for training and testing

# Split the combined frame roughly 50/50 into a training and a test frame.
# The split is random, so a fixed seed is supplied to make the partition
# (and everything trained on it) reproducible across runs; the value
# matches the seed used for model training.
dcc.split <- h2o.splitFrame(h2o.dcc, ratios = 0.5, seed = 2019)
train <- dcc.split[[1]]
test <- dcc.split[[2]]

Setup x and y variables

# Response column: default indicator for the following month.
yvar <- "default.payment.next.month"

# Predictor columns: credit limit and demographics, the three pairwise
# interaction factors, then the monthly payment-status, bill-amount and
# payment-amount series (note the status series has no PAY_1).
xvar <- c(
  "LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE",
  "SEX_EDUCATION", "SEX_MARRIAGE", "EDUCATION_MARRIAGE",
  paste0("PAY_", c(0, 2:6)),
  paste0("BILL_AMT", 1:6),
  paste0("PAY_AMT", 1:6)
)

Binary classification with neural nets

Default neural network: hidden=c(200,200), epochs=10, Bernoulli distribution, logloss criterion, Rectifier activation

# Hyper-parameter grid: three hidden-layer layouts crossed with two
# activation functions, i.e. 3 x 2 = 6 candidate networks.
parameters <- list(
  hidden = list(100, c(100, 50), c(100, 50, 10)),
  activation = c("Tanh", "Rectifier")
)

# Train one deep-learning classifier per grid point.
# epochs = 1000 is only an upper bound: the stopping_* arguments enable
# early stopping on the misclassification metric.
# NOTE(review): the `test` frame is also passed as the validation frame, so
# it presumably drives early stopping and the grid ranking -- metrics
# reported on it are then not an independent hold-out estimate; confirm
# whether that is intended.
dlm0 <- h2o.grid(
  algorithm = "deeplearning",
  grid_id = "grid-dlm0",
  x = xvar,
  y = yvar,
  hyper_params = parameters,
  training_frame = train,
  validation_frame = test,
  stopping_metric = "misclassification",
  stopping_rounds = 5,
  stopping_tolerance = 0.001,
  epochs = 1000,
  seed = 2019
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |                                                                 |   1%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |=======================                                          |  35%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |========================                                         |  36%
  |                                                                       
  |========================                                         |  37%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |=========================================================        |  87%
  |                                                                       
  |=================================================================| 100%
# Print the grid summary: the hyper-parameters searched and the fitted
# models, listed in order of increasing logloss.
dlm0
## H2O Grid Details
## ================
## 
## Grid ID: grid-dlm0 
## Used hyper parameters: 
##   -  activation 
##   -  hidden 
## Number of models: 6 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by increasing logloss
##   activation        hidden         model_ids             logloss
## 1       Tanh [100, 50, 10] grid-dlm0_model_5  0.4462544620940534
## 2       Tanh         [100] grid-dlm0_model_1 0.45122373954294015
## 3  Rectifier     [100, 50] grid-dlm0_model_4   0.460287994396815
## 4       Tanh     [100, 50] grid-dlm0_model_3  0.4609623645324703
## 5  Rectifier         [100] grid-dlm0_model_2 0.46241674445137326
## 6  Rectifier [100, 50, 10] grid-dlm0_model_6 0.47193839353311295

End of H2O

Make sure to shut down h2o

# Stop the H2O cluster without asking for interactive confirmation.
# Use FALSE rather than F: F is an ordinary variable that can be reassigned.
h2o.shutdown(prompt = FALSE)
## [1] TRUE