Tune and fit models for JEP paper using prepared dataset

Sendhil Mullainathan and Jann Spiess, Machine Learning: An Applied Econometric Approach, Journal of Economic Perspectives 31:2 (87-106), Spring 2017.

# Alternative (remote) location of the same file, kept for reference:
# datafile <- "http://web.pdx.edu/~crkl/BDE/data/ahs2011forjep.rdata"
# NOTE(review): the path below is machine-specific; adjust before running.
datafile <- "C:/Course19/BDE/data/ahs2011forjep.rdata"
# The file is read with readRDS() even though it carries an .rdata extension.
# rdata contains data, vars, and formula
rdata <- readRDS(datafile)

Use H2O with 2 CPUs

library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
# Start (or connect to) a local H2O cluster, allowing it 2 threads
h2o.init(nthreads = 2)
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\link_000\AppData\Local\Temp\RtmpwPztdh/h2o_link_000_started_from_r.out
##     C:\Users\link_000\AppData\Local\Temp\RtmpwPztdh/h2o_link_000_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         3 seconds 927 milliseconds 
##     H2O cluster timezone:       America/Los_Angeles 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.24.0.1 
##     H2O cluster version age:    22 days  
##     H2O cluster name:           H2O_started_from_R_link_000_zvn447 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.75 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  2 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.3 (2019-03-11)

Check data

Load base data frame and separate data for training and testing

# Convert the R data frame stored in rdata into an H2OFrame
data <- as.h2o(rdata$df)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Dimensions of the full H2OFrame
dim(data)
## [1] 51808   165
# Split on the "holdout" flag, following the paper:
# rows flagged "TRUE" form the test set, rows flagged "FALSE" the training set
test <- data[data["holdout"] == "TRUE", ]
train <- data[data["holdout"] == "FALSE", ]
# Confirm the sizes of the two partitions
dim(test)
## [1] 41808   165
dim(train)
## [1] 10000   165

Set up x and y variables

# Predictor names are taken from the prepared dataset; the response is LOGVALUE
xvar <- rdata$vars
yvar <- "LOGVALUE"

Grid search of several gradient boosted models

# Hyper-parameter grid: 3 tree counts x 3 depths = 9 candidate models
parameters <- list(
  ntrees = c(50, 100, 200),
  max_depth = c(5, 10, 20)
)
# Fit one GBM per grid point; every setting outside `parameters` is shared
# by all models in the grid.
# NOTE(review): `test` is passed as the validation frame, so the holdout data
# also feeds the stopping metric and the grid ranking -- confirm this is
# intended before treating holdout metrics as out-of-sample.
gbm1 <- h2o.grid(
  algorithm = "gbm",
  grid_id = "grid-gbm1",
  x = xvar,
  y = yvar,
  hyper_params = parameters,
  training_frame = train,
  validation_frame = test,
  # learning-rate settings
  learn_rate = 0.05,
  learn_rate_annealing = 0.99,
  # row and column subsampling
  sample_rate = 0.8,
  col_sample_rate = 0.8,
  # early-stopping criteria
  stopping_metric = "MSE",
  stopping_rounds = 5,
  stopping_tolerance = 0.001,
  # fixed seed for reproducibility
  seed = 2019
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |====                                                             |   5%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |====                                                             |   7%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=======                                                          |  10%
  |                                                                       
  |=======                                                          |  11%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |========                                                         |  13%
  |                                                                       
  |=========                                                        |  13%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==========                                                       |  15%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |===========                                                      |  16%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |============                                                     |  19%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |==============                                                   |  22%
  |                                                                       
  |===============                                                  |  24%
  |                                                                       
  |================                                                 |  24%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |=================                                                |  27%
  |                                                                       
  |==================                                               |  27%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |===================                                              |  30%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |=====================                                            |  32%
  |                                                                       
  |=====================                                            |  33%
  |                                                                       
  |======================                                           |  33%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |======================                                           |  35%
  |                                                                       
  |=======================                                          |  35%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |========================                                         |  36%
  |                                                                       
  |========================                                         |  37%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |==========================                                       |  39%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |==========================                                       |  41%
  |                                                                       
  |===========================                                      |  41%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |============================                                     |  42%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |=============================                                    |  44%
  |                                                                       
  |=============================                                    |  45%
  |                                                                       
  |==============================                                   |  46%
  |                                                                       
  |===============================                                  |  47%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=================================                                |  51%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |===================================                              |  53%
  |                                                                       
  |====================================                             |  55%
  |                                                                       
  |=====================================                            |  56%
  |                                                                       
  |======================================                           |  58%
  |                                                                       
  |======================================                           |  59%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |========================================                         |  62%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |=========================================                        |  64%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==========================================                       |  65%
  |                                                                       
  |===========================================                      |  66%
  |                                                                       
  |===========================================                      |  67%
  |                                                                       
  |============================================                     |  67%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |=============================================                    |  69%
  |                                                                       
  |=============================================                    |  70%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |===============================================                  |  73%
  |                                                                       
  |================================================                 |  74%
  |                                                                       
  |=================================================                |  75%
  |                                                                       
  |=================================================                |  76%
  |                                                                       
  |==================================================               |  76%
  |                                                                       
  |==================================================               |  77%
  |                                                                       
  |==================================================               |  78%
  |                                                                       
  |===================================================              |  78%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=====================================================            |  81%
  |                                                                       
  |=====================================================            |  82%
  |                                                                       
  |======================================================           |  82%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |======================================================           |  84%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |========================================================         |  87%
  |                                                                       
  |=========================================================        |  87%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |==========================================================       |  89%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |===========================================================      |  90%
  |                                                                       
  |===========================================================      |  91%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=============================================================    |  93%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |=============================================================    |  95%
  |                                                                       
  |==============================================================   |  95%
  |                                                                       
  |==============================================================   |  96%
  |                                                                       
  |===============================================================  |  96%
  |                                                                       
  |===============================================================  |  97%
  |                                                                       
  |===============================================================  |  98%
  |                                                                       
  |================================================================ |  98%
  |                                                                       
  |================================================================ |  99%
  |                                                                       
  |=================================================================|  99%
  |                                                                       
  |=================================================================| 100%
# Print the grid summary table
gbm1
## H2O Grid Details
## ================
## 
## Grid ID: grid-gbm1 
## Used hyper parameters: 
##   -  max_depth 
##   -  ntrees 
## Number of models: 9 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by increasing residual_deviance
##   max_depth ntrees         model_ids  residual_deviance
## 1        10    200 grid-gbm1_model_8 0.6333289562244249
## 2        10    100 grid-gbm1_model_5 0.6342787913610163
## 3        20    100 grid-gbm1_model_6 0.6353330645765853
## 4        20    200 grid-gbm1_model_9 0.6377380306949612
## 5         5    200 grid-gbm1_model_7 0.6378572400669323
## 6        20     50 grid-gbm1_model_3 0.6416335906754876
## 7        10     50 grid-gbm1_model_2 0.6440999474699027
## 8         5    100 grid-gbm1_model_4  0.646468642235362
## 9         5     50 grid-gbm1_model_1 0.6702573662138052

Pick the best model

# Rank the grid explicitly by MSE (lowest first) instead of relying on the
# implicit default ordering of gbm1@model_ids, then fetch the top model.
gbm1_ranked <- h2o.getGrid(gbm1@grid_id, sort_by = "mse", decreasing = FALSE)
gbm1_best <- h2o.getModel(gbm1_ranked@model_ids[[1]])
# Performance of the selected model on the holdout frame.
# NOTE(review): `test` was also the validation frame used to rank the grid,
# so these metrics may be optimistic -- confirm.
h2o.performance(gbm1_best, test)
## H2ORegressionMetrics: gbm
## 
## MSE:  0.633329
## RMSE:  0.7958197
## MAE:  0.4604087
## RMSLE:  0.1027878
## Mean Residual Deviance :  0.633329
# Variable importance of the selected model: table first, then plot
h2o.varimp(gbm1_best)
h2o.varimp_plot(gbm1_best)

End of H2O

Make sure to shut down H2O

# Stop the H2O cluster without asking for interactive confirmation.
# Use FALSE rather than F: F is an ordinary variable that can be reassigned.
h2o.shutdown(prompt = FALSE)
## [1] TRUE