Tune and fit models for the JEP paper using the prepared dataset

Sendhil Mullainathan and Jann Spiess, Machine Learning: An Applied Econometric Approach, Journal of Economic Perspectives 31:2 (87-106), Spring 2017.

# Remote copy of the dataset (disabled in favour of the local file):
# datafile <- "http://web.pdx.edu/~crkl/BDE/data/ahs2011forjep.rdata"
datafile <- "C:/Course19/BDE/data/ahs2011forjep.rdata"
rdata <- readRDS(file = datafile)
# This script reads two elements of rdata: `df` (the data frame) and
# `vars` (the predictor names). The original note also lists a formula
# element, which is not used here.

Use H2O with 2 CPUs

library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
# Start (or connect to) a local H2O instance limited to 2 threads.
h2o.init(nthreads = 2)
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\link_000\AppData\Local\Temp\RtmpuqwSNH/h2o_link_000_started_from_r.out
##     C:\Users\link_000\AppData\Local\Temp\RtmpuqwSNH/h2o_link_000_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: . Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         7 seconds 557 milliseconds 
##     H2O cluster timezone:       America/Los_Angeles 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.24.0.1 
##     H2O cluster version age:    20 days  
##     H2O cluster name:           H2O_started_from_R_link_000_pai976 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.74 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  2 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.3 (2019-03-11)

Check data

Load base data frame and separate data for training and testing

# Upload the R data frame to the H2O cluster as an H2OFrame.
data <- as.h2o(rdata$df)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Report the number of rows and columns of the uploaded H2OFrame.
dim(data)
## [1] 51808   165
# Split rows on the `holdout` flag, following the paper: rows flagged
# "TRUE" form the test set, rows flagged "FALSE" form the training set.
test <- data[data["holdout"] == "TRUE", ]
train <- data[data["holdout"] == "FALSE", ]
dim(test)
## [1] 41808   165
dim(train)
## [1] 10000   165

Setup x and y variables

# Predictor names come from the prepared dataset; the response is LOGVALUE.
xvar <- rdata$vars
yvar <- "LOGVALUE"

Grid search of several network structures

The default neural network uses hidden=c(200,200), epochs=10, a Gaussian distribution, and the Rectifier activation function.

# Hyperparameter grid: four hidden-layer layouts (one to four layers)
# crossed with two activation functions, giving 4 x 2 = 8 candidate networks.
parameters <- list(
  hidden = list(100, c(100, 50), c(100, 50, 20), c(100, 50, 20, 10)),
  activation = c("Tanh", "Rectifier")
)

# Fit one deep-learning model per grid cell. Early stopping is configured
# with the MSE metric, 5 stopping rounds and a 0.001 tolerance, with an
# upper limit of 1000 epochs; the seed is fixed at 2019.
# NOTE(review): the holdout frame `test` is passed as validation_frame, so
# early stopping (and presumably the grid ranking) uses the holdout data --
# confirm this is intended before reporting holdout performance.
dlm0 <- h2o.grid(
  algorithm = "deeplearning",
  grid_id = "grid-dlm0",
  x = xvar,
  y = yvar,
  hyper_params = parameters,
  training_frame = train,
  validation_frame = test,
  stopping_metric = "MSE",
  stopping_rounds = 5,
  stopping_tolerance = 0.001,
  epochs = 1000,
  seed = 2019
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |                                                                 |   1%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |=========                                                        |  13%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |=================                                                |  27%
  |                                                                       
  |==================                                               |  27%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |===================                                              |  28%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |===========================                                      |  41%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |============================================                     |  67%
  |                                                                       
  |====================================================             |  79%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |====================================================             |  81%
  |                                                                       
  |=====================================================            |  81%
  |                                                                       
  |=====================================================            |  82%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |=================================================================| 100%
# Print the grid summary: the fitted models with their hyperparameters,
# ordered by the grid's sort metric.
dlm0
## H2O Grid Details
## ================
## 
## Grid ID: grid-dlm0 
## Used hyper parameters: 
##   -  activation 
##   -  hidden 
## Number of models: 8 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by increasing residual_deviance
##   activation            hidden         model_ids  residual_deviance
## 1       Tanh             [100] grid-dlm0_model_1 0.6439704580782319
## 2       Tanh [100, 50, 20, 10] grid-dlm0_model_7  0.650935371928123
## 3       Tanh     [100, 50, 20] grid-dlm0_model_5   0.66714299852623
## 4  Rectifier [100, 50, 20, 10] grid-dlm0_model_8 0.6807475466923595
## 5       Tanh         [100, 50] grid-dlm0_model_3 0.6869634611107662
## 6  Rectifier     [100, 50, 20] grid-dlm0_model_6 0.7179632930689983
## 7  Rectifier             [100] grid-dlm0_model_2 0.7194674406609515
## 8  Rectifier         [100, 50] grid-dlm0_model_4 0.7475596188309365

End of H2O

Make sure to shut down H2O

# Stop the H2O cluster without asking for interactive confirmation.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved constant.
h2o.shutdown(prompt = FALSE)
## [1] TRUE