Sendhil Mullainathan and Jann Spiess, Machine Learning: An Applied Econometric Approach, Journal of Economic Perspectives 31:2 (87-106), Spring 2017.
# Location of the serialized data bundle for this example.
# Remote copy, kept for reference only (not used below):
# datafile <- "http://web.pdx.edu/~crkl/BDE/data/ahs2011forjep.rdata"
datafile <- "C:/Course19/BDE/data/ahs2011forjep.rdata"

# The file is read with readRDS(), i.e. as one serialized R object,
# even though its extension is .rdata.
rdata <- readRDS(datafile)
# Original note: rdata contains data, vars, and formula.
# The code below reads the elements rdata$df and rdata$vars.
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Start (or connect to) the local H2O instance, limited to 2 threads.
h2o.init(nthreads = 2)
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\link_000\AppData\Local\Temp\RtmpuqwSNH/h2o_link_000_started_from_r.out
## C:\Users\link_000\AppData\Local\Temp\RtmpuqwSNH/h2o_link_000_started_from_r.err
##
##
## Starting H2O JVM and connecting: . Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 7 seconds 557 milliseconds
## H2O cluster timezone: America/Los_Angeles
## H2O data parsing timezone: UTC
## H2O cluster version: 3.24.0.1
## H2O cluster version age: 20 days
## H2O cluster name: H2O_started_from_R_link_000_pai976
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.74 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 2
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.3 (2019-03-11)
Load base data frame and separate data for training and testing
# Upload the R data frame stored in rdata$df to H2O as an H2OFrame.
data <- as.h2o(rdata$df)
##
|
| | 0%
|
|=================================================================| 100%
# Report the number of rows and columns of the full H2OFrame.
dim(data)
## [1] 51808 165
# Split on the "holdout" column, following the paper's design:
# rows flagged "TRUE" become the test set, rows flagged "FALSE"
# become the training set.
test <- data[data["holdout"] == "TRUE", ]
train <- data[data["holdout"] == "FALSE", ]

# Report the size of the test (holdout) sample.
dim(test)
## [1] 41808 165
# Report the size of the training sample.
dim(train)
## [1] 10000 165
# Predictor names come from the data bundle; the response is LOGVALUE.
xvar <- rdata$vars
yvar <- "LOGVALUE"
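Default neural network: hidden=c(200,200), epochs=10, Gaussian distribution, Rectifier activation function. The grid search below overrides these defaults: it varies the hidden layers and the activation function, and raises epochs to 1000 with early stopping.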
# Hyper-parameter grid: four hidden-layer layouts (one to four layers)
# crossed with two activation functions, i.e. 4 x 2 = 8 candidate networks.
parameters <- list(
  hidden = list(100, c(100, 50), c(100, 50, 20), c(100, 50, 20, 10)),
  activation = c("Tanh", "Rectifier")
)

# Fit one deep-learning model per grid combination. Training runs for up to
# 1000 epochs with early-stopping settings on MSE (5 rounds, tolerance 0.001);
# the seed is fixed at 2019.
# NOTE(review): the holdout frame `test` is also passed as the validation
# frame, so the grid scores (and presumably early stopping) are driven by
# holdout data -- confirm this is intended.
dlm0 <- h2o.grid(
  "deeplearning",
  grid_id = "grid-dlm0",
  x = xvar,
  y = yvar,
  hyper_params = parameters,
  training_frame = train,
  validation_frame = test,
  stopping_metric = "MSE",
  stopping_rounds = 5,
  stopping_tolerance = 0.001,
  epochs = 1000,
  seed = 2019
)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|========= | 13%
|
|================= | 26%
|
|================= | 27%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 28%
|
|=================== | 29%
|
|=========================== | 41%
|
|=================================== | 54%
|
|============================================ | 67%
|
|==================================================== | 79%
|
|==================================================== | 80%
|
|==================================================== | 81%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|============================================================= | 94%
|
|=================================================================| 100%
# Print the grid summary: the hyper-parameters used and one row per fitted model.
dlm0
## H2O Grid Details
## ================
##
## Grid ID: grid-dlm0
## Used hyper parameters:
## - activation
## - hidden
## Number of models: 8
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by increasing residual_deviance
## activation hidden model_ids residual_deviance
## 1 Tanh [100] grid-dlm0_model_1 0.6439704580782319
## 2 Tanh [100, 50, 20, 10] grid-dlm0_model_7 0.650935371928123
## 3 Tanh [100, 50, 20] grid-dlm0_model_5 0.66714299852623
## 4 Rectifier [100, 50, 20, 10] grid-dlm0_model_8 0.6807475466923595
## 5 Tanh [100, 50] grid-dlm0_model_3 0.6869634611107662
## 6 Rectifier [100, 50, 20] grid-dlm0_model_6 0.7179632930689983
## 7 Rectifier [100] grid-dlm0_model_2 0.7194674406609515
## 8 Rectifier [100, 50] grid-dlm0_model_4 0.7475596188309365
Make sure to shut down H2O
# Stop the H2O instance without asking for interactive confirmation.
# Use the reserved constant FALSE rather than F: F is an ordinary variable
# that can be reassigned, which would silently change this call.
h2o.shutdown(prompt = FALSE)
## [1] TRUE