Sendhil Mullainathan and Jann Spiess, "Machine Learning: An Applied Econometric Approach," Journal of Economic Perspectives 31(2): 87-106, Spring 2017.
# Path to the serialized data object. The original remote location is kept
# here, commented out, as an alternative:
# datafile <- "http://web.pdx.edu/~crkl/BDE/data/ahs2011forjep.rdata"
datafile <- "C:/Course19/BDE/data/ahs2011forjep.rdata"
rdata <- readRDS(file = datafile)
# rdata bundles the data, the variable list, and a formula; this section uses
# `rdata$df` (the data frame) and `rdata$vars` (the predictor names).
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Connect to (or start) a local H2O cluster, requesting two worker threads.
# NOTE(review): `nthreads` is documented as applying only when R starts the
# cluster; the log below reports an already-running cluster with 4 allowed
# cores, so the request appears not to have taken effect here -- confirm.
h2o.init(nthreads = 2)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 13 minutes 1 seconds
## H2O cluster timezone: America/Los_Angeles
## H2O data parsing timezone: UTC
## H2O cluster version: 3.24.0.1
## H2O cluster version age: 21 days, 20 hours and 58 minutes
## H2O cluster name: H2O_started_from_R_link_000_qxl857
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.69 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.3 (2019-03-11)
Load base data frame and separate data for training and testing
# Upload the in-memory data frame to the H2O cluster as an H2OFrame.
data <- as.h2o(x = rdata$df)
##
|
| | 0%
|
|=================================================================| 100%
# Report the dimensions (rows, columns) of the full H2OFrame.
dim(data)
## [1] 51808 165
# Split on the `holdout` column, following the paper's design: rows flagged
# "TRUE" form the test sample, rows flagged "FALSE" form the training sample.
holdout_flag <- data["holdout"]
test <- data[holdout_flag == "TRUE", ]
train <- data[holdout_flag == "FALSE", ]
# Dimensions (rows, columns) of the holdout/test sample.
dim(test)
## [1] 41808 165
# Dimensions (rows, columns) of the training sample.
dim(train)
## [1] 10000 165
# Predictor column names come from the loaded object; the response is the
# LOGVALUE column.
xvar <- rdata$vars
yvar <- "LOGVALUE"
Default random forest settings: ntrees=50, max_depth=20, min_rows=1, sample_rate=0.632, col_sample_rate_per_tree=1, mtries=-1. The grid search uses the default search_criteria=list(strategy="Cartesian").
# Hyper-parameter values to search; every combination is trained
# (3 x 2 x 2 = 12 random forest models).
parameters <- list(
  ntrees = c(50, 100, 200),
  max_depth = c(20, 50),
  mtries = c(25, 50)
)

# Grid search over `parameters`. Each model is fit on `train` and scored on
# `test`, with early stopping configured on MSE (5 rounds, tolerance 0.001).
# NOTE(review): `test` is used here as the validation frame and again later
# for h2o.performance(), so the holdout sample drives model selection as well
# as the final evaluation; the reported test metrics may be optimistic.
# Consider tuning with cross-validation on `train` instead.
rfm0 <- h2o.grid(
  algorithm = "randomForest",
  grid_id = "grid-rfm0",
  x = xvar,
  y = yvar,
  seed = 2019,
  hyper_params = parameters,
  training_frame = train,
  validation_frame = test,
  stopping_metric = "MSE",
  stopping_rounds = 5,
  stopping_tolerance = 0.001
)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|==== | 7%
|
|===== | 7%
|
|===== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 10%
|
|======= | 11%
|
|======= | 12%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|=========== | 18%
|
|============ | 18%
|
|============ | 19%
|
|============= | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 25%
|
|================= | 26%
|
|================= | 27%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 28%
|
|=================== | 29%
|
|=================== | 30%
|
|==================== | 30%
|
|==================== | 31%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 33%
|
|====================== | 34%
|
|====================== | 35%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|======================== | 38%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 39%
|
|========================== | 40%
|
|========================== | 41%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 42%
|
|============================ | 43%
|
|============================ | 44%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 46%
|
|============================== | 47%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 50%
|
|================================= | 51%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 53%
|
|=================================== | 54%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 56%
|
|===================================== | 57%
|
|===================================== | 58%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 59%
|
|======================================= | 60%
|
|======================================= | 61%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 66%
|
|=========================================== | 67%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 69%
|
|============================================= | 70%
|
|============================================== | 70%
|
|============================================== | 71%
|
|============================================== | 72%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 73%
|
|================================================ | 74%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|================================================== | 78%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 79%
|
|==================================================== | 80%
|
|==================================================== | 81%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 86%
|
|======================================================== | 87%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 90%
|
|=========================================================== | 91%
|
|=========================================================== | 92%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 93%
|
|============================================================= | 94%
|
|============================================================= | 95%
|
|============================================================== | 95%
|
|============================================================== | 96%
|
|=============================================================== | 96%
|
|=============================================================== | 97%
|
|=============================================================== | 98%
|
|================================================================ | 98%
|
|================================================================ | 99%
|
|=================================================================| 99%
|
|=================================================================| 100%
# Print the grid summary: the hyper-parameters searched and the trained
# models in the grid's ranking order.
rfm0
## H2O Grid Details
## ================
##
## Grid ID: grid-rfm0
## Used hyper parameters:
## - max_depth
## - mtries
## - ntrees
## Number of models: 12
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by increasing residual_deviance
## max_depth mtries ntrees model_ids residual_deviance
## 1 50 25 200 grid-rfm0_model_10 0.640227120316315
## 2 20 25 200 grid-rfm0_model_9 0.6412714943199265
## 3 50 25 100 grid-rfm0_model_6 0.6436912617784274
## 4 20 25 100 grid-rfm0_model_5 0.6442594897324502
## 5 50 50 200 grid-rfm0_model_12 0.6466282149542224
## 6 20 50 200 grid-rfm0_model_11 0.6469508963711114
## 7 50 50 100 grid-rfm0_model_8 0.6508547146335724
## 8 20 50 100 grid-rfm0_model_7 0.6510212616021707
## 9 50 25 50 grid-rfm0_model_2 0.6523765447548675
## 10 20 25 50 grid-rfm0_model_1 0.6523864345962608
## 11 20 50 50 grid-rfm0_model_3 0.6558536727756534
## 12 50 50 50 grid-rfm0_model_4 0.6559234614787327
# The grid lists its models in ranking order (increasing residual deviance in
# the summary above), so the first id is the top-ranked model.
best_model_id <- rfm0@model_ids[[1]]
rfm0_best <- h2o.getModel(best_model_id)

# Regression metrics of the selected model on the holdout sample.
h2o.performance(rfm0_best, newdata = test)
## H2ORegressionMetrics: drf
##
## MSE: 0.6402271
## RMSE: 0.8001419
## MAE: 0.4665049
## RMSLE: 0.1029256
## Mean Residual Deviance : 0.6402271
# Variable importance table for the selected model.
h2o.varimp(rfm0_best)
# Plot of the selected model's variable importances.
h2o.varimp_plot(rfm0_best)
Make sure to shut down H2O when you are finished.
# Shut down the H2O cluster without asking for interactive confirmation.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, which would silently change this call's behavior.
h2o.shutdown(prompt = FALSE)