# Reference: Sendhil Mullainathan and Jann Spiess, "Machine Learning: An Applied Econometric Approach," Journal of Economic Perspectives 31(2): 87-106, Spring 2017.
# Remote copy of the data set (disabled):
# datafile <- "http://web.pdx.edu/~crkl/BDE/data/ahs2011forjep.rdata"
# Local copy of the data set; this is the path read below.
datafile <- "C:/Course19/BDE/data/ahs2011forjep.rdata"
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Start (or connect to) the local H2O cluster, limited to two threads.
h2o.init(nthreads = 2)
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\link_000\AppData\Local\Temp\Rtmp0i56lN/h2o_link_000_started_from_r.out
## C:\Users\link_000\AppData\Local\Temp\Rtmp0i56lN/h2o_link_000_started_from_r.err
##
##
## Starting H2O JVM and connecting: . Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 4 seconds 932 milliseconds
## H2O cluster timezone: America/Los_Angeles
## H2O data parsing timezone: UTC
## H2O cluster version: 3.24.0.1
## H2O cluster version age: 27 days
## H2O cluster name: H2O_started_from_R_link_000_fns466
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.75 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 2
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.3 (2019-03-11)
# Read the saved object and upload its `df` element to the cluster
# (from data frame to H2OFrame).
data <- as.h2o(readRDS(datafile)$df)
##
|
| | 0%
|
|=================================================================| 100%
# Rows and columns of the full H2OFrame.
dim(data)
## [1] 51808 165
# Split the rows on the "holdout" column. That column is character type,
# so it is compared against the strings "FALSE" / "TRUE".
train <- data[data["holdout"] == "FALSE", ]
test <- data[data["holdout"] == "TRUE", ]  # holdout data for testing
dim(test)
## [1] 41808 165
# Rows and columns of the training (non-holdout) sample.
dim(train)
## [1] 10000 165
# Predictor column names come from the `vars` element of the saved object;
# the response column is "LOGVALUE".
xvar <- readRDS(datafile)$vars
yvar <- "LOGVALUE"
# Run AutoML on the training sample with 10-fold cross-validation, early
# stopping on MSE (tolerance 0.001, 5 rounds), and a one-hour time cap.
# Every argument is spelled out in full: the earlier `nfold` and
# `training_fram` only worked through R's partial argument matching.
# NOTE(review): H2O documents `validation_frame` as ignored for early
# stopping when nfolds > 1; if the holdout should rank the models,
# `leaderboard_frame = test` may be what is wanted -- confirm.
auto1 <- h2o.automl(
  x = xvar,
  y = yvar,
  training_frame = train,
  validation_frame = test,
  nfolds = 10,
  seed = 2019,
  stopping_metric = "MSE",
  stopping_tolerance = 0.001,
  stopping_rounds = 5,
  max_runtime_secs = 3600
)
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 6%
|
|===== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 10%
|
|======= | 11%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|=========== | 18%
|
|============ | 18%
|
|============ | 19%
|
|============= | 20%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 26%
|
|================= | 27%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 29%
|
|=================== | 30%
|
|==================== | 30%
|
|==================== | 31%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 33%
|
|====================== | 34%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|======================== | 38%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 40%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 42%
|
|============================ | 43%
|
|============================ | 44%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 46%
|
|============================== | 47%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 50%
|
|================================= | 51%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 53%
|
|=================================== | 54%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 56%
|
|===================================== | 57%
|
|===================================== | 58%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 60%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 66%
|
|=========================================== | 67%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 69%
|
|============================================= | 70%
|
|============================================== | 70%
|
|============================================== | 71%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 73%
|
|================================================ | 74%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|================================================== | 78%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 80%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 86%
|
|======================================================== | 87%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|============================================================= | 94%
|
|=================================================================| 100%
# Auto-print the AutoML result (project name, leader model, leaderboard head).
auto1
## An object of class "H2OAutoML"
## Slot "project_name":
## [1] "automl_RTMP_sid_b773_4"
##
## Slot "leader":
## Model Details:
## ==============
##
## H2ORegressionModel: stackedensemble
## Model ID: StackedEnsemble_AllModels_AutoML_20190428_071540
## NULL
##
##
## H2ORegressionMetrics: stackedensemble
## ** Reported on training data. **
##
## MSE: 0.3810828
## RMSE: 0.6173191
## MAE: 0.3512438
## RMSLE: 0.08455186
## Mean Residual Deviance : 0.3810828
##
##
## H2ORegressionMetrics: stackedensemble
## ** Reported on validation data. **
##
## MSE: 0.613427
## RMSE: 0.7832158
## MAE: 0.4486614
## RMSLE: 0.1021504
## Mean Residual Deviance : 0.613427
##
##
## H2ORegressionMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 10-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.5873844
## RMSE: 0.7664101
## MAE: 0.4374693
## RMSLE: 0.09545365
## Mean Residual Deviance : 0.5873844
##
##
##
## Slot "leaderboard":
## model_id
## 1 StackedEnsemble_AllModels_AutoML_20190428_071540
## 2 StackedEnsemble_BestOfFamily_AutoML_20190428_071540
## 3 GBM_grid_1_AutoML_20190428_071540_model_3
## 4 GBM_5_AutoML_20190428_071540
## 5 GBM_2_AutoML_20190428_071540
## 6 GBM_3_AutoML_20190428_071540
## mean_residual_deviance rmse mse mae rmsle
## 1 0.5873844 0.7664101 0.5873844 0.4374693 0.09545365
## 2 0.5898073 0.7679892 0.5898073 0.4383886 0.09551665
## 3 0.5918113 0.7692927 0.5918113 0.4406372 0.09554908
## 4 0.5942404 0.7708699 0.5942404 0.4444171 0.09562988
## 5 0.6066911 0.7789038 0.6066911 0.4513328 0.09618697
## 6 0.6076155 0.7794970 0.6076155 0.4520000 0.09621807
##
## [22 rows x 6 columns]
# Auto-print only the leader model stored in the `leader` slot.
auto1@leader
## Model Details:
## ==============
##
## H2ORegressionModel: stackedensemble
## Model ID: StackedEnsemble_AllModels_AutoML_20190428_071540
## NULL
##
##
## H2ORegressionMetrics: stackedensemble
## ** Reported on training data. **
##
## MSE: 0.3810828
## RMSE: 0.6173191
## MAE: 0.3512438
## RMSLE: 0.08455186
## Mean Residual Deviance : 0.3810828
##
##
## H2ORegressionMetrics: stackedensemble
## ** Reported on validation data. **
##
## MSE: 0.613427
## RMSE: 0.7832158
## MAE: 0.4486614
## RMSLE: 0.1021504
## Mean Residual Deviance : 0.613427
##
##
## H2ORegressionMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 10-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.5873844
## RMSE: 0.7664101
## MAE: 0.4374693
## RMSLE: 0.09545365
## Mean Residual Deviance : 0.5873844
# Keep the leaderboard frame and print all of its rows rather than
# only the default first few.
lb1 <- auto1@leaderboard
print(lb1, n = nrow(lb1))
## model_id
## 1 StackedEnsemble_AllModels_AutoML_20190428_071540
## 2 StackedEnsemble_BestOfFamily_AutoML_20190428_071540
## 3 GBM_grid_1_AutoML_20190428_071540_model_3
## 4 GBM_5_AutoML_20190428_071540
## 5 GBM_2_AutoML_20190428_071540
## 6 GBM_3_AutoML_20190428_071540
## 7 GBM_4_AutoML_20190428_071540
## 8 GBM_grid_1_AutoML_20190428_071540_model_6
## 9 DRF_1_AutoML_20190428_071540
## 10 GLM_grid_1_AutoML_20190428_071540_model_1
## 11 GBM_grid_1_AutoML_20190428_071540_model_5
## 12 DeepLearning_grid_1_AutoML_20190428_071540_model_3
## 13 DeepLearning_grid_1_AutoML_20190428_071540_model_2
## 14 GBM_1_AutoML_20190428_071540
## 15 XRT_1_AutoML_20190428_071540
## 16 GBM_grid_1_AutoML_20190428_071540_model_8
## 17 GBM_grid_1_AutoML_20190428_071540_model_7
## 18 DeepLearning_1_AutoML_20190428_071540
## 19 DeepLearning_grid_1_AutoML_20190428_071540_model_1
## 20 GBM_grid_1_AutoML_20190428_071540_model_2
## 21 GBM_grid_1_AutoML_20190428_071540_model_1
## 22 GBM_grid_1_AutoML_20190428_071540_model_4
## mean_residual_deviance rmse mse mae rmsle
## 1 0.5873844 0.7664101 0.5873844 0.4374693 0.09545365
## 2 0.5898073 0.7679892 0.5898073 0.4383886 0.09551665
## 3 0.5918113 0.7692927 0.5918113 0.4406372 0.09554908
## 4 0.5942404 0.7708699 0.5942404 0.4444171 0.09562988
## 5 0.6066911 0.7789038 0.6066911 0.4513328 0.09618697
## 6 0.6076155 0.7794970 0.6076155 0.4520000 0.09621807
## 7 0.6117933 0.7821721 0.6117933 0.4534883 0.09633149
## 8 0.6185635 0.7864881 0.6185635 0.4614473 0.09638044
## 9 0.6275010 0.7921496 0.6275010 0.4600826 0.09670791
## 10 0.6286874 0.7928981 0.6286874 0.4688597 0.09697598
## 11 0.6291317 0.7931783 0.6291317 0.4743420 0.09663710
## 12 0.6324780 0.7952849 0.6324780 0.4710018 0.09711802
## 13 0.6384725 0.7990448 0.6384725 0.4701393 0.09737025
## 14 0.6429173 0.8018213 0.6429173 0.4617737 0.09771812
## 15 0.6488769 0.8055290 0.6488769 0.4770142 0.09732526
## 16 0.6570540 0.8105887 0.6570540 0.4878046 0.09761487
## 17 0.6636891 0.8146712 0.6636891 0.4907383 0.09794750
## 18 0.6763341 0.8223953 0.6763341 0.4918196 0.09867732
## 19 0.6794852 0.8243089 0.6794852 0.4848973 0.09899512
## 20 0.7240370 0.8509036 0.7240370 0.4937092 0.10282454
## 21 0.7600951 0.8718344 0.7600951 0.5587202 0.10078515
## 22 1.3030046 1.1414923 1.3030046 0.7116189 0.12388819
##
## [22 rows x 6 columns]
# Shut down the H2O cluster without the interactive confirmation prompt.
# FALSE is written out because `F` is an ordinary variable that can be
# reassigned.
h2o.shutdown(prompt = FALSE)
## [1] TRUE