Tune and fit models for the JEP paper, using the dataset prepared for that paper

Using AutoML

Sendhil Mullainathan and Jann Spiess, Machine Learning: An Applied Econometric Approach, Journal of Economic Perspectives 31:2 (87-106), Spring 2017.

# Location of the prepared AHS 2011 dataset used for the JEP paper.
# Remote copy (disabled):
# datafile <- "http://web.pdx.edu/~crkl/BDE/data/ahs2011forjep.rdata"
# Local copy (active):
datafile <- "C:/Course19/BDE/data/ahs2011forjep.rdata"

Use h2o package

library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
# Start (or connect to) a local H2O cluster, limited to two threads.
h2o.init(nthreads = 2)
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\link_000\AppData\Local\Temp\Rtmp0i56lN/h2o_link_000_started_from_r.out
##     C:\Users\link_000\AppData\Local\Temp\Rtmp0i56lN/h2o_link_000_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: . Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         4 seconds 932 milliseconds 
##     H2O cluster timezone:       America/Los_Angeles 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.24.0.1 
##     H2O cluster version age:    27 days  
##     H2O cluster name:           H2O_started_from_R_link_000_fns466 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.75 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  2 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.3 (2019-03-11)

Load and split data

# Read the saved object, take its `df` element (an R data frame), and
# upload it to the H2O cluster as an H2OFrame.
data <- as.h2o(readRDS(datafile)$df)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Rows and columns of the full H2OFrame.
dim(data)
## [1] 51808   165
# Split rows on the "holdout" column. Note: data["holdout"] is character
# type, so it is compared against the strings "TRUE" / "FALSE".
train <- data[data["holdout"] == "FALSE", ]
test <- data[data["holdout"] == "TRUE", ]  # holdout data for testing

# Rows and columns of the holdout (test) sample.
dim(test)
## [1] 41808   165
# Rows and columns of the training sample.
dim(train)
## [1] 10000   165

Set up x and y variables

# Response column name.
yvar <- "LOGVALUE"
# Predictor names: the `vars` element of the saved object.
xvar <- readRDS(datafile)$vars

Auto ML

# Run H2O AutoML to predict `yvar` from `xvar` on the training sample,
# with 10-fold cross-validation, a fixed seed, MSE-based early stopping,
# and a one-hour time budget.
#
# Argument names are spelled out in full: the earlier `nfold` and
# `training_fram` only worked through R's partial argument matching.
#
# NOTE(review): the holdout sample is passed as `validation_frame`. If it
# is meant to stay untouched by model tuning, `leaderboard_frame` may be
# the more appropriate argument -- confirm against the h2o.automl docs.
auto1 <- h2o.automl(
  x = xvar,
  y = yvar,
  training_frame = train,
  validation_frame = test,
  nfolds = 10,
  seed = 2019,
  stopping_metric = "MSE",
  stopping_tolerance = 0.001,
  stopping_rounds = 5,
  max_runtime_secs = 3600
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |==                                                               |   2%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=======                                                          |  10%
  |                                                                       
  |=======                                                          |  11%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |========                                                         |  13%
  |                                                                       
  |=========                                                        |  13%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==========                                                       |  15%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |===========                                                      |  16%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |===========                                                      |  18%
  |                                                                       
  |============                                                     |  18%
  |                                                                       
  |============                                                     |  19%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |==============                                                   |  22%
  |                                                                       
  |===============                                                  |  22%
  |                                                                       
  |===============                                                  |  23%
  |                                                                       
  |===============                                                  |  24%
  |                                                                       
  |================                                                 |  24%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |=================                                                |  27%
  |                                                                       
  |==================                                               |  27%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |===================                                              |  30%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |=====================                                            |  32%
  |                                                                       
  |=====================                                            |  33%
  |                                                                       
  |======================                                           |  33%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |=======================                                          |  35%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |========================                                         |  36%
  |                                                                       
  |========================                                         |  37%
  |                                                                       
  |========================                                         |  38%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |===========================                                      |  41%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |============================                                     |  42%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |============================                                     |  44%
  |                                                                       
  |=============================                                    |  44%
  |                                                                       
  |=============================                                    |  45%
  |                                                                       
  |==============================                                   |  46%
  |                                                                       
  |==============================                                   |  47%
  |                                                                       
  |===============================                                  |  47%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |================================                                 |  49%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=================================                                |  50%
  |                                                                       
  |=================================                                |  51%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |==================================                               |  53%
  |                                                                       
  |===================================                              |  53%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |====================================                             |  55%
  |                                                                       
  |====================================                             |  56%
  |                                                                       
  |=====================================                            |  56%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |=====================================                            |  58%
  |                                                                       
  |======================================                           |  58%
  |                                                                       
  |======================================                           |  59%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |========================================                         |  61%
  |                                                                       
  |========================================                         |  62%
  |                                                                       
  |=========================================                        |  62%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |=========================================                        |  64%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==========================================                       |  65%
  |                                                                       
  |===========================================                      |  66%
  |                                                                       
  |===========================================                      |  67%
  |                                                                       
  |============================================                     |  67%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |=============================================                    |  69%
  |                                                                       
  |=============================================                    |  70%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |===============================================                  |  73%
  |                                                                       
  |================================================                 |  73%
  |                                                                       
  |================================================                 |  74%
  |                                                                       
  |=================================================                |  75%
  |                                                                       
  |=================================================                |  76%
  |                                                                       
  |==================================================               |  76%
  |                                                                       
  |==================================================               |  77%
  |                                                                       
  |==================================================               |  78%
  |                                                                       
  |===================================================              |  78%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=====================================================            |  81%
  |                                                                       
  |=====================================================            |  82%
  |                                                                       
  |======================================================           |  82%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |======================================================           |  84%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |========================================================         |  87%
  |                                                                       
  |=========================================================        |  87%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |=================================================================| 100%
# Print the AutoML result object (project name, leader model, leaderboard).
auto1
## An object of class "H2OAutoML"
## Slot "project_name":
## [1] "automl_RTMP_sid_b773_4"
## 
## Slot "leader":
## Model Details:
## ==============
## 
## H2ORegressionModel: stackedensemble
## Model ID:  StackedEnsemble_AllModels_AutoML_20190428_071540 
## NULL
## 
## 
## H2ORegressionMetrics: stackedensemble
## ** Reported on training data. **
## 
## MSE:  0.3810828
## RMSE:  0.6173191
## MAE:  0.3512438
## RMSLE:  0.08455186
## Mean Residual Deviance :  0.3810828
## 
## 
## H2ORegressionMetrics: stackedensemble
## ** Reported on validation data. **
## 
## MSE:  0.613427
## RMSE:  0.7832158
## MAE:  0.4486614
## RMSLE:  0.1021504
## Mean Residual Deviance :  0.613427
## 
## 
## H2ORegressionMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 10-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.5873844
## RMSE:  0.7664101
## MAE:  0.4374693
## RMSLE:  0.09545365
## Mean Residual Deviance :  0.5873844
## 
## 
## 
## Slot "leaderboard":
##                                              model_id
## 1    StackedEnsemble_AllModels_AutoML_20190428_071540
## 2 StackedEnsemble_BestOfFamily_AutoML_20190428_071540
## 3           GBM_grid_1_AutoML_20190428_071540_model_3
## 4                        GBM_5_AutoML_20190428_071540
## 5                        GBM_2_AutoML_20190428_071540
## 6                        GBM_3_AutoML_20190428_071540
##   mean_residual_deviance      rmse       mse       mae      rmsle
## 1              0.5873844 0.7664101 0.5873844 0.4374693 0.09545365
## 2              0.5898073 0.7679892 0.5898073 0.4383886 0.09551665
## 3              0.5918113 0.7692927 0.5918113 0.4406372 0.09554908
## 4              0.5942404 0.7708699 0.5942404 0.4444171 0.09562988
## 5              0.6066911 0.7789038 0.6066911 0.4513328 0.09618697
## 6              0.6076155 0.7794970 0.6076155 0.4520000 0.09621807
## 
## [22 rows x 6 columns]
# Print the leader model: the first entry of the leaderboard.
auto1@leader
## Model Details:
## ==============
## 
## H2ORegressionModel: stackedensemble
## Model ID:  StackedEnsemble_AllModels_AutoML_20190428_071540 
## NULL
## 
## 
## H2ORegressionMetrics: stackedensemble
## ** Reported on training data. **
## 
## MSE:  0.3810828
## RMSE:  0.6173191
## MAE:  0.3512438
## RMSLE:  0.08455186
## Mean Residual Deviance :  0.3810828
## 
## 
## H2ORegressionMetrics: stackedensemble
## ** Reported on validation data. **
## 
## MSE:  0.613427
## RMSE:  0.7832158
## MAE:  0.4486614
## RMSLE:  0.1021504
## Mean Residual Deviance :  0.613427
## 
## 
## H2ORegressionMetrics: stackedensemble
## ** Reported on cross-validation data. **
## ** 10-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.5873844
## RMSE:  0.7664101
## MAE:  0.4374693
## RMSLE:  0.09545365
## Mean Residual Deviance :  0.5873844
# Extract the leaderboard and print every row rather than the default head.
lb1 <- auto1@leaderboard
print(lb1, n = nrow(lb1))
##                                               model_id
## 1     StackedEnsemble_AllModels_AutoML_20190428_071540
## 2  StackedEnsemble_BestOfFamily_AutoML_20190428_071540
## 3            GBM_grid_1_AutoML_20190428_071540_model_3
## 4                         GBM_5_AutoML_20190428_071540
## 5                         GBM_2_AutoML_20190428_071540
## 6                         GBM_3_AutoML_20190428_071540
## 7                         GBM_4_AutoML_20190428_071540
## 8            GBM_grid_1_AutoML_20190428_071540_model_6
## 9                         DRF_1_AutoML_20190428_071540
## 10           GLM_grid_1_AutoML_20190428_071540_model_1
## 11           GBM_grid_1_AutoML_20190428_071540_model_5
## 12  DeepLearning_grid_1_AutoML_20190428_071540_model_3
## 13  DeepLearning_grid_1_AutoML_20190428_071540_model_2
## 14                        GBM_1_AutoML_20190428_071540
## 15                        XRT_1_AutoML_20190428_071540
## 16           GBM_grid_1_AutoML_20190428_071540_model_8
## 17           GBM_grid_1_AutoML_20190428_071540_model_7
## 18               DeepLearning_1_AutoML_20190428_071540
## 19  DeepLearning_grid_1_AutoML_20190428_071540_model_1
## 20           GBM_grid_1_AutoML_20190428_071540_model_2
## 21           GBM_grid_1_AutoML_20190428_071540_model_1
## 22           GBM_grid_1_AutoML_20190428_071540_model_4
##    mean_residual_deviance      rmse       mse       mae      rmsle
## 1               0.5873844 0.7664101 0.5873844 0.4374693 0.09545365
## 2               0.5898073 0.7679892 0.5898073 0.4383886 0.09551665
## 3               0.5918113 0.7692927 0.5918113 0.4406372 0.09554908
## 4               0.5942404 0.7708699 0.5942404 0.4444171 0.09562988
## 5               0.6066911 0.7789038 0.6066911 0.4513328 0.09618697
## 6               0.6076155 0.7794970 0.6076155 0.4520000 0.09621807
## 7               0.6117933 0.7821721 0.6117933 0.4534883 0.09633149
## 8               0.6185635 0.7864881 0.6185635 0.4614473 0.09638044
## 9               0.6275010 0.7921496 0.6275010 0.4600826 0.09670791
## 10              0.6286874 0.7928981 0.6286874 0.4688597 0.09697598
## 11              0.6291317 0.7931783 0.6291317 0.4743420 0.09663710
## 12              0.6324780 0.7952849 0.6324780 0.4710018 0.09711802
## 13              0.6384725 0.7990448 0.6384725 0.4701393 0.09737025
## 14              0.6429173 0.8018213 0.6429173 0.4617737 0.09771812
## 15              0.6488769 0.8055290 0.6488769 0.4770142 0.09732526
## 16              0.6570540 0.8105887 0.6570540 0.4878046 0.09761487
## 17              0.6636891 0.8146712 0.6636891 0.4907383 0.09794750
## 18              0.6763341 0.8223953 0.6763341 0.4918196 0.09867732
## 19              0.6794852 0.8243089 0.6794852 0.4848973 0.09899512
## 20              0.7240370 0.8509036 0.7240370 0.4937092 0.10282454
## 21              0.7600951 0.8718344 0.7600951 0.5587202 0.10078515
## 22              1.3030046 1.1414923 1.3030046 0.7116189 0.12388819
## 
## [22 rows x 6 columns]

End of H2O: make sure to shut down h2o

# Shut down the H2O cluster without the interactive confirmation prompt.
# `FALSE` is used instead of `F`, which is an ordinary variable that can be
# reassigned.
h2o.shutdown(prompt = FALSE)
## [1] TRUE