Training and Evaluating a Simple Regression Model

[1]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

Generate data

[2]:
import numpy.random as random

random.seed(150)

dates = pd.DataFrame({'score_date': pd.date_range('2016-01-01', '2016-12-31')})
dates['key'] = 1

ids = pd.DataFrame({'id': np.arange(0, 100)})
ids['key'] = 1

# Cross join on the constant key: one row per (id, score_date) pair.
data = pd.merge(ids, dates).drop('key', axis=1)

data['x1'] = 23 * random.randn(data.shape[0]) + 500
data['x2'] = 59 * random.randn(data.shape[0]) + 235
data['x3'] = 73 * random.randn(data.shape[0]) + 793  # Noise feature.

data['y'] = 0.37*data['x1'] + 0.97*data['x2'] + 0.32*data['x2']**2 - 5.0*data['id']*0.2 + \
            np.cos(pd.to_datetime(data['score_date']).astype(int)*200)*20.0

nan_idx = np.random.randint(0, data.shape[0], size=100)  # Inject NaNs into x1.
data.loc[nan_idx, 'x1'] = np.nan

nan_idx = np.random.randint(0, data.shape[0], size=100)  # Inject NaNs into x2.
data.loc[nan_idx, 'x2'] = np.nan
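
Since the indices are drawn with replacement, up to (but possibly fewer than) 100 distinct rows end up missing in each column. A quick check of the injected null fractions (a sketch, not a cell from the original notebook):

data[['x1', 'x2']].isnull().mean()  # fraction of missing values per column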
[3]:
data.head()
[3]:
id score_date x1 x2 x3 y
0 0 2016-01-01 494.678185 212.949976 829.392324 14880.912784
1 0 2016-01-02 508.984104 158.174242 744.755316 8365.032313
2 0 2016-01-03 508.240842 291.856424 680.169336 27720.701968
3 0 2016-01-04 513.023623 137.007911 767.888326 6325.554470
4 0 2016-01-05 503.823382 279.123602 812.656548 25402.890737
[4]:
data.loc[data['id'] == 0, 'y'].plot()
plt.show()
../_images/examples_regression_5_0.png

Train Test Split

[5]:
from fklearn.preprocessing.splitting import space_time_split_dataset

train_start = '2016-01-01'
train_end = '2016-06-30'
holdout_end = '2016-12-31'

split_fn = space_time_split_dataset(
    train_start_date=train_start,
    train_end_date=train_end,
    holdout_end_date=holdout_end,
    split_seed=50,
    space_holdout_percentage=.05,
    space_column='id',
    time_column='score_date',
)
[6]:
train_set, intime_outspace_hdout, outime_inspace_hdout, outime_outspace_hdout = split_fn(data)

train_set.shape, intime_outspace_hdout.shape, outime_inspace_hdout.shape, outime_outspace_hdout.shape
[6]:
((17195, 6), (905, 6), (18400, 6), (920, 6))
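
A quick sanity check on the split semantics (a sketch, not part of the original notebook): the out-of-space holdout should share no ids with the training set, and the out-of-time holdouts should start at or after train_end.

# Ids in the space holdout are disjoint from the training ids.
assert set(train_set['id']).isdisjoint(set(intime_outspace_hdout['id']))
# The out-of-time holdout begins where the training period ends.
assert outime_inspace_hdout['score_date'].min() >= pd.Timestamp(train_end)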

Define learner function

[7]:
FEATURES = ['x1', 'x2', 'x3']
TARGET = 'y'
[8]:
from fklearn.training.imputation import imputer

my_imputer = imputer(columns_to_impute=FEATURES, impute_strategy='median')
[37]:
from fklearn.training.transformation import standard_scaler

my_scaler = standard_scaler(columns_to_scale=FEATURES)
[10]:
from fklearn.training.regression import xgb_regression_learner

my_model = xgb_regression_learner(
    features=FEATURES,
    target=TARGET,
    prediction_column='prediction',
    extra_params={'seed': 139, 'nthread': 8},
)
[11]:
from fklearn.training.transformation import ecdfer

my_ecdfer = ecdfer(prediction_column='prediction', ecdf_column='prediction_ecdf')
[12]:
from fklearn.training.pipeline import build_pipeline

my_learner = build_pipeline(my_imputer, my_scaler, my_model, my_ecdfer)
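
build_pipeline composes the learner functions into a single learner. Each fklearn learner, when applied to a DataFrame, returns a tuple: a prediction function, the transformed training data, and a log dict. A minimal hand-rolled learner obeying that contract might look like this (illustrative sketch only; not used in the pipeline above):

def no_op_learner(df):
    def p(new_df):
        # A real learner would add or transform columns here.
        return new_df
    return p, p(df), {'no_op_learner': {'training_samples': df.shape[0]}}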

Train

[13]:
(prediction_function, _, logs) = my_learner(train_set)
[14]:
logs
[14]:
{'imputer': {'impute_strategy': 'median',
  'columns_to_impute': ['x1', 'x2', 'x3'],
  'training_proportion_of_nulls': {'x1': 0.002675196277987787,
   'x2': 0.0023844140738586797,
   'x3': 0.0},
  'statistics': array([499.92366097, 234.4110308 , 792.50378949]),
  'running_time': '0.031 s'},
 'standard_scaler': {'standard_scaler': {'copy': True,
   'with_mean': True,
   'with_std': True},
  'transformed_column': ['x1', 'x2', 'x3'],
  'running_time': '0.018 s'},
 'xgb_regression_learner': {'features': ['x1', 'x2', 'x3'],
  'target': 'y',
  'prediction_column': 'prediction',
  'package': 'xgboost',
  'package_version': '0.82',
  'parameters': {'seed': 139,
   'nthread': 8,
   'eta': 0.1,
   'objective': 'reg:linear',
   'num_estimators': 100},
  'feature_importance': {'x2': 2415, 'x3': 514, 'x1': 919},
  'training_samples': 17195,
  'running_time': '1.191 s'},
 'ecdfer': {'nobs': 17195,
  'prediction_column': 'prediction',
  'ascending': True,
  'transformed_column': ['prediction_ecdf'],
  'running_time': '0.006 s'},
 '__fkml__': {'pipeline': ['imputer',
   'standard_scaler',
   'xgb_regression_learner',
   'ecdfer'],
  'output_columns': ['id',
   'score_date',
   'x1',
   'x2',
   'x3',
   'y',
   'prediction',
   'prediction_ecdf'],
  'features': ['id', 'score_date', 'x1', 'x2', 'x3', 'y'],
  'learners': {'imputer': {'fn': <function fklearn.training.imputation.imputer.<locals>.p(new_data_set: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
    'log': {'imputer': {'impute_strategy': 'median',
      'columns_to_impute': ['x1', 'x2', 'x3'],
      'training_proportion_of_nulls': {'x1': 0.002675196277987787,
       'x2': 0.0023844140738586797,
       'x3': 0.0},
      'statistics': array([499.92366097, 234.4110308 , 792.50378949]),
      'running_time': '0.031 s'}}},
   'standard_scaler': {'fn': <function fklearn.training.transformation.standard_scaler.<locals>.p(new_data_set: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
    'log': {'standard_scaler': {'standard_scaler': {'copy': True,
       'with_mean': True,
       'with_std': True},
      'transformed_column': ['x1', 'x2', 'x3'],
      'running_time': '0.018 s'}}},
   'xgb_regression_learner': {'fn': <function fklearn.training.regression.xgb_regression_learner.<locals>.p(new_df: pandas.core.frame.DataFrame, apply_shap: bool = False) -> pandas.core.frame.DataFrame>,
    'log': {'xgb_regression_learner': {'features': ['x1', 'x2', 'x3'],
      'target': 'y',
      'prediction_column': 'prediction',
      'package': 'xgboost',
      'package_version': '0.82',
      'parameters': {'seed': 139,
       'nthread': 8,
       'eta': 0.1,
       'objective': 'reg:linear',
       'num_estimators': 100},
      'feature_importance': {'x2': 2415, 'x3': 514, 'x1': 919},
      'training_samples': 17195,
      'running_time': '1.191 s'}}},
   'ecdfer': {'fn': <function fklearn.training.transformation.ecdfer.<locals>.p(new_df: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
    'log': {'ecdfer': {'nobs': 17195,
      'prediction_column': 'prediction',
      'ascending': True,
      'transformed_column': ['prediction_ecdf'],
      'running_time': '0.006 s'}}}}}}

Evaluate

[15]:
from fklearn.validation.evaluators import combined_evaluators, mean_prediction_evaluator, r2_evaluator, mse_evaluator

my_evaluator = combined_evaluators(
    evaluators=[
        mean_prediction_evaluator(prediction_column='prediction'),
        r2_evaluator(prediction_column='prediction', target_column='y'),
        mse_evaluator(prediction_column='prediction', target_column='y'),
    ]
)
[16]:
test_predictions = prediction_function(outime_outspace_hdout)

my_evaluator(test_predictions)
[16]:
{'mean_evaluator__prediction': 18914.416,
 'r2_evaluator__y': 0.9906295827453058,
 'mse_evaluator__y': 673677.8680663708}
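
The same prediction function and evaluator apply unchanged to the other holdout sets, for example:

my_evaluator(prediction_function(intime_outspace_hdout))
my_evaluator(prediction_function(outime_inspace_hdout))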

Run K-Fold Cross Validation

[45]:
from fklearn.validation.validator import validator
from fklearn.metrics.pd_extractors import extract, evaluator_extractor
[18]:
from fklearn.validation.splitters import k_fold_splitter

my_split_fn = k_fold_splitter(n_splits=2, random_state=42)

cv_results = validator(train_set, split_fn=my_split_fn, train_fn=my_learner, eval_fn=my_evaluator)
[19]:
extract(cv_results['validator_log'], evaluator_extractor(evaluator_name='r2_evaluator__y'))
[19]:
r2_evaluator__y fold_num train_size test_size
0 0.995754 0 8597 8598
0 0.996796 1 8598 8597
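
Since extract returns a plain DataFrame, fold results can be aggregated directly, e.g. the mean out-of-fold R² (a sketch reusing the call above):

cv_df = extract(cv_results['validator_log'], evaluator_extractor(evaluator_name='r2_evaluator__y'))
cv_df['r2_evaluator__y'].mean()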

Run Reverse Time Learning Curve Validation

[20]:
from fklearn.validation.splitters import reverse_time_learning_curve_splitter

lc_split_fn = reverse_time_learning_curve_splitter(
    training_time_limit=train_end,
    time_column='score_date',
    freq='W',
    min_samples=500,
)

lc_results = validator(train_set, split_fn=lc_split_fn, train_fn=my_learner, eval_fn=my_evaluator)
[21]:
extract(lc_results['validator_log'], evaluator_extractor(evaluator_name='r2_evaluator__y'))
[21]:
r2_evaluator__y fold_num train_start train_end train_size test_start test_end test_size
0 0.992036 0 2016-06-19 2016-06-26 760 2016-06-27 2016-06-29 285
0 0.999510 1 2016-06-12 2016-06-26 1425 2016-06-27 2016-06-29 285
0 0.999422 2 2016-06-05 2016-06-26 2090 2016-06-27 2016-06-29 285
0 0.999832 3 2016-05-29 2016-06-26 2755 2016-06-27 2016-06-29 285
0 0.999643 4 2016-05-22 2016-06-26 3420 2016-06-27 2016-06-29 285
0 0.999692 5 2016-05-15 2016-06-26 4085 2016-06-27 2016-06-29 285
0 0.999651 6 2016-05-08 2016-06-26 4750 2016-06-27 2016-06-29 285
0 0.999730 7 2016-05-01 2016-06-26 5415 2016-06-27 2016-06-29 285
0 0.999647 8 2016-04-24 2016-06-26 6080 2016-06-27 2016-06-29 285
0 0.999898 9 2016-04-17 2016-06-26 6745 2016-06-27 2016-06-29 285
0 0.999858 10 2016-04-10 2016-06-26 7410 2016-06-27 2016-06-29 285
0 0.999951 11 2016-04-03 2016-06-26 8075 2016-06-27 2016-06-29 285
0 0.999948 12 2016-03-27 2016-06-26 8740 2016-06-27 2016-06-29 285
0 0.999962 13 2016-03-20 2016-06-26 9405 2016-06-27 2016-06-29 285
0 0.999964 14 2016-03-13 2016-06-26 10070 2016-06-27 2016-06-29 285
0 0.999967 15 2016-03-06 2016-06-26 10735 2016-06-27 2016-06-29 285
0 0.999965 16 2016-02-28 2016-06-26 11400 2016-06-27 2016-06-29 285
0 0.999874 17 2016-02-21 2016-06-26 12065 2016-06-27 2016-06-29 285
0 0.999955 18 2016-02-14 2016-06-26 12730 2016-06-27 2016-06-29 285
0 0.999945 19 2016-02-07 2016-06-26 13395 2016-06-27 2016-06-29 285
0 0.999947 20 2016-01-31 2016-06-26 14060 2016-06-27 2016-06-29 285
0 0.999925 21 2016-01-24 2016-06-26 14725 2016-06-27 2016-06-29 285
0 0.998547 22 2016-01-17 2016-06-26 15390 2016-06-27 2016-06-29 285
0 0.997911 23 2016-01-10 2016-06-26 16055 2016-06-27 2016-06-29 285
0 0.998664 24 2016-01-03 2016-06-26 16720 2016-06-27 2016-06-29 285
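
Plotting R² against the training size makes the learning curve easier to read (a sketch using the matplotlib import from the top of the notebook):

lc_df = extract(lc_results['validator_log'], evaluator_extractor(evaluator_name='r2_evaluator__y'))
lc_df.plot(x='train_size', y='r2_evaluator__y')
plt.show()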

Run Stability Curve

[22]:
from fklearn.validation.splitters import stability_curve_time_splitter

sc_split_fn = stability_curve_time_splitter(
    training_time_limit='2016-06-01',
    time_column='score_date',
    freq='M',
    min_samples=1000,
)

sc_results = validator(data, split_fn=sc_split_fn, train_fn=my_learner, eval_fn=my_evaluator)
[23]:
extract(sc_results['validator_log'], evaluator_extractor(evaluator_name='r2_evaluator__y'))
[23]:
r2_evaluator__y fold_num train_start train_end train_size test_start test_end test_size
0 0.998124 0 2016-01-01 2016-06-01 15300 2016-06-02 2016-06-30 2900
0 0.993554 0 2016-01-01 2016-06-01 15300 2016-07-01 2016-07-31 3100
0 0.998540 0 2016-01-01 2016-06-01 15300 2016-08-01 2016-08-31 3100
0 0.997695 0 2016-01-01 2016-06-01 15300 2016-09-01 2016-09-30 3000
0 0.995735 0 2016-01-01 2016-06-01 15300 2016-10-01 2016-10-31 3100
0 0.997235 0 2016-01-01 2016-06-01 15300 2016-11-01 2016-11-30 3000
0 0.998376 0 2016-01-01 2016-06-01 15300 2016-12-01 2016-12-31 3100
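
Likewise, plotting R² against the holdout start date shows how performance holds up as scoring moves further from the training window (sketch):

sc_df = extract(sc_results['validator_log'], evaluator_extractor(evaluator_name='r2_evaluator__y'))
sc_df.plot(x='test_start', y='r2_evaluator__y')
plt.show()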

Run Forward Stability Curve

[24]:
from datetime import timedelta
from fklearn.validation.splitters import forward_stability_curve_time_splitter

fw_split_fn = forward_stability_curve_time_splitter(
    training_time_start='2016-01-01',
    training_time_end='2016-03-01',
    holdout_gap=timedelta(days=30),
    holdout_size=timedelta(days=30),
    step=timedelta(days=30),
    time_column='score_date',
)

fw_sc_results = validator(data, split_fn=fw_split_fn, train_fn=my_learner, eval_fn=my_evaluator)
[25]:
extract(fw_sc_results['validator_log'], evaluator_extractor(evaluator_name='r2_evaluator__y'))
[25]:
r2_evaluator__y fold_num train_start train_end train_size test_start test_end test_size
0 0.995640 0 2016-01-01 2016-02-29 6000 2016-03-31 2016-04-29 3000
0 0.992730 1 2016-01-31 2016-03-30 6000 2016-04-30 2016-05-29 3000
0 0.998711 2 2016-03-01 2016-04-29 6000 2016-05-30 2016-06-28 3000
0 0.992840 3 2016-03-31 2016-05-29 6000 2016-06-29 2016-07-28 3000
0 0.996390 4 2016-04-30 2016-06-28 6000 2016-07-29 2016-08-27 3000
0 0.996138 5 2016-05-30 2016-07-28 6000 2016-08-28 2016-09-26 3000
0 0.994155 6 2016-06-29 2016-08-27 6000 2016-09-27 2016-10-26 3000
0 0.997055 7 2016-07-29 2016-09-26 6000 2016-10-27 2016-11-25 3000
0 0.998399 8 2016-08-28 2016-10-26 6000 2016-11-26 2016-12-25 3000

Run Spatial Learning Curve

[26]:
from fklearn.validation.splitters import spatial_learning_curve_splitter

spatial_split_fn = spatial_learning_curve_splitter(
    train_percentages=np.linspace(0.1, 1, 10),
    space_column='id',
    time_column='score_date',
    training_limit='2016-06-01',
    holdout_gap=timedelta(days=180),
    random_state=0,
)

spatial_lc_results = validator(data, split_fn=spatial_split_fn, train_fn=my_learner, eval_fn=my_evaluator)
[27]:
extract(spatial_lc_results['validator_log'], evaluator_extractor(evaluator_name='r2_evaluator__y'))
[27]:
r2_evaluator__y fold_num train_start train_end train_size test_start test_end test_size percentage
0 0.994729 0 2016-01-01 2016-06-01 1530 2016-11-29 2016-12-31 3300 0.1
0 0.990840 1 2016-01-01 2016-06-01 3060 2016-11-29 2016-12-31 3300 0.2
0 0.997053 2 2016-01-01 2016-06-01 4590 2016-11-29 2016-12-31 3300 0.3
0 0.994885 3 2016-01-01 2016-06-01 6120 2016-11-29 2016-12-31 3300 0.4
0 0.996923 4 2016-01-01 2016-06-01 7650 2016-11-29 2016-12-31 3300 0.5
0 0.997560 5 2016-01-01 2016-06-01 9180 2016-11-29 2016-12-31 3300 0.6
0 0.998197 6 2016-01-01 2016-06-01 10710 2016-11-29 2016-12-31 3300 0.7
0 0.998720 7 2016-01-01 2016-06-01 12240 2016-11-29 2016-12-31 3300 0.8
0 0.997620 8 2016-01-01 2016-06-01 13770 2016-11-29 2016-12-31 3300 0.9
0 0.998228 9 2016-01-01 2016-06-01 15300 2016-11-29 2016-12-31 3300 1.0

Feature Selection

[28]:
from toolz import curry
from fklearn.tuning.selectors import feature_importance_backward_selection

# Columns that are not model features may still be needed for evaluation; keep them in an auxiliary list.
AUXILIARY_COLUMNS = list(set(train_set.columns) - set(FEATURES))

base_common_extractor = evaluator_extractor(evaluator_name='r2_evaluator__y')

@curry
def selector_pipeline(train_set, features):
    pipeline = build_pipeline(
        imputer(columns_to_impute=features, impute_strategy='median'),
        standard_scaler(columns_to_scale=features),
        xgb_regression_learner(
            features=features,
            target='y',
            prediction_column='prediction',
            extra_params={'seed': 139, 'nthread': 8}
        ),
        ecdfer(prediction_column='prediction', ecdf_column='prediction_ecdf'),
    )
    return pipeline(train_set)

logs_fibs = feature_importance_backward_selection(
    train_set,
    selector_pipeline,
    FEATURES,
    my_split_fn,
    my_evaluator,
    base_common_extractor,
    metric_name='r2_evaluator__y',
    num_removed_by_step=1,
    threshold=0.0001,
    early_stop=100,
    iter_limit=3,
    min_remaining_features=1,
)

logs_fibs
[28]:
[{'train_log': [{'imputer': {'impute_strategy': 'median',
     'columns_to_impute': ['x2'],
     'training_proportion_of_nulls': {'x2': 0.0018611143422123998},
     'statistics': array([234.52197656]),
     'running_time': '0.012 s'},
    'standard_scaler': {'standard_scaler': {'copy': True,
      'with_mean': True,
      'with_std': True},
     'transformed_column': ['x2'],
     'running_time': '0.009 s'},
    'xgb_regression_learner': {'features': ['x2'],
     'target': 'y',
     'prediction_column': 'prediction',
     'package': 'xgboost',
     'package_version': '0.82',
     'parameters': {'seed': 139,
      'nthread': 8,
      'eta': 0.1,
      'objective': 'reg:linear',
      'num_estimators': 100},
     'feature_importance': {'x2': 2833},
     'training_samples': 8597,
     'running_time': '0.447 s'},
    'ecdfer': {'nobs': 8597,
     'prediction_column': 'prediction',
     'ascending': True,
     'transformed_column': ['prediction_ecdf'],
     'running_time': '0.003 s'},
    '__fkml__': {'pipeline': ['imputer',
      'standard_scaler',
      'xgb_regression_learner',
      'ecdfer'],
     'output_columns': ['id',
      'score_date',
      'x1',
      'x2',
      'x3',
      'y',
      'prediction',
      'prediction_ecdf'],
     'features': ['id', 'score_date', 'x1', 'x2', 'x3', 'y'],
     'learners': {'imputer': {'fn': <function fklearn.training.imputation.imputer.<locals>.p(new_data_set: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'imputer': {'impute_strategy': 'median',
         'columns_to_impute': ['x2'],
         'training_proportion_of_nulls': {'x2': 0.0018611143422123998},
         'statistics': array([234.52197656]),
         'running_time': '0.012 s'}}},
      'standard_scaler': {'fn': <function fklearn.training.transformation.standard_scaler.<locals>.p(new_data_set: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'standard_scaler': {'standard_scaler': {'copy': True,
          'with_mean': True,
          'with_std': True},
         'transformed_column': ['x2'],
         'running_time': '0.009 s'}}},
      'xgb_regression_learner': {'fn': <function fklearn.training.regression.xgb_regression_learner.<locals>.p(new_df: pandas.core.frame.DataFrame, apply_shap: bool = False) -> pandas.core.frame.DataFrame>,
       'log': {'xgb_regression_learner': {'features': ['x2'],
         'target': 'y',
         'prediction_column': 'prediction',
         'package': 'xgboost',
         'package_version': '0.82',
         'parameters': {'seed': 139,
          'nthread': 8,
          'eta': 0.1,
          'objective': 'reg:linear',
          'num_estimators': 100},
         'feature_importance': {'x2': 2833},
         'training_samples': 8597,
         'running_time': '0.447 s'}}},
      'ecdfer': {'fn': <function fklearn.training.transformation.ecdfer.<locals>.p(new_df: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'ecdfer': {'nobs': 8597,
         'prediction_column': 'prediction',
         'ascending': True,
         'transformed_column': ['prediction_ecdf'],
         'running_time': '0.003 s'}}}}}},
   {'imputer': {'impute_strategy': 'median',
     'columns_to_impute': ['x2'],
     'training_proportion_of_nulls': {'x2': 0.002907652942544778},
     'statistics': array([234.0620775]),
     'running_time': '0.009 s'},
    'standard_scaler': {'standard_scaler': {'copy': True,
      'with_mean': True,
      'with_std': True},
     'transformed_column': ['x2'],
     'running_time': '0.005 s'},
    'xgb_regression_learner': {'features': ['x2'],
     'target': 'y',
     'prediction_column': 'prediction',
     'package': 'xgboost',
     'package_version': '0.82',
     'parameters': {'seed': 139,
      'nthread': 8,
      'eta': 0.1,
      'objective': 'reg:linear',
      'num_estimators': 100},
     'feature_importance': {'x2': 2682},
     'training_samples': 8598,
     'running_time': '0.464 s'},
    'ecdfer': {'nobs': 8598,
     'prediction_column': 'prediction',
     'ascending': True,
     'transformed_column': ['prediction_ecdf'],
     'running_time': '0.004 s'},
    '__fkml__': {'pipeline': ['imputer',
      'standard_scaler',
      'xgb_regression_learner',
      'ecdfer'],
     'output_columns': ['id',
      'score_date',
      'x1',
      'x2',
      'x3',
      'y',
      'prediction',
      'prediction_ecdf'],
     'features': ['id', 'score_date', 'x1', 'x2', 'x3', 'y'],
     'learners': {'imputer': {'fn': <function fklearn.training.imputation.imputer.<locals>.p(new_data_set: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'imputer': {'impute_strategy': 'median',
         'columns_to_impute': ['x2'],
         'training_proportion_of_nulls': {'x2': 0.002907652942544778},
         'statistics': array([234.0620775]),
         'running_time': '0.009 s'}}},
      'standard_scaler': {'fn': <function fklearn.training.transformation.standard_scaler.<locals>.p(new_data_set: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'standard_scaler': {'standard_scaler': {'copy': True,
          'with_mean': True,
          'with_std': True},
         'transformed_column': ['x2'],
         'running_time': '0.005 s'}}},
      'xgb_regression_learner': {'fn': <function fklearn.training.regression.xgb_regression_learner.<locals>.p(new_df: pandas.core.frame.DataFrame, apply_shap: bool = False) -> pandas.core.frame.DataFrame>,
       'log': {'xgb_regression_learner': {'features': ['x2'],
         'target': 'y',
         'prediction_column': 'prediction',
         'package': 'xgboost',
         'package_version': '0.82',
         'parameters': {'seed': 139,
          'nthread': 8,
          'eta': 0.1,
          'objective': 'reg:linear',
          'num_estimators': 100},
         'feature_importance': {'x2': 2682},
         'training_samples': 8598,
         'running_time': '0.464 s'}}},
      'ecdfer': {'fn': <function fklearn.training.transformation.ecdfer.<locals>.p(new_df: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'ecdfer': {'nobs': 8598,
         'prediction_column': 'prediction',
         'ascending': True,
         'transformed_column': ['prediction_ecdf'],
         'running_time': '0.004 s'}}}}}}],
  'validator_log': [{'fold_num': 0,
    'eval_results': [{'mean_evaluator__prediction': 19001.398,
      'r2_evaluator__y': 0.9967869656965197,
      'mse_evaluator__y': 264683.44314774574}],
    'split_log': {'train_size': 8597, 'test_size': 8598}},
   {'fold_num': 1,
    'eval_results': [{'mean_evaluator__prediction': 19067.227,
      'r2_evaluator__y': 0.9972940430229065,
      'mse_evaluator__y': 222424.24657619736}],
    'split_log': {'train_size': 8598, 'test_size': 8597}}]},
 {'train_log': [{'imputer': {'impute_strategy': 'median',
     'columns_to_impute': ['x2', 'x1'],
     'training_proportion_of_nulls': {'x2': 0.0018611143422123998,
      'x1': 0.0032569500988716992},
     'statistics': array([234.52197656, 499.86013654]),
     'running_time': '0.016 s'},
    'standard_scaler': {'standard_scaler': {'copy': True,
      'with_mean': True,
      'with_std': True},
     'transformed_column': ['x2', 'x1'],
     'running_time': '0.008 s'},
    'xgb_regression_learner': {'features': ['x2', 'x1'],
     'target': 'y',
     'prediction_column': 'prediction',
     'package': 'xgboost',
     'package_version': '0.82',
     'parameters': {'seed': 139,
      'nthread': 8,
      'eta': 0.1,
      'objective': 'reg:linear',
      'num_estimators': 100},
     'feature_importance': {'x2': 2116, 'x1': 1034},
     'training_samples': 8597,
     'running_time': '0.490 s'},
    'ecdfer': {'nobs': 8597,
     'prediction_column': 'prediction',
     'ascending': True,
     'transformed_column': ['prediction_ecdf'],
     'running_time': '0.003 s'},
    '__fkml__': {'pipeline': ['imputer',
      'standard_scaler',
      'xgb_regression_learner',
      'ecdfer'],
     'output_columns': ['id',
      'score_date',
      'x1',
      'x2',
      'x3',
      'y',
      'prediction',
      'prediction_ecdf'],
     'features': ['id', 'score_date', 'x1', 'x2', 'x3', 'y'],
     'learners': {'imputer': {'fn': <function fklearn.training.imputation.imputer.<locals>.p(new_data_set: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'imputer': {'impute_strategy': 'median',
         'columns_to_impute': ['x2', 'x1'],
         'training_proportion_of_nulls': {'x2': 0.0018611143422123998,
          'x1': 0.0032569500988716992},
         'statistics': array([234.52197656, 499.86013654]),
         'running_time': '0.016 s'}}},
      'standard_scaler': {'fn': <function fklearn.training.transformation.standard_scaler.<locals>.p(new_data_set: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'standard_scaler': {'standard_scaler': {'copy': True,
          'with_mean': True,
          'with_std': True},
         'transformed_column': ['x2', 'x1'],
         'running_time': '0.008 s'}}},
      'xgb_regression_learner': {'fn': <function fklearn.training.regression.xgb_regression_learner.<locals>.p(new_df: pandas.core.frame.DataFrame, apply_shap: bool = False) -> pandas.core.frame.DataFrame>,
       'log': {'xgb_regression_learner': {'features': ['x2', 'x1'],
         'target': 'y',
         'prediction_column': 'prediction',
         'package': 'xgboost',
         'package_version': '0.82',
         'parameters': {'seed': 139,
          'nthread': 8,
          'eta': 0.1,
          'objective': 'reg:linear',
          'num_estimators': 100},
         'feature_importance': {'x2': 2116, 'x1': 1034},
         'training_samples': 8597,
         'running_time': '0.490 s'}}},
      'ecdfer': {'fn': <function fklearn.training.transformation.ecdfer.<locals>.p(new_df: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'ecdfer': {'nobs': 8597,
         'prediction_column': 'prediction',
         'ascending': True,
         'transformed_column': ['prediction_ecdf'],
         'running_time': '0.003 s'}}}}}},
   {'imputer': {'impute_strategy': 'median',
     'columns_to_impute': ['x2', 'x1'],
     'training_proportion_of_nulls': {'x2': 0.002907652942544778,
      'x1': 0.00209351011863224},
     'statistics': array([234.0620775 , 499.96967609]),
     'running_time': '0.011 s'},
    'standard_scaler': {'standard_scaler': {'copy': True,
      'with_mean': True,
      'with_std': True},
     'transformed_column': ['x2', 'x1'],
     'running_time': '0.006 s'},
    'xgb_regression_learner': {'features': ['x2', 'x1'],
     'target': 'y',
     'prediction_column': 'prediction',
     'package': 'xgboost',
     'package_version': '0.82',
     'parameters': {'seed': 139,
      'nthread': 8,
      'eta': 0.1,
      'objective': 'reg:linear',
      'num_estimators': 100},
     'feature_importance': {'x2': 1998, 'x1': 767},
     'training_samples': 8598,
     'running_time': '0.545 s'},
    'ecdfer': {'nobs': 8598,
     'prediction_column': 'prediction',
     'ascending': True,
     'transformed_column': ['prediction_ecdf'],
     'running_time': '0.003 s'},
    '__fkml__': {'pipeline': ['imputer',
      'standard_scaler',
      'xgb_regression_learner',
      'ecdfer'],
     'output_columns': ['id',
      'score_date',
      'x1',
      'x2',
      'x3',
      'y',
      'prediction',
      'prediction_ecdf'],
     'features': ['id', 'score_date', 'x1', 'x2', 'x3', 'y'],
     'learners': {'imputer': {'fn': <function fklearn.training.imputation.imputer.<locals>.p(new_data_set: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'imputer': {'impute_strategy': 'median',
         'columns_to_impute': ['x2', 'x1'],
         'training_proportion_of_nulls': {'x2': 0.002907652942544778,
          'x1': 0.00209351011863224},
         'statistics': array([234.0620775 , 499.96967609]),
         'running_time': '0.011 s'}}},
      'standard_scaler': {'fn': <function fklearn.training.transformation.standard_scaler.<locals>.p(new_data_set: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'standard_scaler': {'standard_scaler': {'copy': True,
          'with_mean': True,
          'with_std': True},
         'transformed_column': ['x2', 'x1'],
         'running_time': '0.006 s'}}},
      'xgb_regression_learner': {'fn': <function fklearn.training.regression.xgb_regression_learner.<locals>.p(new_df: pandas.core.frame.DataFrame, apply_shap: bool = False) -> pandas.core.frame.DataFrame>,
       'log': {'xgb_regression_learner': {'features': ['x2', 'x1'],
         'target': 'y',
         'prediction_column': 'prediction',
         'package': 'xgboost',
         'package_version': '0.82',
         'parameters': {'seed': 139,
          'nthread': 8,
          'eta': 0.1,
          'objective': 'reg:linear',
          'num_estimators': 100},
         'feature_importance': {'x2': 1998, 'x1': 767},
         'training_samples': 8598,
         'running_time': '0.545 s'}}},
      'ecdfer': {'fn': <function fklearn.training.transformation.ecdfer.<locals>.p(new_df: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'ecdfer': {'nobs': 8598,
         'prediction_column': 'prediction',
         'ascending': True,
         'transformed_column': ['prediction_ecdf'],
         'running_time': '0.003 s'}}}}}}],
  'validator_log': [{'fold_num': 0,
    'eval_results': [{'mean_evaluator__prediction': 18998.422,
      'r2_evaluator__y': 0.996420620394994,
      'mse_evaluator__y': 294862.2481744438}],
    'split_log': {'train_size': 8597, 'test_size': 8598}},
   {'fold_num': 1,
    'eval_results': [{'mean_evaluator__prediction': 19067.477,
      'r2_evaluator__y': 0.9971315905847774,
      'mse_evaluator__y': 235777.51178373926}],
    'split_log': {'train_size': 8598, 'test_size': 8597}}]},
 {'train_log': [{'imputer': {'impute_strategy': 'median',
     'columns_to_impute': ['x1', 'x2', 'x3'],
     'training_proportion_of_nulls': {'x1': 0.0032569500988716992,
      'x2': 0.0018611143422123998,
      'x3': 0.0},
     'statistics': array([499.86013654, 234.52197656, 792.01836178]),
     'running_time': '0.017 s'},
    'standard_scaler': {'standard_scaler': {'copy': True,
      'with_mean': True,
      'with_std': True},
     'transformed_column': ['x1', 'x2', 'x3'],
     'running_time': '0.008 s'},
    'xgb_regression_learner': {'features': ['x1', 'x2', 'x3'],
     'target': 'y',
     'prediction_column': 'prediction',
     'package': 'xgboost',
     'package_version': '0.82',
     'parameters': {'seed': 139,
      'nthread': 8,
      'eta': 0.1,
      'objective': 'reg:linear',
      'num_estimators': 100},
     'feature_importance': {'x2': 1990, 'x3': 401, 'x1': 830},
     'training_samples': 8597,
     'running_time': '0.697 s'},
    'ecdfer': {'nobs': 8597,
     'prediction_column': 'prediction',
     'ascending': True,
     'transformed_column': ['prediction_ecdf'],
     'running_time': '0.005 s'},
    '__fkml__': {'pipeline': ['imputer',
      'standard_scaler',
      'xgb_regression_learner',
      'ecdfer'],
     'output_columns': ['id',
      'score_date',
      'x1',
      'x2',
      'x3',
      'y',
      'prediction',
      'prediction_ecdf'],
     'features': ['id', 'score_date', 'x1', 'x2', 'x3', 'y'],
     'learners': {'imputer': {'fn': <function fklearn.training.imputation.imputer.<locals>.p(new_data_set: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'imputer': {'impute_strategy': 'median',
         'columns_to_impute': ['x1', 'x2', 'x3'],
         'training_proportion_of_nulls': {'x1': 0.0032569500988716992,
          'x2': 0.0018611143422123998,
          'x3': 0.0},
         'statistics': array([499.86013654, 234.52197656, 792.01836178]),
         'running_time': '0.017 s'}}},
      'standard_scaler': {'fn': <function fklearn.training.transformation.standard_scaler.<locals>.p(new_data_set: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'standard_scaler': {'standard_scaler': {'copy': True,
          'with_mean': True,
          'with_std': True},
         'transformed_column': ['x1', 'x2', 'x3'],
         'running_time': '0.008 s'}}},
      'xgb_regression_learner': {'fn': <function fklearn.training.regression.xgb_regression_learner.<locals>.p(new_df: pandas.core.frame.DataFrame, apply_shap: bool = False) -> pandas.core.frame.DataFrame>,
       'log': {'xgb_regression_learner': {'features': ['x1', 'x2', 'x3'],
         'target': 'y',
         'prediction_column': 'prediction',
         'package': 'xgboost',
         'package_version': '0.82',
         'parameters': {'seed': 139,
          'nthread': 8,
          'eta': 0.1,
          'objective': 'reg:linear',
          'num_estimators': 100},
         'feature_importance': {'x2': 1990, 'x3': 401, 'x1': 830},
         'training_samples': 8597,
         'running_time': '0.697 s'}}},
      'ecdfer': {'fn': <function fklearn.training.transformation.ecdfer.<locals>.p(new_df: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'ecdfer': {'nobs': 8597,
         'prediction_column': 'prediction',
         'ascending': True,
         'transformed_column': ['prediction_ecdf'],
         'running_time': '0.005 s'}}}}}},
   {'imputer': {'impute_strategy': 'median',
     'columns_to_impute': ['x1', 'x2', 'x3'],
     'training_proportion_of_nulls': {'x1': 0.00209351011863224,
      'x2': 0.002907652942544778,
      'x3': 0.0},
     'statistics': array([499.96967609, 234.0620775 , 793.02418833]),
     'running_time': '0.020 s'},
    'standard_scaler': {'standard_scaler': {'copy': True,
      'with_mean': True,
      'with_std': True},
     'transformed_column': ['x1', 'x2', 'x3'],
     'running_time': '0.012 s'},
    'xgb_regression_learner': {'features': ['x1', 'x2', 'x3'],
     'target': 'y',
     'prediction_column': 'prediction',
     'package': 'xgboost',
     'package_version': '0.82',
     'parameters': {'seed': 139,
      'nthread': 8,
      'eta': 0.1,
      'objective': 'reg:linear',
      'num_estimators': 100},
     'feature_importance': {'x2': 1799, 'x1': 689, 'x3': 312},
     'training_samples': 8598,
     'running_time': '0.632 s'},
    'ecdfer': {'nobs': 8598,
     'prediction_column': 'prediction',
     'ascending': True,
     'transformed_column': ['prediction_ecdf'],
     'running_time': '0.004 s'},
    '__fkml__': {'pipeline': ['imputer',
      'standard_scaler',
      'xgb_regression_learner',
      'ecdfer'],
     'output_columns': ['id',
      'score_date',
      'x1',
      'x2',
      'x3',
      'y',
      'prediction',
      'prediction_ecdf'],
     'features': ['id', 'score_date', 'x1', 'x2', 'x3', 'y'],
     'learners': {'imputer': {'fn': <function fklearn.training.imputation.imputer.<locals>.p(new_data_set: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'imputer': {'impute_strategy': 'median',
         'columns_to_impute': ['x1', 'x2', 'x3'],
         'training_proportion_of_nulls': {'x1': 0.00209351011863224,
          'x2': 0.002907652942544778,
          'x3': 0.0},
         'statistics': array([499.96967609, 234.0620775 , 793.02418833]),
         'running_time': '0.020 s'}}},
      'standard_scaler': {'fn': <function fklearn.training.transformation.standard_scaler.<locals>.p(new_data_set: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'standard_scaler': {'standard_scaler': {'copy': True,
          'with_mean': True,
          'with_std': True},
         'transformed_column': ['x1', 'x2', 'x3'],
         'running_time': '0.012 s'}}},
      'xgb_regression_learner': {'fn': <function fklearn.training.regression.xgb_regression_learner.<locals>.p(new_df: pandas.core.frame.DataFrame, apply_shap: bool = False) -> pandas.core.frame.DataFrame>,
       'log': {'xgb_regression_learner': {'features': ['x1', 'x2', 'x3'],
         'target': 'y',
         'prediction_column': 'prediction',
         'package': 'xgboost',
         'package_version': '0.82',
         'parameters': {'seed': 139,
          'nthread': 8,
          'eta': 0.1,
          'objective': 'reg:linear',
          'num_estimators': 100},
         'feature_importance': {'x2': 1799, 'x1': 689, 'x3': 312},
         'training_samples': 8598,
         'running_time': '0.632 s'}}},
      'ecdfer': {'fn': <function fklearn.training.transformation.ecdfer.<locals>.p(new_df: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame>,
       'log': {'ecdfer': {'nobs': 8598,
         'prediction_column': 'prediction',
         'ascending': True,
         'transformed_column': ['prediction_ecdf'],
         'running_time': '0.004 s'}}}}}}],
  'validator_log': [{'fold_num': 0,
    'eval_results': [{'mean_evaluator__prediction': 19002.934,
      'r2_evaluator__y': 0.9957544083219314,
      'mse_evaluator__y': 349743.4877472073}],
    'split_log': {'train_size': 8597, 'test_size': 8598}},
   {'fold_num': 1,
    'eval_results': [{'mean_evaluator__prediction': 19066.275,
      'r2_evaluator__y': 0.9967958244067132,
      'mse_evaluator__y': 263376.819464493}],
    'split_log': {'train_size': 8598, 'test_size': 8597}}]}]

A quick example of how to get the best feature set from the selection logs

[29]:
from fklearn.tuning.utils import get_used_features, get_avg_metric_from_extractor, order_feature_importance_avg_from_logs

features = list(map(get_used_features, logs_fibs))
r2_scores = list(map(lambda log: get_avg_metric_from_extractor(log, base_common_extractor, 'r2_evaluator__y'), logs_fibs))

sorted(zip(features, r2_scores), reverse=True, key=lambda x: x[1])[0]
[29]:
(['x2'], 0.997040504359713)
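
The winning subset can be fed straight back into the curried pipeline to train a final model (a sketch reusing the names defined above):

best_features, best_r2 = max(zip(features, r2_scores), key=lambda t: t[1])
best_prediction_fn, _, _ = selector_pipeline(train_set, best_features)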

Interpretability: SHAP

[38]:
import shap

shap.initjs()
[39]:
holdout = pd.concat((intime_outspace_hdout, outime_inspace_hdout, outime_outspace_hdout))
[40]:
preds = prediction_function(holdout, apply_shap=True)
[41]:
shap_values = np.vstack(preds['shap_values'])

shap_expected_value = preds.shap_expected_value.iloc[0]
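
apply_shap=True attaches one SHAP value per feature to each row, so the stacked array should be n_rows × n_features (a quick sanity check, not in the original notebook):

assert shap_values.shape == (len(preds), len(FEATURES))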
[42]:
shap.summary_plot(shap_values, preds[FEATURES])
../_images/examples_regression_47_0.png
[43]:
shap.dependence_plot('x2', shap_values, preds[FEATURES])
../_images/examples_regression_48_0.png
[44]:
shap.force_plot(shap_expected_value, shap_values[0, :], preds[FEATURES].iloc[0, :])
[44]:
(Interactive SHAP force plot output; it renders only in a live notebook session where initjs() has loaded the Javascript library.)