from typing import Any, Dict, List, TypeVar

import pandas as pd
from toolz import curry, assoc, compose

from fklearn.training.classification import xgb_classification_learner
from fklearn.common_docstrings import learner_pred_fn_docstring, learner_return_docstring
from fklearn.types import LearnerReturnType
from fklearn.training.utils import log_learner_time

T = TypeVar('T')

@curry
@log_learner_time(learner_name='xgb_octopus_classification_learner')
def xgb_octopus_classification_learner(train_set: pd.DataFrame,
                                       learning_rate_by_bin: Dict[T, float],
                                       num_estimators_by_bin: Dict[T, int],
                                       extra_params_by_bin: Dict[T, Dict[str, Any]],
                                       features_by_bin: Dict[T, List[str]],
                                       train_split_col: str,
                                       train_split_bins: List,
                                       nthread: int,
                                       target_column: str,
                                       prediction_column: str = "prediction") -> LearnerReturnType:
"""
Octopus ensemble allows you to inject domain specific knowledge to force a split in an initial feature, instead of
assuming the tree model will do that intelligent split on its own. It works by first defining a split on your
dataset and then training one individual model in each separated dataset.
Parameters
----------
train_set: pd.DataFrame
A Pandas' DataFrame with features, target columns and a splitting column that must be categorical.
    learning_rate_by_bin: dict
        A dictionary mapping each split value to the learning rate of its XGBoost model. Ex: if you want to
        split your training by tenure and you have a tenure column with integer values [1,2,3,...,12], you
        have to specify a learning rate for each split::

            {
                1: 0.08,
                2: 0.08,
                ...
                12: 0.1
            }
    num_estimators_by_bin: dict
        A dictionary mapping each split value to the number of tree estimators of its XGBoost model. Ex: if
        you want to split your training by tenure and you have a tenure column with integer values
        [1,2,3,...,12], you have to specify a number of estimators for each split::

            {
                1: 300,
                2: 250,
                ...
                12: 300
            }
    extra_params_by_bin: dict
        A dictionary mapping each split value to a dictionary of extra parameters for its XGBoost model. Ex:
        if you want to split your training by tenure and you have a tenure column with integer values
        [1,2,3,...,12], you have to specify a dictionary of extra parameters for each split::

            {
                1: {
                    'reg_alpha': 0.0,
                    'colsample_bytree': 0.4,
                    ...
                    'colsample_bylevel': 0.8
                },
                2: {
                    'reg_alpha': 0.1,
                    'colsample_bytree': 0.6,
                    ...
                    'colsample_bylevel': 0.4
                },
                ...
                12: {
                    'reg_alpha': 0.0,
                    'colsample_bytree': 0.7,
                    ...
                    'colsample_bylevel': 1.0
                }
            }
    features_by_bin: dict
        A dictionary mapping each split value to the list of features of its model. Ex: if you want to split
        your training by tenure and you have a tenure column with integer values [1,2,3,...,12], you have to
        specify a list of features for each split::

            {
                1: [feature-1, feature-2, feature-3, ...],
                2: [feature-1, feature-3, feature-5, ...],
                ...
                12: [feature-2, feature-4, feature-8, ...]
            }
    train_split_col: str
        The name of the categorical column on which the model will make the splits. Ex: if you want to split
        your training by tenure, you can have a categorical column called "tenure".

    train_split_bins: list
        A list with the actual values of the categories from the `train_split_col`. Ex: if you want to split
        your training by tenure and you have a tenure column with integer values [1,2,3,...,12], you can pass
        this list and you will split your training into 12 different models.

    nthread: int
        Number of threads for the XGBoost learners.

    target_column: str
        The name of the target column.

    prediction_column: str
        The name of the column with the predictions from the model.
    """
    train_fns = {b: xgb_classification_learner(features=features_by_bin[b],
                                               learning_rate=learning_rate_by_bin[b],
                                               num_estimators=num_estimators_by_bin[b],
                                               target=target_column,
                                               extra_params=assoc(extra_params_by_bin[b], 'nthread', nthread),
                                               prediction_column=prediction_column + "_bin_" + str(b))
                 for b in train_split_bins}
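
    # Slice the training set so each learner sees only the rows of its bin.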
    train_sets = {b: train_set[train_set[train_split_col] == b]
                  for b in train_split_bins}
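
    # Fit each per-bin learner on its own slice of the data.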
    train_results = {b: train_fns[b](train_sets[b])
                     for b in train_split_bins}

    # Each entry of train_results is a 3-tuple: (prediction function, scored train set, train log).
    pred_fns = {b: train_results[b][0] for b in train_split_bins}
    train_logs = {b: train_results[b][2] for b in train_split_bins}
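
    # The returned prediction function scores the input with every per-bin model
    # and then keeps, for each row, the prediction from its own bin's model.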
    def p(df: pd.DataFrame) -> pd.DataFrame:
        # Chain every per-bin prediction function; each one appends its own
        # "<prediction_column>_bin_<b>" column to the scored DataFrame.
        pred_fn = compose(*pred_fns.values())

        return (pred_fn(df)
                # For each row, store the name of the column that holds its bin's prediction.
                .assign(pred_bin=prediction_column + "_bin_" + df[train_split_col].astype(str))
                # Row-wise selection of the value in the column named by pred_bin.
                # Note: DataFrame.lookup was deprecated in pandas 1.2 and removed in 2.0.
                .assign(prediction=lambda d: d.lookup(d.index.values,
                                                      d.pred_bin.values.squeeze()))
                .rename(index=str, columns={"prediction": prediction_column})
                .drop("pred_bin", axis=1))
    p.__doc__ = learner_pred_fn_docstring("xgb_octopus_classification_learner")

    log = {
        'xgb_octopus_classification_learner': {
            'features': features_by_bin,
            'target': target_column,
            'prediction_column': prediction_column,
            'package': "xgboost",
            'train_logs': train_logs,
            'parameters': extra_params_by_bin,
            'training_samples': len(train_set)
        }
    }

    return p, p(train_set), log

xgb_octopus_classification_learner.__doc__ += learner_return_docstring("Octopus XGB Classifier")
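

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of fklearn): trains an octopus
# ensemble on a toy dataset split by a hypothetical "tenure" column with bins
# [1, 2, 3]. Every column name and hyperparameter below is an assumption made
# for this example, not an fklearn default.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import numpy as np

    rng = np.random.RandomState(42)
    bins = [1, 2, 3]
    toy = pd.DataFrame({
        "tenure": rng.choice(bins, size=300),
        "feature_a": rng.normal(size=300),
        "feature_b": rng.normal(size=300),
        "target": rng.binomial(1, 0.3, size=300),
    })

    predict_fn, scored_train, logs = xgb_octopus_classification_learner(
        train_set=toy,
        learning_rate_by_bin={b: 0.1 for b in bins},
        num_estimators_by_bin={b: 50 for b in bins},
        extra_params_by_bin={b: {"max_depth": 3} for b in bins},
        features_by_bin={b: ["feature_a", "feature_b"] for b in bins},
        train_split_col="tenure",
        train_split_bins=bins,
        nthread=1,
        target_column="target",
    )

    # predict_fn can now score any DataFrame carrying the same columns.
    print(scored_train[["tenure", "prediction"]].head())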