Source code for fklearn.training.ensemble

from typing import Any, Dict, List, TypeVar

import numpy as np
import numpy.typing as npt
import pandas as pd
from toolz import curry, assoc, compose

from fklearn.training.classification import xgb_classification_learner
from fklearn.common_docstrings import learner_pred_fn_docstring, learner_return_docstring
from fklearn.types import LearnerReturnType
from fklearn.training.utils import log_learner_time

T = TypeVar('T')


@curry
@log_learner_time(learner_name='xgb_octopus_classification_learner')
def xgb_octopus_classification_learner(train_set: pd.DataFrame,
                                       learning_rate_by_bin: Dict[T, float],
                                       num_estimators_by_bin: Dict[T, int],
                                       extra_params_by_bin: Dict[T, Dict[str, Any]],
                                       features_by_bin: Dict[T, List[str]],
                                       train_split_col: str,
                                       train_split_bins: List,
                                       nthread: int,
                                       target_column: str,
                                       prediction_column: str = "prediction") -> LearnerReturnType:
    """
    Octopus ensemble allows you to inject domain-specific knowledge to force a split on an initial feature,
    instead of assuming the tree model will find that split on its own. It works by first defining a split
    on your dataset and then training one individual model on each separated dataset.

    Parameters
    ----------

    train_set: pd.DataFrame
        A Pandas' DataFrame with features, target columns and a splitting column that must be categorical.

    learning_rate_by_bin: dict
        A dictionary with the XGBoost learning rate to use in each model split.
        Ex: if you want to split your training by tenure and you have a tenure column with integer values
        [1,2,3,...,12], you have to specify a learning rate for each split::

            {
                1: 0.08,
                2: 0.08,
                ...
                12: 0.1
            }

    num_estimators_by_bin: dict
        A dictionary with the number of XGBoost tree estimators to use in each model split.
        Ex: if you want to split your training by tenure and you have a tenure column with integer values
        [1,2,3,...,12], you have to specify the number of estimators for each split::

            {
                1: 300,
                2: 250,
                ...
                12: 300
            }

    extra_params_by_bin: dict
        A dictionary of extra-parameter dictionaries for the XGBoost model to use in each model split.
        Ex: if you want to split your training by tenure and you have a tenure column with integer values
        [1,2,3,...,12], you have to specify the extra parameters for each split::

            {
                1: {
                    'reg_alpha': 0.0,
                    'colsample_bytree': 0.4,
                    ...
                    'colsample_bylevel': 0.8
                },
                2: {
                    'reg_alpha': 0.1,
                    'colsample_bytree': 0.6,
                    ...
                    'colsample_bylevel': 0.4
                },
                ...
                12: {
                    'reg_alpha': 0.0,
                    'colsample_bytree': 0.7,
                    ...
                    'colsample_bylevel': 1.0
                }
            }

    features_by_bin: dict
        A dictionary with the list of features to use in each model split.
        Ex: if you want to split your training by tenure and you have a tenure column with integer values
        [1,2,3,...,12], you have to specify a list of features for each split::

            {
                1: [feature-1, feature-2, feature-3, ...],
                2: [feature-1, feature-3, feature-5, ...],
                ...
                12: [feature-2, feature-4, feature-8, ...]
            }

    train_split_col: str
        The name of the categorical column on which the model will make the splits.
        Ex: if you want to split your training by tenure, you can have a categorical column called "tenure".

    train_split_bins: list
        A list with the actual values of the categories from the `train_split_col`.
        Ex: if you want to split your training by tenure and you have a tenure column with integer values
        [1,2,3,...,12], you can pass this list and your training will be split into 12 different models.

    nthread: int
        Number of threads for the XGBoost learners.

    target_column: str
        The name of the target column.

    prediction_column: str
        The name of the column with the predictions from the model.
""" train_fns = {b: xgb_classification_learner(features=features_by_bin[b], learning_rate=learning_rate_by_bin[b], num_estimators=num_estimators_by_bin[b], target=target_column, extra_params=assoc(extra_params_by_bin[b], 'nthread', nthread), prediction_column=prediction_column + "_bin_" + str(b)) for b in train_split_bins} train_sets = {b: train_set[train_set[train_split_col] == b] for b in train_split_bins} train_results = {b: train_fns[b](train_sets[b]) for b in train_split_bins} # train_results is a 3-tuple (prediction functions, predicted train dataset, train logs) pred_fns = {b: train_results[b][0] for b in train_split_bins} train_logs = {b: train_results[b][2] for b in train_split_bins} def p(df: pd.DataFrame) -> pd.DataFrame: pred_fn = compose(*pred_fns.values()) def lookup(df: pd.DataFrame) -> npt.NDArray: idx, cols = pd.factorize(df.pred_bin.values.squeeze()) output = df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx] return output return (pred_fn(df) .assign(pred_bin=prediction_column + "_bin_" + df[train_split_col].astype(str)) .assign(prediction=lookup) .rename(index=str, columns={"prediction": prediction_column}) .drop("pred_bin", axis=1)) p.__doc__ = learner_pred_fn_docstring("xgb_octopus_classification_learner") log = { 'xgb_octopus_classification_learner': { 'features': features_by_bin, 'target': target_column, 'prediction_column': prediction_column, 'package': "xgboost", 'train_logs': train_logs, 'parameters': extra_params_by_bin, 'training_samples': len(train_set) } } return p, p(train_set), log
xgb_octopus_classification_learner.__doc__ += learner_return_docstring("Octopus XGB Classifier")
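
A minimal usage sketch, not part of the module above and written under assumptions: a toy DataFrame with a hypothetical "tenure" split column, made-up features x1/x2, arbitrary hyperparameters, and xgboost installed. It shows how each *_by_bin dictionary carries one entry per value listed in train_split_bins.

import pandas as pd

from fklearn.training.ensemble import xgb_octopus_classification_learner

# Hypothetical toy data: "tenure" is the categorical split column, x1/x2 are features.
train_df = pd.DataFrame({
    "tenure": [1, 1, 1, 1, 2, 2, 2, 2],
    "x1": [0.1, 0.3, 0.2, 0.9, 0.4, 0.8, 0.7, 0.5],
    "x2": [1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0],
    "target": [0, 1, 0, 1, 1, 0, 1, 0],
})

# One entry per value in train_split_bins; all hyperparameter values are illustrative only.
predict_fn, scored_train, logs = xgb_octopus_classification_learner(
    train_set=train_df,
    learning_rate_by_bin={1: 0.1, 2: 0.08},
    num_estimators_by_bin={1: 20, 2: 30},
    extra_params_by_bin={1: {"max_depth": 2}, 2: {"max_depth": 3}},
    features_by_bin={1: ["x1", "x2"], 2: ["x1"]},
    train_split_col="tenure",
    train_split_bins=[1, 2],
    nthread=1,
    target_column="target",
    prediction_column="prediction",
)

new_scores = predict_fn(train_df)  # each row is scored by the model trained for its own bin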