Source code for fklearn.training.ensemble

from typing import Any, Dict, List, TypeVar

import numpy as np
import numpy.typing as npt
import pandas as pd
from toolz import curry, assoc, compose

from fklearn.training.classification import xgb_classification_learner
from fklearn.common_docstrings import learner_pred_fn_docstring, learner_return_docstring
from fklearn.types import LearnerReturnType
from fklearn.training.utils import log_learner_time

T = TypeVar('T')


@curry
@log_learner_time(learner_name='xgb_octopus_classification_learner')
def xgb_octopus_classification_learner(train_set: pd.DataFrame,
                                       learning_rate_by_bin: Dict[T, float],
                                       num_estimators_by_bin: Dict[T, int],
                                       extra_params_by_bin: Dict[T, Dict[str, Any]],
                                       features_by_bin: Dict[T, List[str]],
                                       train_split_col: str,
                                       train_split_bins: List,
                                       nthread: int,
                                       target_column: str,
                                       prediction_column: str = "prediction") -> LearnerReturnType:
    """
    Octopus ensemble allows you to inject domain-specific knowledge to force a split on an initial feature,
    instead of assuming the tree model will find that split on its own. It works by first defining a split
    on your dataset and then training one individual model on each separated dataset.

    Parameters
    ----------

    train_set: pd.DataFrame
        A Pandas' DataFrame with features, target columns and a splitting column that must be categorical.

    learning_rate_by_bin: dict
        A dictionary with the XGBoost learning rate to use in each model split.
        Ex: if you want to split your training by tenure and you have a tenure column with integer values
        [1,2,3,...,12], you have to specify a learning rate for each split::

            {
                1: 0.08,
                2: 0.08,
                ...
                12: 0.1
            }

    num_estimators_by_bin: dict
        A dictionary with the number of XGBoost tree estimators to use in each model split.
        Ex: if you want to split your training by tenure and you have a tenure column with integer values
        [1,2,3,...,12], you have to specify the number of estimators for each split::

            {
                1: 300,
                2: 250,
                ...
                12: 300
            }

    extra_params_by_bin: dict
        A dictionary of extra-parameter dictionaries for the XGBoost model to use in each model split.
        Ex: if you want to split your training by tenure and you have a tenure column with integer values
        [1,2,3,...,12], you have to specify the extra parameters for each split::

            {
                1: {
                    'reg_alpha': 0.0,
                    'colsample_bytree': 0.4,
                    ...
                    'colsample_bylevel': 0.8
                },
                2: {
                    'reg_alpha': 0.1,
                    'colsample_bytree': 0.6,
                    ...
                    'colsample_bylevel': 0.4
                },
                ...
                12: {
                    'reg_alpha': 0.0,
                    'colsample_bytree': 0.7,
                    ...
                    'colsample_bylevel': 1.0
                }
            }

    features_by_bin: dict
        A dictionary with the list of features to use in each model split.
        Ex: if you want to split your training by tenure and you have a tenure column with integer values
        [1,2,3,...,12], you have to specify a list of features for each split::

            {
                1: [feature-1, feature-2, feature-3, ...],
                2: [feature-1, feature-3, feature-5, ...],
                ...
                12: [feature-2, feature-4, feature-8, ...]
            }

    train_split_col: str
        The name of the categorical column on which the model will make the splits.
        Ex: if you want to split your training by tenure, you can have a categorical column called "tenure".

    train_split_bins: list
        A list with the actual values of the categories from the `train_split_col`.
        Ex: if you want to split your training by tenure and you have a tenure column with integer values
        [1,2,3,...,12], you can pass this list and your training will be split into 12 different models.

    nthread: int
        Number of threads for the XGBoost learners.

    target_column: str
        The name of the target column.

    prediction_column: str
        The name of the column with the predictions from the model.
""" train_fns = {b: xgb_classification_learner(features=features_by_bin[b], learning_rate=learning_rate_by_bin[b], num_estimators=num_estimators_by_bin[b], target=target_column, extra_params=assoc(extra_params_by_bin[b], 'nthread', nthread), prediction_column=prediction_column + "_bin_" + str(b)) for b in train_split_bins} train_sets = {b: train_set[train_set[train_split_col] == b] for b in train_split_bins} train_results = {b: train_fns[b](train_sets[b]) for b in train_split_bins} # train_results is a 3-tuple (prediction functions, predicted train dataset, train logs) pred_fns = {b: train_results[b][0] for b in train_split_bins} train_logs = {b: train_results[b][2] for b in train_split_bins} def p(df: pd.DataFrame) -> pd.DataFrame: pred_fn = compose(*pred_fns.values()) def lookup(df: pd.DataFrame) -> npt.NDArray: idx, cols = pd.factorize(df.pred_bin.values.squeeze()) output = df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx] return output return (pred_fn(df) .assign(pred_bin=prediction_column + "_bin_" + df[train_split_col].astype(str)) .assign(prediction=lookup) .rename(index=str, columns={"prediction": prediction_column}) .drop("pred_bin", axis=1)) p.__doc__ = learner_pred_fn_docstring("xgb_octopus_classification_learner") log = { 'xgb_octopus_classification_learner': { 'features': features_by_bin, 'target': target_column, 'prediction_column': prediction_column, 'package': "xgboost", 'train_logs': train_logs, 'parameters': extra_params_by_bin, 'training_samples': len(train_set) } } return p, p(train_set), log
xgb_octopus_classification_learner.__doc__ += learner_return_docstring("Octopus XGB Classifier")
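
A minimal usage sketch, not part of the module above and written under assumptions: a toy DataFrame with a hypothetical "tenure" split column, made-up features x1/x2, arbitrary hyperparameters, and xgboost installed. It shows how each *_by_bin dictionary carries one entry per value listed in train_split_bins.

import pandas as pd

from fklearn.training.ensemble import xgb_octopus_classification_learner

# Hypothetical toy data: "tenure" is the categorical split column, x1/x2 are features.
train_df = pd.DataFrame({
    "tenure": [1, 1, 1, 1, 2, 2, 2, 2],
    "x1": [0.1, 0.3, 0.2, 0.9, 0.4, 0.8, 0.7, 0.5],
    "x2": [1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0],
    "target": [0, 1, 0, 1, 1, 0, 1, 0],
})

# One entry per value in train_split_bins; all hyperparameter values are illustrative only.
predict_fn, scored_train, logs = xgb_octopus_classification_learner(
    train_set=train_df,
    learning_rate_by_bin={1: 0.1, 2: 0.08},
    num_estimators_by_bin={1: 20, 2: 30},
    extra_params_by_bin={1: {"max_depth": 2}, 2: {"max_depth": 3}},
    features_by_bin={1: ["x1", "x2"], 2: ["x1"]},
    train_split_col="tenure",
    train_split_bins=[1, 2],
    nthread=1,
    target_column="target",
    prediction_column="prediction",
)

new_scores = predict_fn(train_df)  # each row is scored by the model trained for its own bin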