# Source code for fklearn.tuning.parameter_tuners

from collections import OrderedDict
from itertools import product
from typing import Callable, List

from numpy.random import seed
import pandas as pd
from toolz import curry, partial

from fklearn.validation.validator import parallel_validator, validator
from fklearn.types import EvalFnType, LearnerFnType, LogType, SplitterFnType, ValidatorReturnType

# Signature of the optional hook that the tuners in this module call with each
# validator log (e.g. to persist intermediary results); its return value is ignored.
SaveIntermediaryFnType = Callable[[ValidatorReturnType], None]


@curry
def random_search_tuner(space: LogType,
                        train_set: pd.DataFrame,
                        param_train_fn: Callable[[LogType], LearnerFnType],
                        split_fn: SplitterFnType,
                        eval_fn: EvalFnType,
                        iterations: int,
                        random_seed: int = 1,
                        save_intermediary_fn: SaveIntermediaryFnType = None,
                        n_jobs: int = 1) -> List[ValidatorReturnType]:
    """
    Runs several training functions, each with parameters freshly sampled from the parameter space.

    On every iteration each callable in ``space`` is called once to draw a value,
    the resulting parameter dict is handed to ``param_train_fn`` and the returned
    training function is validated on ``train_set`` with ``split_fn`` and ``eval_fn``.

    Parameters
    ----------
    space : dict
        A dictionary with keys as parameters for the model and values as callables that return a parameter value.
        Each callable must take no arguments; it may always return the same constant value.
        Example::

            space = {
                'learning_rate': lambda: np.random.choice([1e-3, 1e-2, 1e-1, 1, 10]),
                'num_estimators': lambda: np.random.choice([20, 100, 150])
                }

    train_set : pd.DataFrame
        The training set

    param_train_fn : function(space, train_set) ->  p, new_df, train_log
        A curried training function that is only a function of the parameters for the model and the training set.
        Example::

            @curry
            def param_train_fn(space, train_set):
                return xgb_classification_learner(features=["x"],
                                                  target="target",
                                                  learning_rate=space["learning_rate"],
                                                  num_estimators=space["num_estimators"])(train_set)

    split_fn : function(dataset) -> list of folds
        Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
        each tuple contains training indexes while the second array
        contains validation indexes.
        Examples::

            out_of_time_and_space_splitter(n_splits=n_splits,
                                           in_time_limit=in_time_limit,
                                           space_column=space_column,
                                           time_column=time_column)

    eval_fn : function(dataset) -> eval_log
        A base evaluation function that returns a simple evaluation log.
        Can't be a split evaluator or the extractor won't work.
        Example: roc_auc_evaluator(target_column="target")

    iterations : int
        The number of iterations to run the parameter tuner

    random_seed : int
        Seed passed to ``numpy.random.seed`` once, before the first iteration.
        It only makes the search reproducible when the callables in ``space``
        draw from numpy's global random state.

    save_intermediary_fn : function(log) -> save to file
        Partially defined saver function that receives a log result from a
        tuning step and appends it into a file. Skipped when ``None``.
        Example: save_intermediary_result(save_path='tuning.pkl')

    n_jobs : int
        Number of parallel processes to spawn when evaluating a training function.
        The parallel validator is only used when this is greater than 1.

    Returns
    ----------
    tuning_log : list of dict
        A list of tuning logs, one per iteration, each being the log returned by the validator.
    """
    # Only pay for the parallel validator when more than one job is requested.
    validation_fn = partial(parallel_validator, n_jobs=n_jobs) if n_jobs > 1 else validator

    def tune_iteration() -> ValidatorReturnType:
        # Draw one concrete value per parameter for this iteration.
        iter_space = {name: sampler() for name, sampler in space.items()}
        train_fn = param_train_fn(iter_space)
        validator_log = validation_fn(train_data=train_set, split_fn=split_fn, train_fn=train_fn, eval_fn=eval_fn)

        if save_intermediary_fn is not None:
            save_intermediary_fn(validator_log)

        return validator_log

    # Seed numpy's global RNG once so that the whole sequence of draws is repeatable.
    seed(random_seed)

    return [tune_iteration() for _ in range(iterations)]


@curry
def grid_search_cv(space: LogType,
                   train_set: pd.DataFrame,
                   param_train_fn: Callable[[LogType], LearnerFnType],
                   split_fn: SplitterFnType,
                   eval_fn: EvalFnType,
                   save_intermediary_fn: SaveIntermediaryFnType = None,
                   load_intermediary_fn: Callable[[str], List[ValidatorReturnType]] = None,
                   warm_start_file: str = None,
                   n_jobs: int = 1) -> List[ValidatorReturnType]:
    """
    Runs a training function for every combination of values in the parameter grid.

    The grid is the cartesian product of the candidate values of every parameter,
    taken with the parameter names in sorted order. Duplicate combinations are
    evaluated only once and combinations are evaluated in grid order.

    Parameters
    ----------
    space : dict
        A dictionary with keys as parameters for the model and values as callables that
        return the candidate values for that parameter.
        Each callable must take no arguments and return an iterable of hashable values.
        Example::

            space = {
                'learning_rate': lambda: [1e-3, 1e-2, 1e-1, 1, 10],
                'num_estimators': lambda: [20, 100, 150]
                }

    train_set : pd.DataFrame
        The training set

    param_train_fn : function(space, train_set) ->  p, new_df, train_log
        A curried training function that is only a function of the parameters for the model and the training set.
        Example::

            @curry
            def param_train_fn(space, train_set):
                return xgb_classification_learner(features=["x"],
                                                  target="target",
                                                  learning_rate=space["learning_rate"],
                                                  num_estimators=space["num_estimators"])(train_set)

    split_fn : function(dataset) -> list of folds
        Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
        each tuple contains training indexes while the second array
        contains validation indexes.
        Examples::

            out_of_time_and_space_splitter(n_splits=n_splits,
                                           in_time_limit=in_time_limit,
                                           space_column=space_column,
                                           time_column=time_column)

    eval_fn : function(dataset) -> eval_log
        A base evaluation function that returns a simple evaluation log.
        Can't be a split evaluator or the extractor won't work.
        Example: roc_auc_evaluator(target_column="target")

    save_intermediary_fn : function(log) -> save to file
        Partially defined saver function that receives a log result from a
        tuning step and saves it into a file. Skipped when ``None``.
        Example: save_intermediary_result(save_path='tuning.pkl')

    load_intermediary_fn : function(path) -> list of logs
        Partially defined load function that receives a path and loads previous logs
        from this file. Each loaded log must carry the ``'iter_space'`` entry
        written by this function.
        Example: load_intermediary_result('tuning.pkl')

    warm_start_file: str
        File containing intermediary results for grid search. When both this
        and ``load_intermediary_fn`` are given, combinations already found in the
        loaded logs are skipped and only the remaining ones are evaluated.

    n_jobs : int
        Number of parallel processes to spawn when evaluating a training function.
        The parallel validator is only used when this is greater than 1.


    Returns
    ----------
    tuning_log : list of dict
        A list of tuning logs, one per newly evaluated combination. Each is the log
        returned by the validator plus an ``'iter_space'`` entry holding the parameters
        used. Logs loaded from ``warm_start_file`` are not included.
    """

    # Only pay for the parallel validator when more than one job is requested.
    validation_fn = partial(parallel_validator, n_jobs=n_jobs) if n_jobs > 1 else validator

    def tune_iteration(iter_space: LogType) -> ValidatorReturnType:
        train_fn = param_train_fn(iter_space)
        validator_log = validation_fn(train_data=train_set, split_fn=split_fn, train_fn=train_fn, eval_fn=eval_fn)
        # Stored sorted by key so a warm start can rebuild the exact combination tuple.
        validator_log['iter_space'] = OrderedDict(sorted(iter_space.items()))

        if save_intermediary_fn is not None:
            save_intermediary_fn(validator_log)

        return validator_log

    sorted_space_keys = sorted(space.keys())
    params = (space[k]() for k in sorted_space_keys)
    # OrderedDict.fromkeys removes duplicate combinations like a set would, but keeps
    # the grid order, so runs are reproducible regardless of hash randomisation.
    combinations = list(OrderedDict.fromkeys(product(*params)))

    if warm_start_file is not None and load_intermediary_fn is not None:
        results = load_intermediary_fn(warm_start_file)
        computed_combs = {tuple(log['iter_space'].values()) for log in results}  # type: ignore
        combinations = [comb for comb in combinations if comb not in computed_combs]

    return [tune_iteration(dict(zip(sorted_space_keys, comb))) for comb in combinations]