from typing import Callable, Dict, List, Optional
from toolz.curried import pipe, first, mapcat
import pandas as pd
from fklearn.tuning.samplers import remove_features_subsets, remove_by_feature_importance, remove_by_feature_shuffling
from fklearn.tuning.stoppers import stop_by_num_features, stop_by_num_features_parallel, stop_by_iter_num, \
stop_by_no_improvement, stop_by_no_improvement_parallel, aggregate_stop_funcs
from fklearn.validation.validator import parallel_validator
from fklearn.types import EvalFnType, ExtractorFnType, LearnerReturnType, ListLogListType, LogListType, SplitterFnType,\
ValidatorReturnType, LogType
SaveIntermediaryFnType = Callable[[List[ValidatorReturnType]], None]
TuningLearnerFnType = Callable[[pd.DataFrame, List[str]], LearnerReturnType]
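# Note (illustrative, not part of the library API): a TuningLearnerFnType is a
# learner with everything but the data and the feature list already fixed,
# e.g. an fklearn learner with its target column bound:
#
#     tuning_learner = lambda df, feats: linear_regression_learner(df, features=feats, target="y")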
def feature_importance_backward_selection(train_data: pd.DataFrame,
param_train_fn: TuningLearnerFnType,
features: List[str],
split_fn: SplitterFnType,
eval_fn: EvalFnType,
extractor: ExtractorFnType,
metric_name: str,
num_removed_by_step: int = 5,
threshold: float = 0.005,
early_stop: int = 2,
iter_limit: int = 50,
min_remaining_features: int = 50,
                                          save_intermediary_fn: Optional[SaveIntermediaryFnType] = None,
n_jobs: int = 1) -> ListLogListType:
"""
    Performs train-evaluation iterations while removing the least important
    features at each step, selecting features by backward elimination

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data
param_train_fn : function (DataFrame, List of Strings) -> prediction_function, predictions_dataset, logs
A partially defined learning function that takes a training set and a feature list and
returns a predict function, a dataset with training predictions and training
logs.
features: list of str
Elements must be columns of the train_data
split_fn : function pandas.DataFrame -> list of tuple
Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
each tuple contains training indexes while the second array
contains validation indexes.
eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with predictions and
returns the evaluation logs.
    extractor: function str -> float
        An extractor that takes a string and returns the corresponding value from a log dict
metric_name: str
        String with the name of the metric to be extracted from the evaluation logs
num_removed_by_step: int (default 5)
Number of features removed at each iteration
threshold: float (default 0.005)
Threshold for model performance comparison
early_stop: int (default 2)
Number of rounds without improvement before stopping process
iter_limit: int (default 50)
Maximum number of iterations before stopping
min_remaining_features: int (default 50)
        Minimum number of features that should remain in the model;
        combining num_removed_by_step and iter_limit can accomplish the same
        functionality as this parameter.
save_intermediary_fn : function(log) -> save to file
Partially defined saver function that receives a log result from a
tuning step and appends it into a file
Example: save_intermediary_result(save_path='tuning.pkl')
n_jobs : int
Number of parallel processes to spawn.

    Returns
    ----------
    logs: list of list of dict
        A list of log-like lists of evaluation dictionaries. Each element of the
        list is one validation step of the algorithm.
"""
selector_fn = remove_by_feature_importance(num_removed_by_step=num_removed_by_step)
stop_fn = aggregate_stop_funcs(
stop_by_no_improvement(extractor=extractor, metric_name=metric_name, early_stop=early_stop,
threshold=threshold),
stop_by_iter_num(iter_limit=iter_limit),
stop_by_num_features(min_num_features=min_remaining_features))
train_fn = lambda df: param_train_fn(df, features)
first_logs = parallel_validator(train_data, split_fn, train_fn, eval_fn, n_jobs=n_jobs)
logs = [first_logs]
while not stop_fn(logs):
curr_log = first(logs)
new_features = selector_fn(curr_log)
new_train_fn = lambda df: param_train_fn(df, new_features)
next_log = parallel_validator(train_data, split_fn, new_train_fn, eval_fn, n_jobs=n_jobs)
if save_intermediary_fn is not None:
save_intermediary_fn(next_log)
logs = [next_log] + logs
return logs
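

# A minimal usage sketch (illustrative only, not part of the library). It
# assumes a learner whose training logs expose feature importances, such as
# fklearn's xgboost learners; the column names ("x1".."x5", "target") are
# placeholders for your own data.
def _example_feature_importance_backward_selection(train_df: pd.DataFrame) -> ListLogListType:
    from fklearn.training.classification import xgb_classification_learner
    from fklearn.validation.splitters import k_fold_splitter
    from fklearn.validation.evaluators import roc_auc_evaluator
    from fklearn.metrics.pd_extractors import evaluator_extractor

    # A TuningLearnerFnType: everything but the data and the feature list is fixed
    def train_fn(df: pd.DataFrame, feats: List[str]) -> LearnerReturnType:
        return xgb_classification_learner(df, features=feats, target="target")

    return feature_importance_backward_selection(
        train_data=train_df,
        param_train_fn=train_fn,
        features=["x1", "x2", "x3", "x4", "x5"],
        split_fn=k_fold_splitter(n_splits=3, random_state=42),
        eval_fn=roc_auc_evaluator,  # default eval name: "roc_auc_evaluator__target"
        extractor=evaluator_extractor(evaluator_name="roc_auc_evaluator__target"),
        metric_name="roc_auc_evaluator__target",
        num_removed_by_step=1,
        min_remaining_features=2)
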
def poor_man_boruta_selection(train_data: pd.DataFrame,
test_data: pd.DataFrame,
param_train_fn: TuningLearnerFnType,
features: List[str],
eval_fn: EvalFnType,
extractor: ExtractorFnType,
metric_name: str,
max_removed_by_step: int = 5,
threshold: float = 0.005,
early_stop: int = 2,
iter_limit: int = 50,
min_remaining_features: int = 50,
                              save_intermediary_fn: Optional[Callable[[LogType], None]] = None,
speed_up_by_importance: bool = False,
parallel: bool = False,
nthread: int = 1,
seed: int = 7) -> LogListType:
"""
    Performs train-evaluation iterations while shuffling the used features
    to compute statistics about feature relevance

    Parameters
----------
train_data : pandas.DataFrame
A Pandas' DataFrame with training data
test_data : pandas.DataFrame
A Pandas' DataFrame with test data
param_train_fn : function (pandas.DataFrame, list of str) -> prediction_function, predictions_dataset, logs
A partially defined AND curried learning function that takes a training set and a feature list and
returns a predict function, a dataset with training predictions and training
logs.
features: list of str
Elements must be columns of the train_data
eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with predictions and
returns the evaluation logs.
    extractor: function str -> float
        An extractor that takes a string and returns the corresponding value from a log dict
metric_name: str
        String with the name of the metric to be extracted from the evaluation logs
    max_removed_by_step: int (default 5)
        The maximum number of features to remove at each step. Only the
        max_removed_by_step least important features are considered. If
        speed_up_by_importance=True it will first filter the least relevant
        features and shuffle only those. If speed_up_by_importance=False it
        will shuffle all features and drop the max_removed_by_step with the
        lowest permutation importance (PIMP). In both cases, features are only
        removed if the drop in performance is within the defined threshold.
threshold: float (default 0.005)
Threshold for model performance comparison
early_stop: int (default 2)
Number of rounds without improvement before stopping process
iter_limit: int (default 50)
Maximum number of iterations before stopping
min_remaining_features: int (default 50)
        Minimum number of features that should remain in the model;
        combining max_removed_by_step and iter_limit can accomplish the same
        functionality as this parameter.
save_intermediary_fn: function(log) -> save to file
Partially defined saver function that receives a log result from a
tuning step and appends it into a file
Example: save_intermediary_result(save_path='tuning.pkl')
    speed_up_by_importance: bool (default False)
        Whether to narrow the search by looking at feature importance first,
        before computing permutation importance. If True, only the
        max_removed_by_step least important features are shuffled.
parallel: bool (default False)
Run shuffling and prediction in parallel. Only applies if speed_up_by_importance=False
nthread: int (default 1)
        Number of threads to run predictions. Only applies if speed_up_by_importance=False
    seed: int (default 7)
        Random state for consistency.

    Returns
    ----------
    logs: list of dict
        A list of log-like evaluation dictionaries. Each element of the
        list is one validation step of the algorithm.
"""
selector_fn = remove_by_feature_shuffling(eval_fn=eval_fn,
eval_data=test_data,
extractor=extractor,
metric_name=metric_name,
max_removed_by_step=max_removed_by_step,
threshold=threshold,
speed_up_by_importance=speed_up_by_importance,
parallel=parallel,
nthread=nthread,
seed=seed)
stop_fn = aggregate_stop_funcs(
stop_by_no_improvement(extractor=extractor, metric_name=metric_name, early_stop=early_stop,
threshold=threshold),
stop_by_iter_num(iter_limit=iter_limit),
stop_by_num_features(min_num_features=min_remaining_features)
)
predict_fn_first, _, train_logs = param_train_fn(train_data, features)
eval_logs = eval_fn(predict_fn_first(test_data))
first_logs = {
'train_log': train_logs,
'validator_log': [
{
'fold_num': 0,
'split_log': {
'train_size': train_data.shape[0],
'test_size': test_data.shape[0]
},
'eval_results': [eval_logs]
}
]
}
logs = [first_logs]
predict_fn = predict_fn_first
while not stop_fn(logs): # type: ignore
next_features = pipe(logs, first, selector_fn(predict_fn=predict_fn))
if len(next_features) == 0:
break
next_predict_fn, _, next_train_logs = param_train_fn(train_data, next_features)
eval_logs = pipe(test_data, next_predict_fn, eval_fn)
next_log = {'train_log': next_train_logs, 'validator_log': [
{'fold_num': 0, 'split_log': {'train_size': train_data.shape[0], 'test_size': test_data.shape[0]},
'eval_results': [eval_logs]}]}
logs = [next_log] + logs
if save_intermediary_fn is not None:
save_intermediary_fn(next_log)
predict_fn = next_predict_fn
return logs
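

# A minimal usage sketch (illustrative only, not part of the library). It
# assumes pre-split train/test frames and an xgboost learner; the column
# names are placeholders for your own data.
def _example_poor_man_boruta_selection(train_df: pd.DataFrame,
                                       test_df: pd.DataFrame) -> LogListType:
    from fklearn.training.classification import xgb_classification_learner
    from fklearn.validation.evaluators import roc_auc_evaluator
    from fklearn.metrics.pd_extractors import evaluator_extractor

    def train_fn(df: pd.DataFrame, feats: List[str]) -> LearnerReturnType:
        return xgb_classification_learner(df, features=feats, target="target")

    return poor_man_boruta_selection(
        train_data=train_df,
        test_data=test_df,
        param_train_fn=train_fn,
        features=["x1", "x2", "x3", "x4", "x5"],
        eval_fn=roc_auc_evaluator,
        extractor=evaluator_extractor(evaluator_name="roc_auc_evaluator__target"),
        metric_name="roc_auc_evaluator__target",
        max_removed_by_step=2,
        min_remaining_features=2)
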
def backward_subset_feature_selection(train_data: pd.DataFrame,
param_train_fn: TuningLearnerFnType,
features_sets: Dict[str, List[str]],
split_fn: SplitterFnType,
eval_fn: EvalFnType,
extractor: ExtractorFnType,
metric_name: str,
threshold: float = 0.005,
num_removed_by_step: int = 3,
early_stop: int = 2,
iter_limit: int = 50,
min_remaining_features: int = 50,
                                      save_intermediary_fn: Optional[SaveIntermediaryFnType] = None,
n_jobs: int = 1) -> ListLogListType:
"""
    Performs train-evaluation iterations while testing subsets of features
    to compute statistics about the importance of each feature category

    Parameters
----------
train_data : pandas.DataFrame
A Pandas' DataFrame with training data
param_train_fn : function (pandas.DataFrame, list of str) -> prediction_function, predictions_dataset, logs
A partially defined learning function that takes a training set and a feature list and
returns a predict function, a dataset with training predictions and training
logs.
    features_sets: dict of str -> list of str
        Each key in the dict maps to a list of columns from the dataset; the function
        will analyse the influence of each group of features on the model's performance
split_fn : function pandas.DataFrame -> list of tuple
Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
each tuple contains training indexes while the second array
contains validation indexes.
eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with predictions and
returns the evaluation logs.
    extractor: function str -> float
        An extractor that takes a string and returns the corresponding value from a log dict
metric_name: str
        String with the name of the metric to be extracted from the evaluation logs
    threshold: float (default 0.005)
        Threshold for model performance comparison
    num_removed_by_step: int (default 3)
        Number of feature subsets removed at each iteration
early_stop: int (default 2)
Number of rounds without improvement before stopping process
iter_limit: int (default 50)
Maximum number of iterations before stopping
min_remaining_features: int (default 50)
        Minimum number of features that should remain in the model;
        combining num_removed_by_step and iter_limit can accomplish the same
        functionality as this parameter.
save_intermediary_fn : function(log) -> save to file
Partially defined saver function that receives a log result from a
tuning step and appends it into a file
Example: save_intermediary_result(save_path='tuning.pkl')
n_jobs : int
Number of parallel processes to spawn.

    Returns
    ----------
    logs: list of list of dict
        A list of log-like lists of evaluation dictionaries. Each element of the
        list is one validation step of the algorithm.
"""
selector_fn = remove_features_subsets(extractor=extractor,
metric_name=metric_name,
num_removed_by_step=num_removed_by_step)
stop_fn = aggregate_stop_funcs(
stop_by_no_improvement_parallel(extractor=extractor, metric_name=metric_name, early_stop=early_stop,
threshold=threshold),
stop_by_iter_num(iter_limit=iter_limit),
stop_by_num_features_parallel(extractor=extractor, metric_name=metric_name,
min_num_features=min_remaining_features)
)
    used_subsets = [list(features_sets.keys())]
    used_features = [list(mapcat(lambda key: features_sets[key], subset)) for subset in used_subsets]
    # bind feat as a default argument so each trainer keeps its own feature list
    # (a bare closure would late-bind to the last value of the loop variable)
    trainers = [lambda df, feat=feat: param_train_fn(df, feat) for feat in used_features]
first_val_logs = [parallel_validator(train_data, split_fn, train_func, eval_fn, n_jobs) for train_func in trainers]
logs = [[dict(log, **{"used_subsets": list(subset)}) for log, subset in zip(first_val_logs, used_subsets)]]
while not stop_fn(logs):
curr_log = first(logs)
new_subsets = selector_fn(curr_log)
new_features = [list(mapcat(lambda key: features_sets[key], subset)) for subset in new_subsets]
        # bind feat as a default argument to avoid late-binding the loop variable
        trainers = [lambda df, feat=feat: param_train_fn(df, feat) for feat in new_features]
val_logs = [parallel_validator(train_data, split_fn, train_func, eval_fn, n_jobs) for train_func in trainers]
new_logs = [dict(log, **{"used_subsets": subset}) for log, subset in zip(val_logs, new_subsets)]
if save_intermediary_fn is not None:
save_intermediary_fn(new_logs)
logs = [new_logs] + logs
return logs
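

# A minimal usage sketch (illustrative only, not part of the library). The
# feature-set names and columns are placeholders; whole groups of features
# are dropped together at each step.
def _example_backward_subset_feature_selection(train_df: pd.DataFrame) -> ListLogListType:
    from fklearn.training.classification import xgb_classification_learner
    from fklearn.validation.splitters import k_fold_splitter
    from fklearn.validation.evaluators import roc_auc_evaluator
    from fklearn.metrics.pd_extractors import evaluator_extractor

    def train_fn(df: pd.DataFrame, feats: List[str]) -> LearnerReturnType:
        return xgb_classification_learner(df, features=feats, target="target")

    feature_sets = {"demographics": ["age", "region"],
                    "behaviour": ["visits", "purchases"],
                    "pricing": ["price", "discount"]}

    return backward_subset_feature_selection(
        train_data=train_df,
        param_train_fn=train_fn,
        features_sets=feature_sets,
        split_fn=k_fold_splitter(n_splits=3, random_state=42),
        eval_fn=roc_auc_evaluator,
        extractor=evaluator_extractor(evaluator_name="roc_auc_evaluator__target"),
        metric_name="roc_auc_evaluator__target",
        num_removed_by_step=1,
        min_remaining_features=2)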