Source code for fklearn.tuning.samplers

import gc
from itertools import combinations
from typing import List, Tuple

import pandas as pd
from joblib import Parallel, delayed
from numpy import random
from toolz.curried import curry, first, compose, valfilter, sorted, pipe, take

from fklearn.tuning.utils import order_feature_importance_avg_from_logs, get_best_performing_log, gen_dict_extract, \
    get_avg_metric_from_extractor, get_used_features, gen_validator_log
from fklearn.types import EvalFnType, ExtractorFnType, LogListType, LogType, PredictFnType


@curry
def remove_by_feature_importance(log: LogType,
                                 num_removed_by_step: int = 5) -> List[str]:
    """
    Performs feature selection based on feature importance

    Parameters
    ----------
    log : dict
        A log-like dictionary of evaluations.

    num_removed_by_step: int (default 5)
        The number of features to remove

    Returns
    ----------
    features: list of str
        The remaining features after removing the num_removed_by_step least
        important ones
    """
    return order_feature_importance_avg_from_logs(log)[:-num_removed_by_step]
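
# Illustrative usage (editor's sketch, not part of the module): because the
# sampler is @curry-decorated, it can be partially applied with its
# hyperparameters and called later on a training log. `train_log` below is a
# placeholder for a log produced by an fklearn training/validation run.
#
#   remove_fn = remove_by_feature_importance(num_removed_by_step=3)
#   remaining_features = remove_fn(train_log)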

@curry
def remove_features_subsets(log_list: LogListType,
                            extractor: ExtractorFnType,
                            metric_name: str,
                            num_removed_by_step: int = 1) -> List[Tuple[str, ...]]:
    """
    Performs feature selection based on the best performing model out of
    several trained models

    Parameters
    ----------
    log_list : list of dict
        A list of log-like dictionaries of evaluations.

    extractor: function str -> float
        An extractor that takes a string and returns the value of that string on a dict

    metric_name: str
        String with the name of the column that refers to the metric column to be extracted

    num_removed_by_step: int (default 1)
        The number of features to remove

    Returns
    ----------
    keys: list of tuple of str
        The candidate feature subsets obtained by dropping num_removed_by_step
        features from the current best subset
    """
    best_log = get_best_performing_log(log_list, extractor, metric_name)
    best_subset: List[str] = first(gen_dict_extract('used_subsets', best_log))

    return list(combinations(best_subset, len(best_subset) - num_removed_by_step))
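
# Illustrative note (editor's sketch, not part of the module): the
# combinations step enumerates every way of dropping num_removed_by_step
# features from the current best subset. With a best subset of three features
# and the default num_removed_by_step=1:
#
#   from itertools import combinations
#   list(combinations(("a", "b", "c"), 2))
#   # -> [('a', 'b'), ('a', 'c'), ('b', 'c')]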

@curry
def remove_by_feature_shuffling(log: LogType,
                                predict_fn: PredictFnType,
                                eval_fn: EvalFnType,
                                eval_data: pd.DataFrame,
                                extractor: ExtractorFnType,
                                metric_name: str,
                                max_removed_by_step: int = 50,
                                threshold: float = 0.005,
                                speed_up_by_importance: bool = False,
                                parallel: bool = False,
                                nthread: int = 1,
                                seed: int = 7) -> List[str]:
    """
    Performs feature selection by comparing the evaluation on the test set
    with the evaluation on the test set with randomly shuffled features

    Parameters
    ----------
    log : LogType
        A log-like dictionary of evaluations.

    predict_fn: function pandas.DataFrame -> pandas.DataFrame
        A partially defined predictor that takes a DataFrame and returns the
        predicted score for this dataframe

    eval_fn : function DataFrame -> log dict
        A partially defined evaluation function that takes a dataset with predictions
        and returns the evaluation logs.

    eval_data: pandas.DataFrame
        Data used to evaluate the model after shuffling

    extractor: function str -> float
        An extractor that takes a string and returns the value of that string on a dict

    metric_name: str
        String with the name of the column that refers to the metric column to be extracted

    max_removed_by_step: int (default 50)
        The maximum number of features to remove. Only the max_removed_by_step least
        important features are considered. If speed_up_by_importance=True it will first
        filter out the least relevant features and shuffle only those. If
        speed_up_by_importance=False it will shuffle all features and drop the last
        max_removed_by_step in terms of PIMP. In both cases, a feature is only removed
        if its drop in performance is below the defined threshold.

    threshold: float (default 0.005)
        Threshold for model performance comparison

    speed_up_by_importance: bool (default False)
        If it should narrow the search by looking at feature importance first, before
        computing permutation importance (PIMP). If True, only the max_removed_by_step
        least important features are shuffled.

    parallel: bool (default False)
        If True, shuffles and evaluates features in parallel threads

    nthread: int (default 1)
        Number of threads to use when parallel=True

    seed: int (default 7)
        Random seed

    Returns
    ----------
    features: list of str
        The features to remove: those whose shuffling degrades performance by
        less than the defined threshold
    """
    random.seed(seed)

    curr_metric = get_avg_metric_from_extractor(log, extractor, metric_name)
    eval_size = eval_data.shape[0]

    features_to_shuffle = order_feature_importance_avg_from_logs(log)[-max_removed_by_step:] \
        if speed_up_by_importance else get_used_features(log)

    def shuffle(feature: str) -> pd.DataFrame:
        # .values breaks pandas index alignment; assigning the sampled Series
        # directly would re-align it by index and silently undo the shuffle
        return eval_data.assign(**{feature: eval_data[feature].sample(frac=1.0).values})

    # shuffle one feature -> predict -> evaluate -> extract the metric,
    # then take the difference from the unshuffled baseline
    feature_to_delta_metric = compose(lambda m: curr_metric - m,
                                      get_avg_metric_from_extractor(extractor=extractor, metric_name=metric_name),
                                      gen_validator_log(fold_num=0, test_size=eval_size),
                                      eval_fn, predict_fn, shuffle)

    if parallel:
        metrics = Parallel(n_jobs=nthread, backend="threading")(
            delayed(feature_to_delta_metric)(feature) for feature in features_to_shuffle)
        feature_to_delta_metric = dict(zip(features_to_shuffle, metrics))
        gc.collect()

    else:
        feature_to_delta_metric = {feature: feature_to_delta_metric(feature)
                                   for feature in features_to_shuffle}

    return pipe(feature_to_delta_metric,
                valfilter(lambda delta_metric: delta_metric < threshold),
                sorted(key=lambda f: feature_to_delta_metric.get(f)),
                take(max_removed_by_step),
                list)
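
# Illustrative note (editor's sketch, not part of the module): the permutation
# test above hinges on shuffling a single column while leaving the rest of the
# frame intact, then measuring how far the metric drops. The toy frame below
# is made up for illustration; note the .values cast, without which pandas
# would re-align the sampled Series by index and undo the shuffle.
#
#   import pandas as pd
#   df = pd.DataFrame({"x1": [1, 2, 3, 4], "x2": [1, 0, 1, 0]})
#   shuffled = df.assign(x1=df["x1"].sample(frac=1.0).values)
#   # x1 is now permuted against x2; a feature whose shuffling moves the
#   # metric by less than `threshold` is returned as removable.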