Source code for fklearn.tuning.selectors

from typing import Callable, Dict, List, Optional

from toolz.curried import pipe, first, mapcat
import pandas as pd

from fklearn.tuning.samplers import remove_features_subsets, remove_by_feature_importance, remove_by_feature_shuffling
from fklearn.tuning.stoppers import stop_by_num_features, stop_by_num_features_parallel, stop_by_iter_num, \
    stop_by_no_improvement, stop_by_no_improvement_parallel, aggregate_stop_funcs
from fklearn.validation.validator import parallel_validator
from fklearn.types import EvalFnType, ExtractorFnType, LearnerReturnType, ListLogListType, LogListType, SplitterFnType,\
    ValidatorReturnType, LogType

SaveIntermediaryFnType = Callable[[List[ValidatorReturnType]], None]
TuningLearnerFnType = Callable[[pd.DataFrame, List[str]], LearnerReturnType]
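
# A minimal sketch of a function satisfying TuningLearnerFnType, assuming fklearn's
# curried xgb_classification_learner and a hypothetical "target" column; any learner
# with the (DataFrame, feature list) -> (predict_fn, scored_df, logs) shape works:
#
#     from fklearn.training.classification import xgb_classification_learner
#
#     def my_param_train_fn(df, feats):
#         return xgb_classification_learner(features=feats, target="target")(df)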


def feature_importance_backward_selection(train_data: pd.DataFrame,
                                          param_train_fn: TuningLearnerFnType,
                                          features: List[str],
                                          split_fn: SplitterFnType,
                                          eval_fn: EvalFnType,
                                          extractor: ExtractorFnType,
                                          metric_name: str,
                                          num_removed_by_step: int = 5,
                                          threshold: float = 0.005,
                                          early_stop: int = 2,
                                          iter_limit: int = 50,
                                          min_remaining_features: int = 50,
                                          save_intermediary_fn: Optional[SaveIntermediaryFnType] = None,
                                          n_jobs: int = 1) -> ListLogListType:
    """
    Performs train-evaluation iterations while removing the least important features
    at each step, to compute statistics about feature relevance.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data

    param_train_fn : function (DataFrame, list of str) -> prediction_function, predictions_dataset, logs
        A partially defined learning function that takes a training set and a feature list and
        returns a predict function, a dataset with training predictions and training logs.

    features : list of str
        Elements must be columns of the train_data

    split_fn : function pandas.DataFrame -> list of tuple
        Partially defined split function that takes a dataset and returns a list of folds.
        Each fold is a tuple of arrays. The first array in each tuple contains training indexes
        while the second array contains validation indexes.

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with predictions and
        returns the evaluation logs.

    extractor : function str -> float
        An extractor that takes a string and returns the value of that string on a dict

    metric_name : str
        String with the name of the column that refers to the metric column to be extracted

    num_removed_by_step : int (default 5)
        Number of features removed at each iteration

    threshold : float (default 0.005)
        Threshold for model performance comparison

    early_stop : int (default 2)
        Number of rounds without improvement before stopping the process

    iter_limit : int (default 50)
        Maximum number of iterations before stopping

    min_remaining_features : int (default 50)
        Minimum number of features that should remain in the model.
        Combining num_removed_by_step and iter_limit accomplishes the same functionality
        as this parameter.

    save_intermediary_fn : function(log) -> save to file
        Partially defined saver function that receives a log result from a tuning step and
        appends it into a file. Example: save_intermediary_result(save_path='tuning.pkl')

    n_jobs : int
        Number of parallel processes to spawn.

    Returns
    -------
    logs : list of list of dict
        A list of log-like lists of dictionaries with evaluations. Each element of the list
        is a validation step of the algorithm.
    """

    selector_fn = remove_by_feature_importance(num_removed_by_step=num_removed_by_step)

    stop_fn = aggregate_stop_funcs(
        stop_by_no_improvement(extractor=extractor, metric_name=metric_name, early_stop=early_stop,
                               threshold=threshold),
        stop_by_iter_num(iter_limit=iter_limit),
        stop_by_num_features(min_num_features=min_remaining_features))

    train_fn = lambda df: param_train_fn(df, features)
    first_logs = parallel_validator(train_data, split_fn, train_fn, eval_fn, n_jobs=n_jobs)

    logs = [first_logs]
    while not stop_fn(logs):
        curr_log = first(logs)

        new_features = selector_fn(curr_log)
        new_train_fn = lambda df: param_train_fn(df, new_features)
        next_log = parallel_validator(train_data, split_fn, new_train_fn, eval_fn, n_jobs=n_jobs)

        if save_intermediary_fn is not None:
            save_intermediary_fn(next_log)

        logs = [next_log] + logs

    return logs


def poor_man_boruta_selection(train_data: pd.DataFrame,
                              test_data: pd.DataFrame,
                              param_train_fn: TuningLearnerFnType,
                              features: List[str],
                              eval_fn: EvalFnType,
                              extractor: ExtractorFnType,
                              metric_name: str,
                              max_removed_by_step: int = 5,
                              threshold: float = 0.005,
                              early_stop: int = 2,
                              iter_limit: int = 50,
                              min_remaining_features: int = 50,
                              save_intermediary_fn: Optional[Callable[[LogType], None]] = None,
                              speed_up_by_importance: bool = False,
                              parallel: bool = False,
                              nthread: int = 1,
                              seed: int = 7) -> LogListType:
    """
    Performs train-evaluation iterations while shuffling the used features to compute
    statistics about feature relevance.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data

    test_data : pandas.DataFrame
        A Pandas' DataFrame with test data

    param_train_fn : function (pandas.DataFrame, list of str) -> prediction_function, predictions_dataset, logs
        A partially defined AND curried learning function that takes a training set and a
        feature list and returns a predict function, a dataset with training predictions
        and training logs.

    features : list of str
        Elements must be columns of the train_data

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with predictions and
        returns the evaluation logs.

    extractor : function str -> float
        An extractor that takes a string and returns the value of that string on a dict

    metric_name : str
        String with the name of the column that refers to the metric column to be extracted

    max_removed_by_step : int (default 5)
        The maximum number of features to remove at each iteration. Only the
        max_removed_by_step least important features are considered. If
        speed_up_by_importance=True, the least relevant features are filtered first and
        only those are shuffled. If speed_up_by_importance=False, all features are shuffled
        and the last max_removed_by_step in terms of PIMP (permutation importance) are
        dropped, i.e. the features whose shuffling decreases the metric the least.
        In both cases, features are only removed if the drop in performance is within the
        defined threshold.

    threshold : float (default 0.005)
        Threshold for model performance comparison

    early_stop : int (default 2)
        Number of rounds without improvement before stopping the process

    iter_limit : int (default 50)
        Maximum number of iterations before stopping

    min_remaining_features : int (default 50)
        Minimum number of features that should remain in the model.
        Combining max_removed_by_step and iter_limit accomplishes the same functionality
        as this parameter.

    save_intermediary_fn : function(log) -> save to file
        Partially defined saver function that receives a log result from a tuning step and
        appends it into a file. Example: save_intermediary_result(save_path='tuning.pkl')

    speed_up_by_importance : bool (default False)
        Whether to narrow the search by looking at feature importance first, before
        computing PIMP importance. If True, only the max_removed_by_step least important
        features are shuffled.

    parallel : bool (default False)
        Run shuffling and prediction in parallel. Only applies if speed_up_by_importance=False

    nthread : int (default 1)
        Number of threads to run predictions. Only applies if speed_up_by_importance=False

    seed : int (default 7)
        Random seed for consistency.

    Returns
    -------
    logs : list of dict
        A list of log-like dictionaries with evaluations. Each element of the list is a
        validation step of the algorithm.
    """

    selector_fn = remove_by_feature_shuffling(eval_fn=eval_fn,
                                              eval_data=test_data,
                                              extractor=extractor,
                                              metric_name=metric_name,
                                              max_removed_by_step=max_removed_by_step,
                                              threshold=threshold,
                                              speed_up_by_importance=speed_up_by_importance,
                                              parallel=parallel,
                                              nthread=nthread,
                                              seed=seed)

    stop_fn = aggregate_stop_funcs(
        stop_by_no_improvement(extractor=extractor, metric_name=metric_name, early_stop=early_stop,
                               threshold=threshold),
        stop_by_iter_num(iter_limit=iter_limit),
        stop_by_num_features(min_num_features=min_remaining_features)
    )

    predict_fn_first, _, train_logs = param_train_fn(train_data, features)
    eval_logs = eval_fn(predict_fn_first(test_data))

    first_logs = {
        'train_log': train_logs,
        'validator_log': [
            {
                'fold_num': 0,
                'split_log': {
                    'train_size': train_data.shape[0],
                    'test_size': test_data.shape[0]
                },
                'eval_results': [eval_logs]
            }
        ]
    }

    logs = [first_logs]
    predict_fn = predict_fn_first

    while not stop_fn(logs):  # type: ignore
        next_features = pipe(logs, first, selector_fn(predict_fn=predict_fn))

        if len(next_features) == 0:
            break

        next_predict_fn, _, next_train_logs = param_train_fn(train_data, next_features)

        eval_logs = pipe(test_data, next_predict_fn, eval_fn)

        next_log = {'train_log': next_train_logs,
                    'validator_log': [
                        {'fold_num': 0,
                         'split_log': {'train_size': train_data.shape[0],
                                       'test_size': test_data.shape[0]},
                         'eval_results': [eval_logs]}]}

        logs = [next_log] + logs

        if save_intermediary_fn is not None:
            save_intermediary_fn(next_log)

        predict_fn = next_predict_fn

    return logs


def backward_subset_feature_selection(train_data: pd.DataFrame,
                                      param_train_fn: TuningLearnerFnType,
                                      features_sets: Dict[str, List[str]],
                                      split_fn: SplitterFnType,
                                      eval_fn: EvalFnType,
                                      extractor: ExtractorFnType,
                                      metric_name: str,
                                      threshold: float = 0.005,
                                      num_removed_by_step: int = 3,
                                      early_stop: int = 2,
                                      iter_limit: int = 50,
                                      min_remaining_features: int = 50,
                                      save_intermediary_fn: Optional[SaveIntermediaryFnType] = None,
                                      n_jobs: int = 1) -> ListLogListType:
    """
    Performs train-evaluation iterations while testing subsets of features to compute
    statistics about the importance of each feature category.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data

    param_train_fn : function (pandas.DataFrame, list of str) -> prediction_function, predictions_dataset, logs
        A partially defined learning function that takes a training set and a feature list and
        returns a predict function, a dataset with training predictions and training logs.

    features_sets : dict of str -> list
        Each string key on the dict maps to a subset of columns from the dataset. The function
        will analyse the influence of each group of features on the model performance.

    split_fn : function pandas.DataFrame -> list of tuple
        Partially defined split function that takes a dataset and returns a list of folds.
        Each fold is a tuple of arrays. The first array in each tuple contains training indexes
        while the second array contains validation indexes.

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with predictions and
        returns the evaluation logs.

    extractor : function str -> float
        An extractor that takes a string and returns the value of that string on a dict

    metric_name : str
        String with the name of the column that refers to the metric column to be extracted

    num_removed_by_step : int (default 3)
        Number of feature subsets removed at each iteration

    threshold : float (default 0.005)
        Threshold for model performance comparison

    early_stop : int (default 2)
        Number of rounds without improvement before stopping the process

    iter_limit : int (default 50)
        Maximum number of iterations before stopping

    min_remaining_features : int (default 50)
        Minimum number of features that should remain in the model.
        Combining num_removed_by_step and iter_limit accomplishes the same functionality
        as this parameter.

    save_intermediary_fn : function(log) -> save to file
        Partially defined saver function that receives a log result from a tuning step and
        appends it into a file. Example: save_intermediary_result(save_path='tuning.pkl')

    n_jobs : int
        Number of parallel processes to spawn.

    Returns
    -------
    logs : list of list of dict
        A list of log-like lists of dictionaries with evaluations. Each element of the list
        is a validation step of the algorithm.
    """

    selector_fn = remove_features_subsets(extractor=extractor,
                                          metric_name=metric_name,
                                          num_removed_by_step=num_removed_by_step)

    stop_fn = aggregate_stop_funcs(
        stop_by_no_improvement_parallel(extractor=extractor, metric_name=metric_name,
                                        early_stop=early_stop, threshold=threshold),
        stop_by_iter_num(iter_limit=iter_limit),
        stop_by_num_features_parallel(extractor=extractor, metric_name=metric_name,
                                      min_num_features=min_remaining_features)
    )

    used_subsets = [list(features_sets.keys())]
    used_features = [list(mapcat(lambda key: features_sets[key], subset)) for subset in used_subsets]

    # Bind `feat` as a default argument: a plain closure would be subject to Python's
    # late-binding pitfall, making every trainer use the last feature list.
    trainers = [lambda df, feat=feat: param_train_fn(df, feat) for feat in used_features]

    first_val_logs = [parallel_validator(train_data, split_fn, train_func, eval_fn, n_jobs)
                      for train_func in trainers]
    logs = [[dict(log, **{"used_subsets": list(subset)})
             for log, subset in zip(first_val_logs, used_subsets)]]

    while not stop_fn(logs):
        curr_log = first(logs)

        new_subsets = selector_fn(curr_log)
        new_features = [list(mapcat(lambda key: features_sets[key], subset)) for subset in new_subsets]

        trainers = [lambda df, feat=feat: param_train_fn(df, feat) for feat in new_features]

        val_logs = [parallel_validator(train_data, split_fn, train_func, eval_fn, n_jobs)
                    for train_func in trainers]
        new_logs = [dict(log, **{"used_subsets": subset}) for log, subset in zip(val_logs, new_subsets)]

        if save_intermediary_fn is not None:
            save_intermediary_fn(new_logs)

        logs = [new_logs] + logs

    return logs