Source code for fklearn.validation.evaluators

from typing import Any, Callable, Iterable, List

import toolz as fp
from toolz import curry
import pandas as pd
import numpy as np
from pandas.util import hash_pandas_object
from sklearn.metrics import roc_auc_score, r2_score, mean_squared_error, log_loss, precision_score, recall_score, \
    fbeta_score, brier_score_loss, mean_absolute_error

from fklearn.types import EvalFnType, EvalReturnType, PredictFnType, UncurriedEvalFnType


def generic_sklearn_evaluator(name_prefix: str, sklearn_metric: Callable[..., float]) -> UncurriedEvalFnType:
    """
    Returns an evaluator built from a metric from sklearn.metrics

    Parameters
    ----------
    name_prefix: str
        The default name of the evaluator will be name_prefix + target_column.

    sklearn_metric: Callable
        Metric function from sklearn.metrics. It should take as parameters y_true, y_score, kwargs.

    Returns
    -------
    eval_fn: Callable
        An evaluator function that uses the provided metric
    """

    def p(test_data: pd.DataFrame,
          prediction_column: str = "prediction",
          target_column: str = "target",
          eval_name: str = None,
          **kwargs: Any) -> EvalReturnType:
        try:
            score = sklearn_metric(test_data[target_column], test_data[prediction_column], **kwargs)
        except ValueError:
            # this might happen if there's only one class in the fold
            score = np.nan

        if eval_name is None:
            eval_name = name_prefix + target_column

        return {eval_name: score}

    return p
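
# A minimal sketch of how this factory can be used: building an uncurried MAE evaluator
# from sklearn's mean_absolute_error (already imported at the top of this module). The
# "mae_evaluator__" prefix is an arbitrary choice for the example.
mae_eval_fn = generic_sklearn_evaluator("mae_evaluator__", mean_absolute_error)
# mae_eval_fn(df) returns a dict like {"mae_evaluator__target": <mean absolute error>}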
@curry
def auc_evaluator(test_data: pd.DataFrame,
                  prediction_column: str = "prediction",
                  target_column: str = "target",
                  eval_name: str = None) -> EvalReturnType:
    """
    Computes the ROC AUC score, given true label and prediction scores.

    Parameters
    ----------
    test_data : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction scores.

    prediction_column : String
        The name of the column in `test_data` with the prediction scores.

    target_column : String
        The name of the column in `test_data` with the binary target.

    eval_name : String, optional (default=None)
        The name of the evaluator as it will appear in the logs.

    Returns
    -------
    log: dict
        A log-like dictionary with the ROC AUC Score
    """

    eval_fn = generic_sklearn_evaluator("auc_evaluator__", roc_auc_score)
    eval_data = test_data.assign(**{target_column: lambda df: df[target_column].astype(int)})

    return eval_fn(eval_data, prediction_column, target_column, eval_name)
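
# A minimal usage sketch, assuming a DataFrame with the default "prediction" and "target"
# columns (the values below are made up for the example). Because the evaluator is curried,
# it can be called directly on a DataFrame or partially applied first, e.g. for reuse in
# combined_evaluators or split_evaluator further down.
example_df = pd.DataFrame({"target": [0, 1, 0, 1],
                           "prediction": [0.2, 0.9, 0.4, 0.7]})

auc_evaluator(example_df)                        # {'auc_evaluator__target': 1.0}

auc_with_name = auc_evaluator(eval_name="auc")   # partially applied evaluator
auc_with_name(example_df)                        # {'auc': 1.0}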
@curry
def precision_evaluator(test_data: pd.DataFrame,
                        threshold: float = 0.5,
                        prediction_column: str = "prediction",
                        target_column: str = "target",
                        eval_name: str = None) -> EvalReturnType:
    """
    Computes the precision score, given true label and prediction scores.

    Parameters
    ----------
    test_data : pandas.DataFrame
        A Pandas' DataFrame with target and prediction scores.

    threshold : float
        A threshold for the prediction column above which samples will be classified as 1.

    prediction_column : str
        The name of the column in `test_data` with the prediction scores.

    target_column : str
        The name of the column in `test_data` with the binary target.

    eval_name : str, optional (default=None)
        The name of the evaluator as it will appear in the logs.

    Returns
    -------
    log: dict
        A log-like dictionary with the Precision Score
    """

    eval_fn = generic_sklearn_evaluator("precision_evaluator__", precision_score)
    eval_data = test_data.assign(**{prediction_column: (test_data[prediction_column] > threshold).astype(int)})

    return eval_fn(eval_data, prediction_column, target_column, eval_name)
@curry
def recall_evaluator(test_data: pd.DataFrame,
                     threshold: float = 0.5,
                     prediction_column: str = "prediction",
                     target_column: str = "target",
                     eval_name: str = None) -> EvalReturnType:
    """
    Computes the recall score, given true label and prediction scores.

    Parameters
    ----------
    test_data : pandas.DataFrame
        A Pandas' DataFrame with target and prediction scores.

    threshold : float
        A threshold for the prediction column above which samples will be classified as 1.

    prediction_column : str
        The name of the column in `test_data` with the prediction scores.

    target_column : str
        The name of the column in `test_data` with the binary target.

    eval_name : str, optional (default=None)
        The name of the evaluator as it will appear in the logs.

    Returns
    -------
    log: dict
        A log-like dictionary with the Recall Score
    """

    eval_data = test_data.assign(**{prediction_column: (test_data[prediction_column] > threshold).astype(int)})
    eval_fn = generic_sklearn_evaluator("recall_evaluator__", recall_score)

    return eval_fn(eval_data, prediction_column, target_column, eval_name)
@curry
def fbeta_score_evaluator(test_data: pd.DataFrame,
                          threshold: float = 0.5,
                          beta: float = 1.0,
                          prediction_column: str = "prediction",
                          target_column: str = "target",
                          eval_name: str = None) -> EvalReturnType:
    """
    Computes the F-beta score, given true label and prediction scores.

    Parameters
    ----------
    test_data : pandas.DataFrame
        A Pandas' DataFrame with target and prediction scores.

    threshold : float
        A threshold for the prediction column above which samples will be classified as 1.

    beta : float
        The beta parameter determines the weight of precision in the combined score.
        beta < 1 lends more weight to precision, while beta > 1 favors recall
        (beta -> 0 considers only precision, beta -> inf only recall).

    prediction_column : str
        The name of the column in `test_data` with the prediction scores.

    target_column : str
        The name of the column in `test_data` with the binary target.

    eval_name : str, optional (default=None)
        The name of the evaluator as it will appear in the logs.

    Returns
    -------
    log: dict
        A log-like dictionary with the F-beta Score
    """

    eval_data = test_data.assign(**{prediction_column: (test_data[prediction_column] > threshold).astype(int)})
    eval_fn = generic_sklearn_evaluator("fbeta_evaluator__", fbeta_score)

    return eval_fn(eval_data, prediction_column, target_column, eval_name, beta=beta)
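
# A minimal sketch for the threshold-based evaluators above (precision, recall, F-beta),
# which binarize the prediction column before scoring. Values are made up for the example.
clf_df = pd.DataFrame({"target": [0, 1, 1, 0],
                       "prediction": [0.3, 0.8, 0.6, 0.4]})

precision_evaluator(clf_df, threshold=0.5)              # {'precision_evaluator__target': 1.0}
fbeta_score_evaluator(clf_df, threshold=0.5, beta=2.0)  # beta > 1 weighs recall more heavily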
@curry
def logloss_evaluator(test_data: pd.DataFrame,
                      prediction_column: str = "prediction",
                      target_column: str = "target",
                      eval_name: str = None) -> EvalReturnType:
    """
    Computes the logloss score, given true label and prediction scores.

    Parameters
    ----------
    test_data : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction scores.

    prediction_column : String
        The name of the column in `test_data` with the prediction scores.

    target_column : String
        The name of the column in `test_data` with the binary target.

    eval_name : String, optional (default=None)
        The name of the evaluator as it will appear in the logs.

    Returns
    -------
    log: dict
        A log-like dictionary with the logloss score.
    """

    eval_fn = generic_sklearn_evaluator("logloss_evaluator__", log_loss)
    eval_data = test_data.assign(**{target_column: lambda df: df[target_column].astype(int)})

    return eval_fn(eval_data, prediction_column, target_column, eval_name)
@curry
def brier_score_evaluator(test_data: pd.DataFrame,
                          prediction_column: str = "prediction",
                          target_column: str = "target",
                          eval_name: str = None) -> EvalReturnType:
    """
    Computes the Brier score, given true label and prediction scores.

    Parameters
    ----------
    test_data : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction scores.

    prediction_column : String
        The name of the column in `test_data` with the prediction scores.

    target_column : String
        The name of the column in `test_data` with the binary target.

    eval_name : String, optional (default=None)
        The name of the evaluator as it will appear in the logs.

    Returns
    -------
    log: dict
        A log-like dictionary with the Brier score.
    """

    eval_fn = generic_sklearn_evaluator("brier_score_evaluator__", brier_score_loss)
    eval_data = test_data.assign(**{target_column: lambda df: df[target_column].astype(int)})

    return eval_fn(eval_data, prediction_column, target_column, eval_name)
@curry
def expected_calibration_error_evaluator(test_data: pd.DataFrame,
                                         prediction_column: str = "prediction",
                                         target_column: str = "target",
                                         eval_name: str = None,
                                         n_bins: int = 100,
                                         bin_choice: str = "count") -> EvalReturnType:
    """
    Computes the expected calibration error (ECE), given true label and prediction scores.
    See "On Calibration of Modern Neural Networks" (https://arxiv.org/abs/1706.04599) for more information.

    The ECE is the distance between the actual observed frequency and the predicted probabilities,
    for a given choice of bins. Perfect calibration results in a score of 0.

    For example, if for the bin [0, 0.1] we have the three data points:

    1. prediction: 0.1, actual: 0
    2. prediction: 0.05, actual: 1
    3. prediction: 0.0, actual: 0

    Then the predicted average is (0.1 + 0.05 + 0.00) / 3 = 0.05, and the empirical frequency
    is (0 + 1 + 0) / 3 = 1/3. Therefore, the distance for this bin is::

        |1/3 - 0.05| ~= 0.28.

    Graphical intuition::

        Actuals (empirical frequency between 0 and 1)
        |     *
        |   *
        | *
         ______ Predictions (probabilities between 0 and 1)

    Parameters
    ----------
    test_data : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction scores.

    prediction_column : String
        The name of the column in `test_data` with the prediction scores.

    target_column : String
        The name of the column in `test_data` with the binary target.

    eval_name : String, optional (default=None)
        The name of the evaluator as it will appear in the logs.

    n_bins : Int (default=100)
        The number of bins. This is a trade-off between the number of points in each bin and the
        probability range they span. You want a small enough range that still contains a significant
        number of points for the distance to work.

    bin_choice : String (default="count")
        Two possibilities:
        "count" for equally populated bins (e.g. uses `pandas.qcut` for the bins);
        "prob" for equally spaced probabilities (e.g. uses `pandas.cut` for the bins), with the
        distance weighted by the number of samples in each bin.

    Returns
    -------
    log: dict
        A log-like dictionary with the expected calibration error.
    """

    if eval_name is None:
        eval_name = "expected_calibration_error_evaluator__" + target_column

    if bin_choice == "count":
        bins = pd.qcut(test_data[prediction_column], q=n_bins)
    elif bin_choice == "prob":
        bins = pd.cut(test_data[prediction_column], bins=n_bins)
    else:
        raise AttributeError("Invalid bin_choice")

    metric_df = pd.DataFrame({"bins": bins,
                              "predictions": test_data[prediction_column],
                              "actuals": test_data[target_column]})

    agg_df = metric_df.groupby("bins").agg({"bins": "count", "predictions": "mean", "actuals": "mean"})

    sample_weight = None
    if bin_choice == "prob":
        sample_weight = agg_df["bins"].values

    distance = mean_absolute_error(agg_df["actuals"].values, agg_df["predictions"].values,
                                   sample_weight=sample_weight)

    return {eval_name: distance}
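
# A minimal sketch, assuming predictions that are well calibrated by construction, so the
# resulting ECE should be close to 0. Sample size, seed and bin settings are example choices.
rng = np.random.RandomState(42)
probs = rng.uniform(size=10000)
calibrated_df = pd.DataFrame({"prediction": probs,
                              "target": rng.binomial(1, probs)})

expected_calibration_error_evaluator(calibrated_df, n_bins=10, bin_choice="prob")
# {'expected_calibration_error_evaluator__target': <value close to 0>}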
@curry
def r2_evaluator(test_data: pd.DataFrame,
                 prediction_column: str = "prediction",
                 target_column: str = "target",
                 eval_name: str = None) -> EvalReturnType:
    """
    Computes the R2 score, given true label and predictions.

    Parameters
    ----------
    test_data : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction.

    prediction_column : String
        The name of the column in `test_data` with the prediction.

    target_column : String
        The name of the column in `test_data` with the continuous target.

    eval_name : String, optional (default=None)
        The name of the evaluator as it will appear in the logs.

    Returns
    -------
    log: dict
        A log-like dictionary with the R2 Score
    """

    eval_fn = generic_sklearn_evaluator("r2_evaluator__", r2_score)

    return eval_fn(test_data, prediction_column, target_column, eval_name)
@curry
def mse_evaluator(test_data: pd.DataFrame,
                  prediction_column: str = "prediction",
                  target_column: str = "target",
                  eval_name: str = None) -> EvalReturnType:
    """
    Computes the Mean Squared Error, given true label and predictions.

    Parameters
    ----------
    test_data : Pandas' DataFrame
        A Pandas' DataFrame with target and predictions.

    prediction_column : String
        The name of the column in `test_data` with the predictions.

    target_column : String
        The name of the column in `test_data` with the continuous target.

    eval_name : String, optional (default=None)
        The name of the evaluator as it will appear in the logs.

    Returns
    -------
    log: dict
        A log-like dictionary with the MSE Score
    """

    eval_fn = generic_sklearn_evaluator("mse_evaluator__", mean_squared_error)

    return eval_fn(test_data, prediction_column, target_column, eval_name)
@curry
def mean_prediction_evaluator(test_data: pd.DataFrame,
                              prediction_column: str = "prediction",
                              eval_name: str = None) -> EvalReturnType:
    """
    Computes the mean of the specified column.

    Parameters
    ----------
    test_data : Pandas' DataFrame
        A Pandas' DataFrame with a column to compute the mean

    prediction_column : String
        The name of the column in `test_data` to compute the mean.

    eval_name : String, optional (default=None)
        The name of the evaluator as it will appear in the logs.

    Returns
    -------
    log: dict
        A log-like dictionary with the column mean
    """

    if eval_name is None:
        eval_name = 'mean_evaluator__' + prediction_column

    return {eval_name: test_data[prediction_column].mean()}
@curry
def correlation_evaluator(test_data: pd.DataFrame,
                          prediction_column: str = "prediction",
                          target_column: str = "target",
                          eval_name: str = None) -> EvalReturnType:
    """
    Computes the Pearson correlation between prediction and target.

    Parameters
    ----------
    test_data : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction.

    prediction_column : String
        The name of the column in `test_data` with the prediction.

    target_column : String
        The name of the column in `test_data` with the continuous target.

    eval_name : String, optional (default=None)
        The name of the evaluator as it will appear in the logs.

    Returns
    -------
    log: dict
        A log-like dictionary with the Pearson correlation
    """

    if eval_name is None:
        eval_name = "correlation_evaluator__" + target_column

    score = test_data[[prediction_column, target_column]].corr(method="pearson").iloc[0, 1]

    return {eval_name: score}
@curry
def spearman_evaluator(test_data: pd.DataFrame,
                       prediction_column: str = "prediction",
                       target_column: str = "target",
                       eval_name: str = None) -> EvalReturnType:
    """
    Computes the Spearman correlation between prediction and target.
    The Spearman correlation evaluates the rank order between two variables:
    https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient

    Parameters
    ----------
    test_data : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction.

    prediction_column : String
        The name of the column in `test_data` with the prediction.

    target_column : String
        The name of the column in `test_data` with the continuous target.

    eval_name : String, optional (default=None)
        The name of the evaluator as it will appear in the logs.

    Returns
    -------
    log: dict
        A log-like dictionary with the Spearman correlation
    """

    if eval_name is None:
        eval_name = "spearman_evaluator__" + target_column

    score = test_data[[prediction_column, target_column]].corr(method="spearman").iloc[0, 1]

    return {eval_name: score}
@curry
def combined_evaluators(test_data: pd.DataFrame,
                        evaluators: List[EvalFnType]) -> EvalReturnType:
    """
    Combines partially applied evaluation functions.

    Parameters
    ----------
    test_data : Pandas' DataFrame
        A Pandas' DataFrame to apply the evaluators on

    evaluators : List
        List of evaluator functions

    Returns
    -------
    log: dict
        A log-like dictionary with the merged results of all evaluators
    """

    return fp.merge(e(test_data) for e in evaluators)
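
# A minimal sketch: merging several partially applied evaluators into a single log dict.
# Column names follow the evaluators' defaults.
combined_eval_fn = combined_evaluators(evaluators=[auc_evaluator, logloss_evaluator])
# combined_eval_fn(df) -> {'auc_evaluator__target': ..., 'logloss_evaluator__target': ...}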
@curry
def split_evaluator(test_data: pd.DataFrame,
                    eval_fn: EvalFnType,
                    split_col: str,
                    split_values: Iterable = None,
                    eval_name: str = None) -> EvalReturnType:
    """
    Splits the dataset into the categories in `split_col` and evaluates model performance in each split.
    Useful when you believe the model performance differs in a sub population defined by `split_col`.

    Parameters
    ----------
    test_data : Pandas' DataFrame
        A Pandas' DataFrame with target and predictions.

    eval_fn : function DataFrame -> Log Dict
        A partially applied evaluation function.

    split_col : String
        The name of the column in `test_data` to split by.

    split_values : Array, optional (default=None)
        An Array to split by. If not provided, `test_data[split_col].unique()` will be used.

    eval_name : String, optional (default=None)
        The name of the evaluator as it will appear in the logs.

    Returns
    -------
    log: dict
        A log-like dictionary with evaluation results by split.
    """

    if split_values is None:
        split_values = test_data[split_col].unique()

    if eval_name is None:
        eval_name = 'split_evaluator__' + split_col

    return {eval_name + "_" + str(value): eval_fn(test_data.loc[lambda df: df[split_col] == value])
            for value in split_values}
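
# A minimal sketch, assuming test data with a categorical "region" column (an example name,
# not required by the function), evaluating the AUC separately per region.
auc_by_region = split_evaluator(eval_fn=auc_evaluator, split_col="region")
# auc_by_region(df) -> {'split_evaluator__region_<value>': {'auc_evaluator__target': ...}, ...}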
@curry
def temporal_split_evaluator(test_data: pd.DataFrame,
                             eval_fn: EvalFnType,
                             time_col: str,
                             time_format: str = "%Y-%m",
                             split_values: Iterable[str] = None,
                             eval_name: str = None) -> EvalReturnType:
    """
    Splits the dataset into temporal categories by `time_col` and evaluates model performance in each split.

    The splits are implicitly defined by the `time_format`. For example, for the default
    time format ("%Y-%m"), we will split by year and month.

    Parameters
    ----------
    test_data : Pandas' DataFrame
        A Pandas' DataFrame with target and predictions.

    eval_fn : function DataFrame -> Log Dict
        A partially applied evaluation function.

    time_col : String
        The name of the column in `test_data` to split by.

    time_format : String
        The way to format the `time_col` into temporal categories.

    split_values : Array of String, optional (default=None)
        An array of date formatted strings to split the evaluation by. If not provided,
        all unique formatted dates will be used.

    eval_name : String, optional (default=None)
        The name of the evaluator as it will appear in the logs.

    Returns
    -------
    log: dict
        A log-like dictionary with evaluation results by split.
    """

    formatted_time_col = test_data[time_col].dt.strftime(time_format)
    unique_values = formatted_time_col.unique()

    if eval_name is None:
        eval_name = 'split_evaluator__' + time_col

    if split_values is None:
        split_values = unique_values
    else:
        assert all(sv in unique_values for sv in split_values), (
            "All split values must be present in the column (after date formatting it)")

    return {eval_name + "_" + str(value): eval_fn(test_data.loc[lambda df: formatted_time_col == value])
            for value in split_values}
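
# A minimal sketch, assuming test data with a datetime column named "date" (an example name),
# evaluating the AUC per month using the default "%Y-%m" format.
auc_by_month = temporal_split_evaluator(eval_fn=auc_evaluator, time_col="date",
                                        time_format="%Y-%m")
# auc_by_month(df) -> {'split_evaluator__date_2019-01': {'auc_evaluator__target': ...}, ...}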
@curry
def permutation_evaluator(test_data: pd.DataFrame,
                          predict_fn: PredictFnType,
                          eval_fn: EvalFnType,
                          baseline: bool = True,
                          features: List[str] = None,
                          shuffle_all_at_once: bool = False,
                          random_state: int = None) -> EvalReturnType:
    """
    Permutation importance evaluator. It works by shuffling one or more features of the
    test_data dataframe, getting the predictions with predict_fn, and evaluating the
    results with eval_fn.

    Parameters
    ----------
    test_data : Pandas' DataFrame
        A Pandas' DataFrame with target, predictions and features.

    predict_fn : function DataFrame -> DataFrame
        Function that receives the input dataframe and returns a dataframe with the pipeline predictions.

    eval_fn : function DataFrame -> Log Dict
        A partially applied evaluation function.

    baseline : bool
        Also evaluates the predict_fn on an unshuffled baseline.

    features : List of String
        The features to shuffle and then evaluate eval_fn on the shuffled results.
        The default case shuffles all dataframe columns.

    shuffle_all_at_once : bool
        Shuffle all features at once instead of one per turn.

    random_state : int
        Seed to be used by the random number generator.

    Returns
    -------
    log: dict
        A log-like dictionary with evaluation results by feature shuffle.
        Use the permutation_extractor for better visualization of the results.
    """

    if features is None:
        features = list(test_data.columns)

    def col_shuffler(f: str) -> np.ndarray:
        return test_data[f].sample(frac=1.0, random_state=random_state).values

    def permutation_eval(features_to_shuffle: List[str]) -> EvalReturnType:
        shuffled_cols = {f: col_shuffler(f) for f in features_to_shuffle}
        return eval_fn(predict_fn(test_data.assign(**shuffled_cols)))

    if shuffle_all_at_once:
        permutation_results = {'-'.join(features): permutation_eval(features)}
    else:
        permutation_results = {f: permutation_eval([f]) for f in features}

    feature_importance = {'permutation_importance': permutation_results}

    if baseline:
        baseline_results = {'permutation_importance_baseline': eval_fn(predict_fn(test_data))}
    else:
        baseline_results = {}

    return fp.merge(feature_importance, baseline_results)
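
# A minimal sketch, assuming a fitted prediction function and two example feature names.
# The stand-in predict function below only illustrates the expected signature
# (DataFrame in, DataFrame with a prediction column out); a real fklearn pipeline's
# predict_fn would be used instead.
def example_predict_fn(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(prediction=df["feature_a"])

permutation_eval = permutation_evaluator(predict_fn=example_predict_fn,
                                         eval_fn=auc_evaluator,
                                         features=["feature_a", "feature_b"],
                                         random_state=42)
# permutation_eval(df) -> {'permutation_importance': {'feature_a': {...}, 'feature_b': {...}},
#                          'permutation_importance_baseline': {...}}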
@curry
def hash_evaluator(test_data: pd.DataFrame,
                   hash_columns: List[str] = None,
                   eval_name: str = None,
                   consider_index: bool = False) -> EvalReturnType:
    """
    Computes the hash of a pandas dataframe, filtered by hash columns.
    The purpose is to uniquely identify a dataframe, to be able to check if two dataframes are equal or not.

    Parameters
    ----------
    test_data : Pandas' DataFrame
        A Pandas' DataFrame to be hashed.

    hash_columns : List[str], optional (default=None)
        A list of column names to filter the dataframe before hashing. If None, it will
        hash the dataframe with all the columns.

    eval_name : String, optional (default=None)
        The name of the evaluator as it will appear in the logs.

    consider_index : bool, optional (default=False)
        If true, will consider the index of the dataframe to calculate the hash. The
        default behaviour will ignore the index and just hash the content of the features.

    Returns
    -------
    log: dict
        A log-like dictionary with the hash of the dataframe
    """

    if hash_columns is None:
        hash_columns = test_data.columns

    def calculate_dataframe_hash(df: pd.DataFrame, eval_name: str) -> EvalReturnType:
        # Get the hashes per row, then sum them all into a single value
        return {eval_name: hash_pandas_object(df).sum()}

    if eval_name is None:
        eval_name = "hash_evaluator__" + "_".join(sorted(hash_columns))

    eval_data = test_data[hash_columns]
    if not consider_index:
        # set 0 for all indexes
        return calculate_dataframe_hash(eval_data.set_index(np.zeros(len(eval_data), dtype="int")), eval_name)

    return calculate_dataframe_hash(eval_data, eval_name)
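
# A minimal sketch: two DataFrames with the same content but different indexes hash to the
# same value under the default consider_index=False. The data below is made up for the example.
df_a = pd.DataFrame({"x": [1, 2, 3]})
df_b = pd.DataFrame({"x": [1, 2, 3]}, index=[10, 11, 12])

hash_x = hash_evaluator(hash_columns=["x"])
assert hash_x(df_a) == hash_x(df_b)  # indexes are ignored by default
assert hash_x(df_a, consider_index=True) != hash_x(df_b, consider_index=True)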