Source code for fklearn.training.calibration

import pandas as pd
import sklearn

from sklearn.isotonic import IsotonicRegression
from toolz import curry

from fklearn.common_docstrings import learner_pred_fn_docstring, learner_return_docstring
from fklearn.types import LearnerReturnType
from fklearn.training.utils import log_learner_time


@curry
@log_learner_time(learner_name='isotonic_calibration_learner')
def isotonic_calibration_learner(df: pd.DataFrame,
                                 target_column: str = "target",
                                 prediction_column: str = "prediction",
                                 output_column: str = "calibrated_prediction",
                                 y_min: float = 0.0,
                                 y_max: float = 1.0) -> LearnerReturnType:
    """
    Fits a single feature isotonic regression that maps raw predictions
    to calibrated predictions.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame holding at least `prediction_column` and
        `target_column`. The regression is fitted to predict the target
        from the uncalibrated prediction.

    target_column : str
        The name of the column in `df` that should be used as target for
        the model. This column should be binary, since this is a
        classification model.

    prediction_column : str
        The name of the column with the uncalibrated predictions from the
        model. It is the only feature given to the isotonic regression.

    output_column : str
        The name of the column that the prediction function adds with the
        calibrated predictions.

    y_min : float
        Lower bound of Isotonic Regression

    y_max : float
        Upper bound of Isotonic Regression
    """

    calibrator = IsotonicRegression(y_min=y_min, y_max=y_max, out_of_bounds='clip')
    calibrator.fit(df[prediction_column], df[target_column])

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        # `assign` returns a new frame, so the caller's DataFrame is left untouched.
        calibrated = calibrator.predict(new_df[prediction_column])
        return new_df.assign(**{output_column: calibrated})

    p.__doc__ = learner_pred_fn_docstring("isotonic_calibration_learner")

    learner_log = {
        'output_column': output_column,
        'target_column': target_column,
        'prediction_column': prediction_column,
        'package': "sklearn",
        'package_version': sklearn.__version__,
        'training_samples': len(df),
    }
    # The fitted estimator is stored next to (not inside) the per-learner entry.
    log = {'isotonic_calibration_learner': learner_log, 'object': calibrator}

    return p, p(df), log


# The description of the returned (predict_fn, scored_df, log) triple is appended
# to the docstring from the shared template.
isotonic_calibration_learner.__doc__ += learner_return_docstring("Isotonic Calibration")
@curry
@log_learner_time(learner_name='find_thresholds_with_same_risk')
def find_thresholds_with_same_risk(df: pd.DataFrame,
                                   sensitive_factor: str,
                                   unfair_band_column: str,
                                   model_prediction_output: str,
                                   target_column: str = "target",
                                   output_column_name: str = "fair_band") -> LearnerReturnType:
    """
    Calculate fair calibration, where for each band any sensitive factor group
    have the same target mean.

    The target mean of every original band is computed over the whole dataset.
    Then, separately for each sensitive group, rows are walked in increasing
    order of `model_prediction_output` and a threshold is placed at the row
    whose running target mean (counted from the previous threshold) is closest
    to the target mean of the corresponding original band. The resulting
    per-group thresholds are used as bin edges to assign the fair band.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame containing the sensitive factor, original band,
        prediction and target columns.

    sensitive_factor : str
        Column where we have the different group classifications that we want
        to have the same target mean

    unfair_band_column : str
        Column with the original bands

    model_prediction_output : str
        Risk model's output

    target_column : str
        The name of the column in `df` that should be used as target for the
        model. This column should be binary, since this is a classification
        model.

    output_column_name : str
        The name of the column with the fair bins.
    """

    sorted_df = df.sort_values(by=model_prediction_output).reset_index(drop=True)

    def _find_thresholds_with_same_risk(group_df: pd.DataFrame, metric_by_band: pd.DataFrame) -> list:
        # Bin edges for one sensitive group; `group_df` is expected to be sorted by prediction.

        # Taken before the loop: the loop keeps narrowing `remaining` and may leave it
        # empty (early break), in which case its max would be NaN and the last bin
        # edge would be lost.
        max_prediction = group_df[model_prediction_output].max()

        # NOTE(review): the initial lower edge of -1 presumes predictions are greater
        # than -1 (e.g. probabilities or ECDF values) - confirm with callers.
        current_threshold = -1
        fair_thresholds = [current_threshold]
        remaining = group_df

        for _, metric in metric_by_band.iterrows():
            # Only rows above the last threshold can belong to the next band.
            remaining = remaining[remaining[model_prediction_output] > current_threshold]

            if remaining.empty:
                break

            # Running target mean from the previous threshold up to each row.
            cumulative_risk = remaining[target_column].expanding(min_periods=1).mean()
            # Build a new frame instead of writing columns into a filtered slice
            # (avoids pandas' chained-assignment warning).
            scored = remaining.assign(distance=(cumulative_risk - metric[target_column]).abs())

            threshold = scored.sort_values(by="distance").iloc[0][model_prediction_output]

            fair_thresholds.append(threshold)
            current_threshold = threshold

        # The last edge is stretched to the group's maximum prediction so the top
        # band covers every row of the group seen in training.
        fair_thresholds[-1] = max_prediction

        return fair_thresholds

    unfair_bands = sorted(sorted_df[unfair_band_column].unique())
    metric_by_band = sorted_df.groupby(unfair_band_column).agg({target_column: "mean"})
    # NOTE(review): this truthiness filter also skips group labels such as 0, False
    # or "" - rows of such groups keep a missing fair band. Confirm this is intended.
    sensitive_groups = [group for group in sorted_df[sensitive_factor].unique() if group]

    fair_thresholds = {}

    for group in sensitive_groups:
        group_rows = sorted_df[sorted_df[sensitive_factor] == group]
        raw_ecdf_with_target = group_rows[[model_prediction_output, target_column]]
        fair_thresholds[group] = _find_thresholds_with_same_risk(raw_ecdf_with_target, metric_by_band)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        # NOTE(review): despite the annotation, only the `output_column_name` column
        # is returned (a single-column selection), not the whole frame.
        new_df_copy = new_df.copy()
        # Start with an all-missing column; rows not matched by any group stay missing.
        new_df_copy[output_column_name] = pd.Series(dtype='int')

        for group in sensitive_groups:
            group_filter = new_df_copy[sensitive_factor] == group
            # A group may have produced fewer thresholds than there are original bands.
            n_of_bands = len(fair_thresholds[group]) - 1
            new_df_copy.loc[group_filter, output_column_name] = pd.cut(
                new_df_copy.loc[group_filter, model_prediction_output],
                bins=fair_thresholds[group],
                labels=unfair_bands[:n_of_bands]).astype(float)

        return new_df_copy[output_column_name]

    p.__doc__ = learner_pred_fn_docstring("find_thresholds_with_same_risk")

    log = {'find_thresholds_with_same_risk': {
        'output_column': output_column_name,
        'prediction_ecdf': model_prediction_output,
        'target_column': target_column,
        'unfair_band_column': unfair_band_column,
        'sensitive_factor': sensitive_factor,
        'fair_thresholds': fair_thresholds}}

    return p, p(df), log


# The description of the returned (predict_fn, scored_df, log) triple is appended
# to the docstring from the shared template.
find_thresholds_with_same_risk.__doc__ += learner_return_docstring("find_thresholds_with_same_risk")