Source code for fklearn.causal.validation.curves

from typing import List

import numpy as np
import pandas as pd
from toolz import curry, partial

from fklearn.types import EffectFnType
from fklearn.causal.effects import linear_effect


@curry
def effect_by_segment(df: pd.DataFrame,
                      treatment: str,
                      outcome: str,
                      prediction: str,
                      segments: int = 10,
                      effect_fn: EffectFnType = linear_effect) -> pd.Series:
    """
    Splits the dataset into quantile bands of the prediction and estimates the
    treatment effect inside each band.

    Parameters
    ----------
    df : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction scores.

    treatment : Strings
        The name of the treatment column in `df`.

    outcome : Strings
        The name of the outcome column in `df`.

    prediction : Strings
        The name of the prediction column in `df`.

    segments : Integer
        The number of segments to create. Passed as `q` to Pandas' qcut.

    effect_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> int or Array of int
        A function that computes the treatment effect given a dataframe, the name of the treatment column
        and the name of the outcome column. Here it is invoked once per band, with the column names passed
        as the keyword arguments `treatment_column` and `outcome_column`.

    Returns
    ----------
    effect by band : Pandas' Series
        The effect stored in a Pandas' Series where the indexes are the segments.
    """
    band_column = f"{prediction}_band"

    # Label every row with the quantile interval its prediction falls into.
    quantile_bands = pd.qcut(df[prediction], q=segments)
    banded_df = df.assign(**{band_column: quantile_bands})

    # Freeze the column names so that groupby-apply only has to supply the sub-frame.
    estimate_effect = partial(effect_fn, treatment_column=treatment, outcome_column=outcome)

    return banded_df.groupby(band_column).apply(estimate_effect)
@curry
def cumulative_effect_curve(df: pd.DataFrame,
                            treatment: str,
                            outcome: str,
                            prediction: str,
                            min_rows: int = 30,
                            steps: int = 100,
                            effect_fn: EffectFnType = linear_effect) -> np.ndarray:
    """
    Orders the dataset by prediction (highest first) and computes the cumulative effect curve
    according to that ordering.

    The effect is estimated on growing prefixes of the ordered dataset: starting at `min_rows` rows,
    growing by `size // steps` rows at a time, and always finishing with the whole dataset.

    Parameters
    ----------
    df : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction scores.

    treatment : Strings
        The name of the treatment column in `df`.

    outcome : Strings
        The name of the outcome column in `df`.

    prediction : Strings
        The name of the prediction column in `df`.

    min_rows : Integer
        Minimum number of observations needed to have a valid result.

    steps : Integer
        The number of cumulative steps to iterate when accumulating the effect.
        When `df` has fewer rows than `steps`, the curve advances one row at a time.

    effect_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> int or Array of int
        A function that computes the treatment effect given a dataframe, the name of the treatment column
        and the name of the outcome column.

    Returns
    ----------
    cumulative effect curve: Numpy's Array
        The cumulative treatment effect according to the predictions ordering.
    """
    size = df.shape[0]
    ordered_df = df.sort_values(prediction, ascending=False).reset_index(drop=True)

    # `size // steps` is 0 whenever the dataset has fewer rows than `steps`, and `range`
    # raises ValueError on a zero step. Fall back to a step of one row in that case only.
    step = (size // steps) or 1
    n_rows = list(range(min_rows, size, step)) + [size]

    return np.array([effect_fn(ordered_df.head(rows), treatment, outcome) for rows in n_rows])
@curry
def cumulative_gain_curve(df: pd.DataFrame,
                          treatment: str,
                          outcome: str,
                          prediction: str,
                          min_rows: int = 30,
                          steps: int = 100,
                          effect_fn: EffectFnType = linear_effect) -> np.ndarray:
    """
    Orders the dataset by prediction and computes the cumulative gain (effect * proportional sample size)
    curve according to that ordering.

    Parameters
    ----------
    df : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction scores.

    treatment : Strings
        The name of the treatment column in `df`.

    outcome : Strings
        The name of the outcome column in `df`.

    prediction : Strings
        The name of the prediction column in `df`.

    min_rows : Integer
        Minimum number of observations needed to have a valid result.

    steps : Integer
        The number of cumulative steps to iterate when accumulating the effect.
        When `df` has fewer rows than `steps`, the curve advances one row at a time.

    effect_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> int or Array of int
        A function that computes the treatment effect given a dataframe, the name of the treatment column
        and the name of the outcome column.

    Returns
    ----------
    cumulative gain curve: Numpy's Array
        The cumulative gain according to the predictions ordering.
    """
    size = df.shape[0]

    # `size // steps` is 0 whenever the dataset has fewer rows than `steps`, and `range`
    # raises ValueError on a zero step. Fall back to a step of one row in that case only.
    step = (size // steps) or 1
    n_rows = list(range(min_rows, size, step)) + [size]

    cum_effect = cumulative_effect_curve(df=df, treatment=treatment, outcome=outcome, prediction=prediction,
                                         min_rows=min_rows, steps=steps, effect_fn=effect_fn)

    # Scale each cumulative effect by the fraction of the dataset it was estimated on.
    return np.array([effect * (rows / size) for rows, effect in zip(n_rows, cum_effect)])
@curry
def relative_cumulative_gain_curve(df: pd.DataFrame,
                                   treatment: str,
                                   outcome: str,
                                   prediction: str,
                                   min_rows: int = 30,
                                   steps: int = 100,
                                   effect_fn: EffectFnType = linear_effect) -> np.ndarray:
    """
    Orders the dataset by prediction and computes the relative cumulative gain curve according to that
    ordering. The relative gain is simply the cumulative effect minus the Average Treatment Effect (ATE)
    times the relative sample size.

    Parameters
    ----------
    df : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction scores.

    treatment : Strings
        The name of the treatment column in `df`.

    outcome : Strings
        The name of the outcome column in `df`.

    prediction : Strings
        The name of the prediction column in `df`.

    min_rows : Integer
        Minimum number of observations needed to have a valid result.

    steps : Integer
        The number of cumulative steps to iterate when accumulating the effect.
        When `df` has fewer rows than `steps`, the curve advances one row at a time.

    effect_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> int or Array of int
        A function that computes the treatment effect given a dataframe, the name of the treatment column
        and the name of the outcome column.

    Returns
    ----------
    relative cumulative gain curve: Numpy's Array
        The relative cumulative gain according to the predictions ordering.
    """
    # The ATE is the effect estimated on the whole, unordered dataset.
    ate = effect_fn(df, treatment, outcome)
    size = df.shape[0]

    # `size // steps` is 0 whenever the dataset has fewer rows than `steps`, and `range`
    # raises ValueError on a zero step. Fall back to a step of one row in that case only.
    step = (size // steps) or 1
    n_rows = list(range(min_rows, size, step)) + [size]

    cum_effect = cumulative_effect_curve(df=df, treatment=treatment, outcome=outcome, prediction=prediction,
                                         min_rows=min_rows, steps=steps, effect_fn=effect_fn)

    # Gain over the ATE baseline, weighted by the fraction of the dataset used at each step.
    return np.array([(effect - ate) * (rows / size) for rows, effect in zip(n_rows, cum_effect)])
@curry
def effect_curves(
    df: pd.DataFrame,
    treatment: str,
    outcome: str,
    prediction: str,
    min_rows: int = 30,
    steps: int = 100,
    effect_fn: EffectFnType = linear_effect,
) -> pd.DataFrame:
    """
    Creates a dataset summarizing the effect curves: cumulative effect, cumulative gain and
    relative cumulative gain. The dataset also contains two columns referencing the data
    used to compute the curves at each step: number of samples and fraction of samples used.
    Moreover one column indicating the cumulative gain for a corresponding random model is
    also included as a benchmark.

    Parameters
    ----------
    df : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction scores.

    treatment : Strings
        The name of the treatment column in `df`.

    outcome : Strings
        The name of the outcome column in `df`.

    prediction : Strings
        The name of the prediction column in `df`.

    min_rows : Integer
        Minimum number of observations needed to have a valid result.

    steps : Integer
        The number of cumulative steps to iterate when accumulating the effect.
        When `df` has fewer rows than `steps`, the curves advance one row at a time.

    effect_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> int or Array of int
        A function that computes the treatment effect given a dataframe, the name of the treatment column
        and the name of the outcome column.

    Returns
    ----------
    summary curves dataset: pd.DataFrame
        The dataset with the results for multiple validation causal curves according to the
        predictions ordering. One row per cumulative step, with the columns `samples_count`,
        `cumulative_effect_curve`, `samples_fraction`, `cumulative_gain_curve`,
        `random_model_cumulative_gain_curve` and `relative_cumulative_gain_curve`.
    """
    size: int = df.shape[0]

    # `size // steps` is 0 whenever the dataset has fewer rows than `steps`, and `range`
    # raises ValueError on a zero step. Fall back to a step of one row in that case only.
    step: int = (size // steps) or 1
    n_rows: List[int] = list(range(min_rows, size, step)) + [size]

    cum_effect: np.ndarray = cumulative_effect_curve(
        df=df,
        treatment=treatment,
        outcome=outcome,
        prediction=prediction,
        min_rows=min_rows,
        steps=steps,
        effect_fn=effect_fn,
    )

    # The last point of the cumulative curve is estimated on the whole dataset,
    # so it is used as the ATE for the random-model benchmark.
    ate: float = cum_effect[-1]

    return pd.DataFrame({"samples_count": n_rows, "cumulative_effect_curve": cum_effect}).assign(
        samples_fraction=lambda x: x["samples_count"] / size,
        cumulative_gain_curve=lambda x: x["samples_fraction"] * x["cumulative_effect_curve"],
        random_model_cumulative_gain_curve=lambda x: x["samples_fraction"] * ate,
        relative_cumulative_gain_curve=lambda x: (
            x["samples_fraction"] * x["cumulative_effect_curve"] - x["random_model_cumulative_gain_curve"]
        ),
    )