# Source code for fklearn.causal.validation.auc

import pandas as pd
from toolz import curry

from fklearn.types import EffectFnType
from fklearn.causal.validation.curves import cumulative_effect_curve
from fklearn.causal.effects import linear_effect


@curry
def area_under_the_cumulative_effect_curve(df: pd.DataFrame,
                                           treatment: str,
                                           outcome: str,
                                           prediction: str,
                                           min_rows: int = 30,
                                           steps: int = 100,
                                           effect_fn: EffectFnType = linear_effect) -> float:
    """
    Orders the dataset by prediction and computes the area under the cumulative effect curve, according to that
    ordering.

    Each point of the cumulative effect curve is compared with the effect estimated on the whole dataset; the
    differences are weighted by the fraction of rows covered by the corresponding step and summed. The absolute
    value of that sum is returned.

    Parameters
    ----------
    df : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction scores.

    treatment : str
        The name of the treatment column in `df`.

    outcome : str
        The name of the outcome column in `df`.

    prediction : str
        The name of the prediction column in `df`.

    min_rows : int
        Minimum number of observations needed to have a valid result.

    steps : int
        The number of cumulative steps to iterate when accumulating the effect

    effect_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> int or Array of int
        A function that computes the treatment effect given a dataframe, the name of the treatment column and the name
        of the outcome column.

    Returns
    -------
    area_under_the_cumulative_effect_curve : float
        The area under the cumulative effect curve according to the predictions ordering.

    Raises
    ------
    ValueError
        If `df` has fewer rows than `steps`, because the row-count stride `len(df) // steps` is then zero.
    """

    # Effect estimated on the full dataset; used as the baseline subtracted from every curve point.
    ate = effect_fn(df, treatment, outcome)
    size = df.shape[0]

    # Row counts at which the curve is evaluated. Assumed to match the grid used inside
    # `cumulative_effect_curve` so that the zip below pairs each effect with its step - TODO confirm.
    n_rows = list(range(min_rows, size, size // steps)) + [size]

    # Width of each step in rows: the first step spans `min_rows`, the others the gap between consecutive counts.
    # NOTE: when min_rows >= size the range above is empty, so the single step keeps width `min_rows`.
    step_sizes = [min_rows] + [t - s for s, t in zip(n_rows, n_rows[1:])]

    cum_effect = cumulative_effect_curve(df=df, treatment=treatment, outcome=outcome, prediction=prediction,
                                         min_rows=min_rows, steps=steps, effect_fn=effect_fn)

    return abs(sum((effect - ate) * (step_size / size) for effect, step_size in zip(cum_effect, step_sizes)))


@curry
def area_under_the_cumulative_gain_curve(df: pd.DataFrame,
                                         treatment: str,
                                         outcome: str,
                                         prediction: str,
                                         min_rows: int = 30,
                                         steps: int = 100,
                                         effect_fn: EffectFnType = linear_effect) -> float:
    """
    Orders the dataset by prediction and computes the area under the cumulative gain curve, according to that ordering.

    Each point of the cumulative effect curve is scaled by the fraction of rows it was computed on, weighted by the
    fraction of rows covered by the corresponding step, and summed. The absolute value of that sum is returned.

    Parameters
    ----------
    df : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction scores.

    treatment : str
        The name of the treatment column in `df`.

    outcome : str
        The name of the outcome column in `df`.

    prediction : str
        The name of the prediction column in `df`.

    min_rows : int
        Minimum number of observations needed to have a valid result.

    steps : int
        The number of cumulative steps to iterate when accumulating the effect

    effect_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> int or Array of int
        A function that computes the treatment effect given a dataframe, the name of the treatment column and the name
        of the outcome column.

    Returns
    -------
    area_under_the_cumulative_gain_curve : float
        The area under the cumulative gain curve according to the predictions ordering.

    Raises
    ------
    ValueError
        If `df` has fewer rows than `steps`, because the row-count stride `len(df) // steps` is then zero.
    """

    size = df.shape[0]

    # Row counts at which the curve is evaluated. Assumed to match the grid used inside
    # `cumulative_effect_curve` so that the zip below pairs each effect with its row count - TODO confirm.
    n_rows = list(range(min_rows, size, size // steps)) + [size]

    # Width of each step in rows: the first step spans `min_rows`, the others the gap between consecutive counts.
    # NOTE: when min_rows >= size the range above is empty, so the single step keeps width `min_rows`.
    step_sizes = [min_rows] + [t - s for s, t in zip(n_rows, n_rows[1:])]

    cum_effect = cumulative_effect_curve(df=df, treatment=treatment, outcome=outcome, prediction=prediction,
                                         min_rows=min_rows, steps=steps, effect_fn=effect_fn)

    # effect * (rows / size) is the gain at that point; (step_size / size) is the width of the step.
    return abs(sum(effect * (rows / size) * (step_size / size)
                   for rows, effect, step_size in zip(n_rows, cum_effect, step_sizes)))


@curry
def area_under_the_relative_cumulative_gain_curve(df: pd.DataFrame,
                                                  treatment: str,
                                                  outcome: str,
                                                  prediction: str,
                                                  min_rows: int = 30,
                                                  steps: int = 100,
                                                  effect_fn: EffectFnType = linear_effect) -> float:
    """
    Orders the dataset by prediction and computes the area under the relative cumulative gain curve, according to that
    ordering.

    Each point of the cumulative effect curve has the effect estimated on the whole dataset subtracted from it, is
    scaled by the fraction of rows it was computed on, weighted by the fraction of rows covered by the corresponding
    step, and summed. The absolute value of that sum is returned.

    Parameters
    ----------
    df : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction scores.

    treatment : str
        The name of the treatment column in `df`.

    outcome : str
        The name of the outcome column in `df`.

    prediction : str
        The name of the prediction column in `df`.

    min_rows : int
        Minimum number of observations needed to have a valid result.

    steps : int
        The number of cumulative steps to iterate when accumulating the effect

    effect_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> int or Array of int
        A function that computes the treatment effect given a dataframe, the name of the treatment column and the name
        of the outcome column.

    Returns
    -------
    area_under_the_relative_cumulative_gain_curve : float
        The area under the relative cumulative gain curve according to the predictions ordering.

    Raises
    ------
    ValueError
        If `df` has fewer rows than `steps`, because the row-count stride `len(df) // steps` is then zero.
    """

    # Effect estimated on the full dataset; used as the baseline subtracted from every curve point.
    ate = effect_fn(df, treatment, outcome)
    size = df.shape[0]

    # Row counts at which the curve is evaluated. Assumed to match the grid used inside
    # `cumulative_effect_curve` so that the zip below pairs each effect with its row count - TODO confirm.
    n_rows = list(range(min_rows, size, size // steps)) + [size]

    # Width of each step in rows: the first step spans `min_rows`, the others the gap between consecutive counts.
    # NOTE: when min_rows >= size the range above is empty, so the single step keeps width `min_rows`.
    step_sizes = [min_rows] + [t - s for s, t in zip(n_rows, n_rows[1:])]

    cum_effect = cumulative_effect_curve(df=df, treatment=treatment, outcome=outcome, prediction=prediction,
                                         min_rows=min_rows, steps=steps, effect_fn=effect_fn)

    # (effect - ate) * (rows / size) is the relative gain at that point; (step_size / size) is the step width.
    return abs(sum((effect - ate) * (rows / size) * (step_size / size)
                   for rows, effect, step_size in zip(n_rows, cum_effect, step_sizes)))