from typing import List
import numpy as np
import pandas as pd
from sklearn.base import RegressorMixin
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict
from statsmodels.formula.api import ols
from toolz import curry, merge
from typing import Dict, Any
@curry
def debias_with_regression(df: pd.DataFrame,
                           treatment_column: str,
                           outcome_column: str,
                           confounder_columns: List[str],
                           suffix: str = "_debiased",
                           denoise: bool = True) -> pd.DataFrame:
    """
    Frisch-Waugh-Lovell style debiasing with linear regression.

    To debias, we
    1) fit a linear model to predict the treatment from the confounders and take the residuals from this fit
    (debias step)
    2) fit a linear model to predict the outcome from the confounders and take the residuals from this fit
    (denoise step).
    We then add back the average outcome and treatment so that their levels remain unchanged.

    Returns a dataframe with the debiased columns with suffix appended to the name.

    Parameters
    ----------
    df : Pandas DataFrame
        A Pandas' DataFrame with treatment, outcome and confounder columns.

    treatment_column : str
        The name of the column in `df` with the treatment.

    outcome_column : str
        The name of the column in `df` with the outcome.

    confounder_columns : list of str
        A list of confounders present in `df`.

    suffix : str
        A suffix to append to the returning debiased column names.

    denoise : bool (Default=True)
        If it should denoise the outcome using the confounders or not.

    Returns
    -------
    debiased_df : Pandas DataFrame
        The original `df` dataframe with debiased columns added.
    """
    # A single multi-output linear fit handles both the treatment and
    # (optionally) the outcome in one pass.
    model = LinearRegression()
    cols_to_debias = [treatment_column, outcome_column] if denoise else [treatment_column]
    model.fit(df[confounder_columns], df[cols_to_debias])
    # Residuals, plus each column's original mean so levels are preserved.
    debiased = (df[cols_to_debias] - model.predict(df[confounder_columns]) + df[cols_to_debias].mean())
    return df.assign(**{c + suffix: debiased[c] for c in cols_to_debias})
@curry
def debias_with_fixed_effects(df: pd.DataFrame,
                              treatment_column: str,
                              outcome_column: str,
                              confounder_columns: List[str],
                              suffix: str = "_debiased",
                              denoise: bool = True) -> pd.DataFrame:
    """
    Returns a dataframe with the debiased columns with suffix appended to the name.

    This is equivalent of debiasing with regression where the formula is "C(x1) + C(x2) + ...".
    However, it is much more efficient than running such a dummy variable regression.

    Parameters
    ----------
    df : Pandas DataFrame
        A Pandas' DataFrame with treatment, outcome and confounder columns.

    treatment_column : str
        The name of the column in `df` with the treatment.

    outcome_column : str
        The name of the column in `df` with the outcome.

    confounder_columns : list of str
        Confounders are categorical groups we wish to explain away. Some examples are units (ex: customers),
        and time (day, months...). We perform a group by on these columns, so they should not be continuous
        variables.

    suffix : str
        A suffix to append to the returning debiased column names.

    denoise : bool (Default=True)
        If it should denoise the outcome using the confounders or not.

    Returns
    -------
    debiased_df : Pandas DataFrame
        The original `df` dataframe with debiased columns added.
    """
    cols_to_debias = [treatment_column, outcome_column] if denoise else [treatment_column]

    def debias_column(c: str) -> dict:
        # Subtract each group's mean (the within transformation), then add the
        # grand mean back. With two confounders this is the classic two-way
        # transform y - y_bar_i - y_bar_t + y_bar.
        # NOTE(review): the grand mean is added back once regardless of how
        # many confounder groups there are, so column levels are only exactly
        # preserved for a single confounder — confirm this is intended.
        mu = sum(df.groupby(x)[c].transform("mean") for x in confounder_columns)
        return {c + suffix: df[c] - mu + df[c].mean()}

    return df.assign(**merge(*[debias_column(c) for c in cols_to_debias]))
@curry
def debias_with_double_ml(df: pd.DataFrame,
                          treatment_column: str,
                          outcome_column: str,
                          confounder_columns: List[str],
                          ml_regressor: RegressorMixin = GradientBoostingRegressor,
                          extra_params: Dict[str, Any] = None,
                          cv: int = 5,
                          suffix: str = "_debiased",
                          denoise: bool = True,
                          seed: int = 123) -> pd.DataFrame:
    """
    Frisch-Waugh-Lovell style debiasing with ML model.

    To debias, we
    1) fit a regression ml model to predict the treatment from the confounders and take out of fold residuals from
    this fit (debias step)
    2) fit a regression ml model to predict the outcome from the confounders and take the out of fold residuals from
    this fit (denoise step).
    We then add back the average outcome and treatment so that their levels remain unchanged.

    Returns a dataframe with the debiased columns with suffix appended to the name.

    Parameters
    ----------
    df : Pandas DataFrame
        A Pandas' DataFrame with treatment, outcome and confounder columns.

    treatment_column : str
        The name of the column in `df` with the treatment.

    outcome_column : str
        The name of the column in `df` with the outcome.

    confounder_columns : list of str
        A list of confounders present in `df`.

    ml_regressor : Sklearn's RegressorMixin
        A regressor *class* (not a fitted instance) that implements fit and
        predict; it is instantiated here with `extra_params`.

    extra_params : dict
        The hyper-parameters for the model.

    cv : int
        The number of folds to cross predict.

    suffix : str
        A suffix to append to the returning debiased column names.

    denoise : bool (Default=True)
        If it should denoise the outcome using the confounders or not.

    seed : int
        A seed for consistency in random computation.

    Returns
    -------
    debiased_df : Pandas DataFrame
        The original `df` dataframe with debiased columns added.
    """
    params = extra_params if extra_params else {}
    cols_to_debias = [treatment_column, outcome_column] if denoise else [treatment_column]
    # Global seed so fold shuffling and any model randomness are reproducible.
    # NOTE: this mutates numpy's global RNG state as a side effect.
    np.random.seed(seed)

    def get_cv_resid(col_to_debias: str) -> np.ndarray:
        # Fresh estimator per column; out-of-fold predictions avoid the
        # overfitting bias of in-sample residuals (the "double ML" trick).
        model = ml_regressor(**params)
        cv_pred = cross_val_predict(estimator=model, X=df[confounder_columns], y=df[col_to_debias], cv=cv)
        # Residuals, plus the column mean so its level is preserved.
        return df[col_to_debias] - cv_pred + df[col_to_debias].mean()

    return df.assign(**{c + suffix: get_cv_resid(c) for c in cols_to_debias})