from typing import List
import numpy as np
import pandas as pd
from sklearn.base import RegressorMixin
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict
from statsmodels.formula.api import ols
from toolz import curry, merge
from typing import Dict, Any
@curry
def debias_with_regression(df: pd.DataFrame,
                           treatment_column: str,
                           outcome_column: str,
                           confounder_columns: List[str],
                           suffix: str = "_debiased",
                           denoise: bool = True) -> pd.DataFrame:
    """
    Frisch-Waugh-Lovell style debiasing with linear regression.

    To debias, we
    1) fit a linear model to predict the treatment from the confounders and take the residuals from this fit
    (debias step)
    2) fit a linear model to predict the outcome from the confounders and take the residuals from this fit
    (denoise step).
    We then add back the average outcome and treatment so that their levels remain unchanged.

    Returns a dataframe with the debiased columns with suffix appended to the name.

    Parameters
    ----------
    df : Pandas DataFrame
        A Pandas' DataFrame with treatment, outcome and confounder columns.

    treatment_column : str
        The name of the column in `df` with the treatment.

    outcome_column : str
        The name of the column in `df` with the outcome.

    confounder_columns : list of str
        A list of confounders present in `df`.

    suffix : str
        A suffix to append to the returning debiased column names.

    denoise : bool (Default=True)
        If it should denoise the outcome using the confounders or not.

    Returns
    -------
    debiased_df : Pandas DataFrame
        The original `df` dataframe with debiased columns added.
    """
    # A single multi-output linear fit handles both the treatment and
    # (optionally) the outcome in one pass.
    model = LinearRegression()
    cols_to_debias = [treatment_column, outcome_column] if denoise else [treatment_column]
    model.fit(df[confounder_columns], df[cols_to_debias])
    # Residuals, plus each column's original mean so levels are preserved.
    debiased = (df[cols_to_debias] - model.predict(df[confounder_columns]) + df[cols_to_debias].mean())
    return df.assign(**{c + suffix: debiased[c] for c in cols_to_debias})
@curry
def debias_with_fixed_effects(df: pd.DataFrame,
                              treatment_column: str,
                              outcome_column: str,
                              confounder_columns: List[str],
                              suffix: str = "_debiased",
                              denoise: bool = True) -> pd.DataFrame:
    """
    Returns a dataframe with the debiased columns with suffix appended to the name.

    This is equivalent of debiasing with regression where the formula is "C(x1) + C(x2) + ...".
    However, it is much more efficient than running such a dummy variable regression.

    Parameters
    ----------
    df : Pandas DataFrame
        A Pandas' DataFrame with treatment, outcome and confounder columns.

    treatment_column : str
        The name of the column in `df` with the treatment.

    outcome_column : str
        The name of the column in `df` with the outcome.

    confounder_columns : list of str
        Confounders are categorical groups we wish to explain away. Some examples are units (ex: customers),
        and time (day, months...). We perform a group by on these columns, so they should not be continuous
        variables.

    suffix : str
        A suffix to append to the returning debiased column names.

    denoise : bool (Default=True)
        If it should denoise the outcome using the confounders or not.

    Returns
    -------
    debiased_df : Pandas DataFrame
        The original `df` dataframe with debiased columns added.
    """
    cols_to_debias = [treatment_column, outcome_column] if denoise else [treatment_column]

    def debias_column(c: str) -> dict:
        # Subtract each group's mean (the within transformation), then add the
        # grand mean back. With two confounders this is the classic two-way
        # transform y - y_bar_i - y_bar_t + y_bar.
        # NOTE(review): the grand mean is added back once regardless of how
        # many confounder groups there are, so column levels are only exactly
        # preserved for a single confounder — confirm this is intended.
        mu = sum(df.groupby(x)[c].transform("mean") for x in confounder_columns)
        return {c + suffix: df[c] - mu + df[c].mean()}

    return df.assign(**merge(*[debias_column(c) for c in cols_to_debias]))
@curry
def debias_with_double_ml(df: pd.DataFrame,
                          treatment_column: str,
                          outcome_column: str,
                          confounder_columns: List[str],
                          ml_regressor: RegressorMixin = GradientBoostingRegressor,
                          extra_params: Dict[str, Any] = None,
                          cv: int = 5,
                          suffix: str = "_debiased",
                          denoise: bool = True,
                          seed: int = 123) -> pd.DataFrame:
    """
    Frisch-Waugh-Lovell style debiasing with ML model.

    To debias, we
    1) fit a regression ml model to predict the treatment from the confounders and take out of fold residuals from
    this fit (debias step)
    2) fit a regression ml model to predict the outcome from the confounders and take the out of fold residuals from
    this fit (denoise step).
    We then add back the average outcome and treatment so that their levels remain unchanged.

    Returns a dataframe with the debiased columns with suffix appended to the name.

    Parameters
    ----------
    df : Pandas DataFrame
        A Pandas' DataFrame with treatment, outcome and confounder columns.

    treatment_column : str
        The name of the column in `df` with the treatment.

    outcome_column : str
        The name of the column in `df` with the outcome.

    confounder_columns : list of str
        A list of confounders present in `df`.

    ml_regressor : Sklearn's RegressorMixin
        A regressor *class* (not a fitted instance) that implements fit and
        predict; it is instantiated here with `extra_params`.

    extra_params : dict
        The hyper-parameters for the model.

    cv : int
        The number of folds to cross predict.

    suffix : str
        A suffix to append to the returning debiased column names.

    denoise : bool (Default=True)
        If it should denoise the outcome using the confounders or not.

    seed : int
        A seed for consistency in random computation.

    Returns
    -------
    debiased_df : Pandas DataFrame
        The original `df` dataframe with debiased columns added.
    """
    params = extra_params if extra_params else {}
    cols_to_debias = [treatment_column, outcome_column] if denoise else [treatment_column]
    # Global seed so fold shuffling and any model randomness are reproducible.
    # NOTE: this mutates numpy's global RNG state as a side effect.
    np.random.seed(seed)

    def get_cv_resid(col_to_debias: str) -> np.ndarray:
        # Fresh estimator per column; out-of-fold predictions avoid the
        # overfitting bias of in-sample residuals (the "double ML" trick).
        model = ml_regressor(**params)
        cv_pred = cross_val_predict(estimator=model, X=df[confounder_columns], y=df[col_to_debias], cv=cv)
        # Residuals, plus the column mean so its level is preserved.
        return df[col_to_debias] - cv_pred + df[col_to_debias].mean()

    return df.assign(**{c + suffix: get_cv_resid(c) for c in cols_to_debias})