Source code for fklearn.causal.cate_learning.double_machine_learning

from typing import List, Tuple

import numpy as np
import pandas as pd
from sklearn import __version__ as sk_version
from sklearn.base import RegressorMixin
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.base import clone
from toolz import curry
from typing import Union

from fklearn.common_docstrings import learner_pred_fn_docstring, learner_return_docstring
from fklearn.training.utils import log_learner_time, expand_features_encoded
from fklearn.types import LearnerReturnType


def _cv_estimate(model: RegressorMixin,
                 train_data: pd.DataFrame,
                 features: List[str],
                 y: str,
                 n_splits: int) -> Tuple[pd.Series, List[RegressorMixin]]:
    """
    Computes out-of-fold predictions of column `y` from `features` using K-fold cross-fitting.

    For every fold, a fresh clone of `model` is fitted on the training rows of that fold and used to predict the
    held-out rows, so each row of `train_data` is predicted by a model that was not fitted on it.

    Parameters
    ----------

    model : RegressorMixin
        Template estimator. It is cloned (with ``safe=False``) once per fold; the clones are the ones fitted.

    train_data : pandas.DataFrame
        Data holding both the feature columns and the `y` column.

    features : list of str
        Names of the columns in `train_data` passed to the model as predictors.

    y : str
        Name of the column in `train_data` used as the regression target.

    n_splits : int
        Number of folds handed to ``KFold``.

    Returns
    ----------

    (predictions, models) : tuple
        `predictions` is a pandas Series sharing the index of `train_data` with the out-of-fold predictions;
        `models` is the list of per-fold fitted models, in fold order.
    """
    splitter = KFold(n_splits=n_splits)
    fitted_models = []
    # Start with NaN everywhere; each fold fills in the positions of its held-out rows.
    oof_predictions = pd.Series(np.nan, index=train_data.index)

    for fit_rows, holdout_rows in splitter.split(train_data):
        fit_x = train_data[features].iloc[fit_rows]
        fit_y = train_data[y].iloc[fit_rows]
        fold_model = clone(model, safe=False).fit(fit_x, fit_y)

        holdout_x = train_data[features].iloc[holdout_rows]
        oof_predictions.iloc[holdout_rows] = fold_model.predict(holdout_x)

        fitted_models.append(fold_model)

    return oof_predictions, fitted_models


@curry
@log_learner_time(learner_name='non_parametric_double_ml_learner')
def non_parametric_double_ml_learner(df: pd.DataFrame,
                                     feature_columns: List[str],
                                     treatment_column: str,
                                     outcome_column: str,
                                     debias_model: Union[RegressorMixin, None] = None,
                                     debias_feature_columns: Union[List[str], None] = None,
                                     denoise_model: Union[RegressorMixin, None] = None,
                                     denoise_feature_columns: Union[List[str], None] = None,
                                     final_model: Union[RegressorMixin, None] = None,
                                     final_model_feature_columns: Union[List[str], None] = None,
                                     prediction_column: str = "prediction",
                                     cv_splits: int = 2,
                                     encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits a Non-Parametric Double/ML Meta Learner for Conditional Average Treatment Effect Estimation. It implements the
    following steps:
    1) fits k instances of the debias model to predict the treatment from the features and get out-of-fold residuals
        t_res=t-t_hat;
    2) fits k instances of the denoise model to predict the outcome from the features and get out-of-fold residuals
        y_res=y-y_hat;
    3) fits a final ML model to predict y_res / t_res from the features using weighted regression with weights set to
        t_res^2. Trained like this, the final model will output treatment effect predictions. Rows whose treatment
        residual is exactly zero get weight zero; their target is set to 0 so that the division does not produce
        non-finite values.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame with features, treatment and outcome columns.

    feature_columns : list of str
        A list of column names that are used as features for the denoise, debias and final models in double-ml. All
        these names should be in `df`.

    treatment_column : str
        The name of the column in `df` that should be used as treatment for the double-ml model. It will learn the
        impact of this column with respect to the outcome column.

    outcome_column : str
        The name of the column in `df` that should be used as outcome for the double-ml model. It will learn the impact
        of the treatment column on this outcome column.

    debias_model : RegressorMixin (default None)
        The estimator for fitting the treatment from the features. Must implement fit and predict methods. It can be a
        scikit-learn regressor. When None, defaults to GradientBoostingRegressor.

    debias_feature_columns : list of str (default None)
        A list of column names to be used only for the debias model. If not None, it will replace feature_columns when
        fitting the debias model.

    denoise_model : RegressorMixin (default None)
        The estimator for fitting the outcome from the features. Must implement fit and predict methods. It can be a
        scikit-learn regressor. When None, defaults to GradientBoostingRegressor.

    denoise_feature_columns : list of str (default None)
        A list of column names to be used only for the denoise model. If not None, it will replace feature_columns when
        fitting the denoise model.

    final_model : RegressorMixin (default None)
        The estimator for the final stage, which regresses y_res / t_res on the features. Must implement fit and
        predict methods. It can be an arbitrary scikit-learn regressor. The fit method must accept sample_weight as a
        keyword argument. When None, defaults to GradientBoostingRegressor.

    final_model_feature_columns : list of str (default None)
        A list of column names to be used only for the final model. If not None, it will replace feature_columns when
        fitting the final model.

    prediction_column : str (default "prediction")
        The name of the column with the treatment effect predictions from the final model.

    cv_splits : int (default 2)
        Number of folds to split the training data when fitting the debias and denoise models

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns. The
        expansion is applied to `feature_columns` only; model-specific feature lists are used exactly as given.
    """

    features = feature_columns if not encode_extra_cols else expand_features_encoded(df, feature_columns)

    # Work on clones so the estimators supplied by the caller are never fitted or mutated.
    debias_model = GradientBoostingRegressor() if debias_model is None else clone(debias_model, safe=False)
    denoise_model = GradientBoostingRegressor() if denoise_model is None else clone(denoise_model, safe=False)
    final_model = GradientBoostingRegressor() if final_model is None else clone(final_model, safe=False)

    # Out-of-fold predictions of the treatment (debias) and of the outcome (denoise).
    t_hat, mts = _cv_estimate(debias_model, df,
                              features if debias_feature_columns is None else debias_feature_columns,
                              treatment_column, cv_splits)
    y_hat, mys = _cv_estimate(denoise_model, df,
                              features if denoise_feature_columns is None else denoise_feature_columns,
                              outcome_column, cv_splits)

    y_res = df[outcome_column] - y_hat
    t_res = df[treatment_column] - t_hat

    # A treatment residual of exactly zero would turn the ratio into inf/NaN. Such rows have weight
    # t_res ** 2 == 0 anyway, so they receive a finite placeholder target (0.0) instead; every other
    # row keeps the exact y_res / t_res value.
    zero_t_res = t_res == 0
    final_target = (y_res / t_res.mask(zero_t_res, 1.0)).mask(zero_t_res, 0.0)
    weights = t_res ** 2
    final_model_x = features if final_model_feature_columns is None else final_model_feature_columns

    model_final_fitted = final_model.fit(X=df[final_model_x],
                                         y=final_target,
                                         sample_weight=weights)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        # Predict from a DataFrame, the same input type used in fit, so column names stay attached.
        return new_df.assign(**{prediction_column: model_final_fitted.predict(new_df[final_model_x])})

    p.__doc__ = learner_pred_fn_docstring("non_parametric_double_ml_learner")

    log = {'non_parametric_double_ml_learner': {
        'features': feature_columns,
        'debias_feature_columns': debias_feature_columns,
        'denoise_feature_columns': denoise_feature_columns,
        'final_model_feature_columns': final_model_feature_columns,
        'outcome_column': outcome_column,
        'treatment_column': treatment_column,
        'prediction_column': prediction_column,
        'package': "sklearn",
        'package_version': sk_version,
        'feature_importance': None,
        'training_samples': len(df)},
        'debias_models': mts,
        'denoise_models': mys,
        'cv_splits': cv_splits,
        'object': model_final_fitted}

    return p, p(df), log


# Extend the learner's docstring with the text generated by `learner_return_docstring` for this learner
# (presumably the shared description of the values every learner returns — confirm in fklearn.common_docstrings).
non_parametric_double_ml_learner.__doc__ += learner_return_docstring("Non Parametric Double/ML")