# Source code for fklearn.data.datasets

from typing import Tuple

import numpy as np
from numpy import nan
import pandas as pd


def make_tutorial_data(n: int) -> pd.DataFrame:
    """
    Generates fake data for a tutorial. There are two numerical features
    ("feature1" and "feature2"), one categorical feature ("feature3"), an
    "id" column, a "date" column and a numerical "target".

    The target is the sum of "feature1", "feature2", an offset that depends on
    the category in "feature3" (0 for "a", 30 for "b", 10 for "c") and gaussian
    noise. After the target is computed, missing values are inserted into
    "feature1" and "feature3", so the target itself never has missing values.

    The random seed is fixed, so the same `n` always yields the same dataset.

    Parameters
    ----------
    n : int
        The number of samples to generate

    Returns
    -------
    df : pd.DataFrame
        A tutorial dataset with the columns "id", "date", "feature1",
        "feature2", "feature3" and "target"
    """
    np.random.seed(1111)

    # NOTE: the order of the random draws below determines the generated
    # values for the fixed seed; do not reorder them.
    dataset = pd.DataFrame({
        "id": ["id%d" % i for i in np.random.randint(0, 100, n)],
        "date": np.random.choice(pd.date_range("2015-01-01", periods=100), n),
        "feature1": np.random.gamma(20, size=n),
        "feature2": np.random.normal(40, size=n),
        "feature3": np.random.choice(["a", "b", "c"], size=n)})

    # contribution of each category to the target
    category_offset = {"a": 0, "b": 30, "c": 10}

    dataset["target"] = (dataset["feature1"]
                         + dataset["feature2"]
                         + dataset["feature3"].map(category_offset)
                         + np.random.normal(0, 5, size=n))

    # insert some NANs: 100 row positions are drawn (with replacement) per
    # column, so at most 100 rows of each column become missing.
    # randint(0, 0, ...) is invalid, so skip this step for an empty dataset.
    if n > 0:
        dataset.loc[np.random.randint(0, n, 100), "feature1"] = nan
        dataset.loc[np.random.randint(0, n, 100), "feature3"] = nan

    return dataset


def make_confounded_data(n: int) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Generates fake data for counterfactual experimentation. The covariates are
    sex, age and severity, the treatment is a binary variable, medication, and
    the response is days until recovery.

    The random seed is fixed, so the same `n` always yields the same datasets.
    All three dataframes share the same "sex", "age" and "severity" columns and
    differ only in "medication" and "recovery".

    Parameters
    ----------
    n : int
        The number of samples to generate

    Returns
    -------
    df_rnd : pd.DataFrame
        A dataframe where the treatment is randomly assigned.

    df_obs : pd.DataFrame
        A dataframe with confounding.

    df_ctf : pd.DataFrame
        A counterfactual dataframe with confounding. Same as df_obs, but
        with the treatment flipped.
    """

    def get_severity(df: pd.DataFrame) -> pd.Series:
        # Mixture of two beta distributions selected by age: rows with
        # age < 30 draw from Beta(1, 3), the others from Beta(3, 1.5).
        # Both draws are made for every row; the boolean masks zero out the
        # one that does not apply.
        n_rows = df.shape[0]
        under_30 = np.random.beta(1, 3, size=n_rows) * (df["age"] < 30)
        over_30 = np.random.beta(3, 1.5, size=n_rows) * (df["age"] >= 30)
        return under_30 + over_30

    def get_treatment(df: pd.DataFrame) -> pd.Series:
        # Treatment depends on sex and severity (plus noise), which is what
        # confounds it with the recovery time. Returns 0.0 / 1.0 floats.
        score = (.33 * df["sex"]
                 + 1.5 * df["severity"]
                 + 0.15 * np.random.normal(size=df.shape[0]))
        return (score > 0.8).astype(float)

    def get_recovery(df: pd.DataFrame) -> np.ndarray:
        # Poisson-distributed recovery time with a log-linear rate;
        # medication lowers the rate.
        log_rate = (2
                    + 0.5 * df["sex"]
                    + 0.03 * df["age"]
                    + df["severity"]
                    - df["medication"])
        return np.random.poisson(np.exp(log_rate))

    # NOTE: the order of the random draws below determines the generated
    # values for the fixed seed; do not reorder them.
    np.random.seed(1111)
    sexes = np.random.randint(0, 2, size=n)
    ages = np.random.gamma(8, scale=4, size=n)
    meds = np.random.randint(0, 2, size=n)

    # random data
    df_rnd = pd.DataFrame(dict(sex=sexes, age=ages, medication=meds))
    df_rnd['severity'] = get_severity(df_rnd)
    df_rnd['recovery'] = get_recovery(df_rnd)

    features = ['sex', 'age', 'severity', 'medication', 'recovery']
    df_rnd = df_rnd[features]  # to enforce column order

    # obs data
    df_obs = df_rnd.copy()
    df_obs['medication'] = get_treatment(df_obs)
    df_obs['recovery'] = get_recovery(df_obs)

    # counterfactual data: same covariates, treatment flipped (0.0 <-> 1.0)
    df_ctf = df_obs.copy()
    df_ctf['medication'] = 1.0 - df_ctf['medication']
    df_ctf['recovery'] = get_recovery(df_ctf)

    return df_rnd, df_obs, df_ctf