# Source code for fklearn.data.datasets

from typing import Tuple

import numpy as np
from numpy import nan
import pandas as pd


def make_tutorial_data(n: int) -> pd.DataFrame:
    """
    Generates fake data for a tutorial.

    The dataset has an identifier column ("id"), a date column ("date"),
    two numerical features ("feature1" and "feature2"), one categorical
    feature ("feature3", with levels "a", "b" and "c") and a numerical
    "target". The target is the sum of the two numerical features, a
    per-category offset (0 for "a", 30 for "b", 10 for "c") and Gaussian
    noise with standard deviation 5.

    After the target is computed, missing values are injected: 100 row
    positions are drawn (with replacement) for "feature1" and another 100
    for "feature3", and those cells are set to NaN. The number of injected
    positions does not scale with ``n``.

    The global NumPy random state is re-seeded with a fixed value on every
    call, so the output is reproducible for a given ``n``.

    Parameters
    ----------
    n : int
        The number of samples to generate

    Returns
    -------
    df : pd.DataFrame
        A tutorial dataset
    """
    np.random.seed(1111)

    # The draws below must stay in this exact order: the generated values
    # depend on how far the seeded random stream has advanced.
    id_numbers = np.random.randint(0, 100, n)
    dates = np.random.choice(pd.date_range("2015-01-01", periods=100), n)
    feature1 = np.random.gamma(20, size=n)
    feature2 = np.random.normal(40, size=n)
    feature3 = np.random.choice(["a", "b", "c"], size=n)

    dataset = pd.DataFrame({
        "id": ["id%d" % x for x in id_numbers],
        "date": dates,
        "feature1": feature1,
        "feature2": feature2,
        "feature3": feature3,
    })

    # Offset contributed by the categorical feature: a -> 0, b -> 30, c -> 10.
    category_offset = np.where(feature3 == "a", 0,
                               np.where(feature3 == "b", 30, 10))

    dataset["target"] = (dataset["feature1"]
                         + dataset["feature2"]
                         + category_offset
                         + np.random.normal(0, 5, size=n))

    # insert some NANs
    # randint(0, 0, ...) is invalid, and an empty frame has no cells to blank.
    if n > 0:
        dataset.loc[np.random.randint(0, n, 100), "feature1"] = nan
        dataset.loc[np.random.randint(0, n, 100), "feature3"] = nan

    return dataset
def make_confounded_data(n: int) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Generates fake data for counterfactual experimentation.

    The covariates are sex, age and severity, the treatment is a binary
    variable, medication, and the response is days until recovery.

    All three returned dataframes share the same sex, age and severity
    columns, and have the columns ordered as
    ``['sex', 'age', 'severity', 'medication', 'recovery']``.

    The global NumPy random state is re-seeded with a fixed value on every
    call, so the output is reproducible for a given ``n``.

    Parameters
    ----------
    n : int
        The number of samples to generate

    Returns
    -------
    df_rnd : pd.DataFrame
        A dataframe where the treatment is randomly assigned.

    df_obs : pd.DataFrame
        A dataframe with confounding.

    df_ctf : pd.DataFrame
        A counter factual dataframe with confounding. Same as df_obs, but
        with the treatment flipped.
    """

    def get_severity(df: pd.DataFrame) -> pd.Series:
        # Mixture of two beta draws selected by age: beta(1, 3) for
        # age < 30 and beta(3, 1.5) for age >= 30.
        n_rows = df.shape[0]
        return ((np.random.beta(1, 3, size=n_rows) * (df["age"] < 30)) +
                (np.random.beta(3, 1.5, size=n_rows) * (df["age"] >= 30)))

    def get_treatment(df: pd.DataFrame) -> pd.Series:
        # Treatment depends on sex and severity, which also drive recovery;
        # this is where the confounding comes from.
        score = (.33 * df["sex"] +
                 1.5 * df["severity"] +
                 0.15 * np.random.normal(size=df.shape[0]))
        return (score > 0.8).astype(float)

    def get_recovery(df: pd.DataFrame) -> np.ndarray:
        # Poisson-distributed recovery time with a log-linear mean;
        # medication enters with a negative coefficient.
        log_mean = (2 + 0.5 * df["sex"] + 0.03 * df["age"]
                    + df["severity"] - df["medication"])
        return np.random.poisson(np.exp(log_mean))

    np.random.seed(1111)

    # The draws below (and the helper calls further down) must stay in this
    # exact order: results depend on the position in the seeded stream.
    sexes = np.random.randint(0, 2, size=n)
    ages = np.random.gamma(8, scale=4, size=n)
    meds = np.random.randint(0, 2, size=n)

    # random data
    df_rnd = pd.DataFrame(dict(sex=sexes, age=ages, medication=meds))
    df_rnd['severity'] = get_severity(df_rnd)
    df_rnd['recovery'] = get_recovery(df_rnd)

    features = ['sex', 'age', 'severity', 'medication', 'recovery']
    df_rnd = df_rnd[features]  # to enforce column order

    # obs data
    df_obs = df_rnd.copy()
    df_obs['medication'] = get_treatment(df_obs)
    df_obs['recovery'] = get_recovery(df_obs)

    # counter_factual data
    df_ctf = df_obs.copy()
    # Flip the binary treatment: 1.0 -> 0.0 and 0.0 -> 1.0.
    df_ctf['medication'] = (df_ctf['medication'] != 1).astype(float)
    df_ctf['recovery'] = get_recovery(df_ctf)

    return df_rnd, df_obs, df_ctf