# Source code for fklearn.preprocessing.splitting

from typing import Tuple

import numpy as np
from numpy.random import RandomState
import pandas as pd
from toolz import curry

from fklearn.types import DateType


@curry
def time_split_dataset(dataset: pd.DataFrame,
                       train_start_date: DateType,
                       train_end_date: DateType,
                       holdout_end_date: DateType,
                       time_column: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split temporal data into a training window followed by a holdout window.

    Every row of the training set is dated strictly before every row of the
    holdout set. Rows dated before `train_start_date` or on/after
    `holdout_end_date` are in neither output.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The data to split. It must contain the column named by `time_column`.

    train_start_date : DateType
        Inclusive lower bound of the training window. It is compared directly
        against the values of `time_column`, so it must be comparable with
        them.

    train_end_date : DateType
        Exclusive upper bound of the training window and, at the same time,
        inclusive lower bound of the holdout window.

    holdout_end_date : DateType
        Exclusive upper bound of the holdout window.

    time_column : str
        The name of the date column of `dataset`.

    Returns
    ----------
    train_set : pandas.DataFrame
        Rows with `train_start_date <= date < train_end_date`.

    test_set : pandas.DataFrame
        Rows with `train_end_date <= date < holdout_end_date`.
    """
    timestamps = dataset[time_column]

    # Both windows are half-open (start inclusive, end exclusive), so a row
    # dated exactly `train_end_date` belongs to the holdout window only.
    in_train_window = (timestamps >= train_start_date) & (timestamps < train_end_date)
    in_holdout_window = (timestamps >= train_end_date) & (timestamps < holdout_end_date)

    return dataset[in_train_window], dataset[in_holdout_window]
@curry
def space_time_split_dataset(dataset: pd.DataFrame,
                             train_start_date: DateType,
                             train_end_date: DateType,
                             holdout_end_date: DateType,
                             split_seed: int,
                             space_holdout_percentage: float,
                             space_column: str,
                             time_column: str,
                             holdout_space: np.ndarray = None) -> Tuple[pd.DataFrame, ...]:
    """
    Split panel data along both its ID ("space") and time dimensions.

    Four datasets are produced:

    1. A training set (in training time, in-sample IDs);
    2. An in training time, but out of sample ID holdout dataset;
    3. An out of training time, but in-sample ID holdout dataset;
    4. An out of training time and out of sample ID holdout dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The data to split. It must contain the columns named by
        `space_column` and `time_column`.

    train_start_date : DateType
        Inclusive lower bound of the training period. It is compared directly
        against the values of `time_column`, so it must be comparable with
        them.

    train_end_date : DateType
        Exclusive upper bound of the training period and, at the same time,
        inclusive lower bound of the holdout period.

    holdout_end_date : DateType
        Exclusive upper bound of the holdout period.

    split_seed : int
        Seed of the random number generator that draws the held-out IDs.
        Only used when `holdout_space` is not given.

    space_holdout_percentage : float
        Fraction of the unique IDs seen in the training period that is held
        out. The resulting count is truncated to an integer. Only used when
        `holdout_space` is not given.

    space_column : str
        The name of the identifier column of `dataset`.

    time_column : str
        The name of the date column of `dataset`.

    holdout_space : np.array
        An array containing the held-out IDs. If not specified, a random
        subset of the IDs seen in the training period is selected.

    Returns
    ----------
    train_set : pandas.DataFrame
        Training-period rows whose ID is not held out.

    intime_outspace_hdout : pandas.DataFrame
        Training-period rows whose ID is held out.

    outime_inspace_hdout : pandas.DataFrame
        Holdout-period rows whose ID is present in `train_set`.

    outime_outspace_hdout : pandas.DataFrame
        Holdout-period rows whose ID is held out.
    """
    timestamps = dataset[time_column]

    # Both periods are half-open: start inclusive, end exclusive.
    train_period = dataset[(timestamps >= train_start_date) & (timestamps < train_end_date)]
    holdout_period = dataset[(timestamps >= train_end_date) & (timestamps < holdout_end_date)]

    if holdout_space is None:
        # Sorting the unique IDs makes the draw depend only on the seed and
        # on which IDs exist, not on the row order of `dataset`.
        train_period_space = np.sort(train_period[space_column].unique())

        # for repeatability
        state = RandomState(split_seed)

        # randomly sample IDs from the train period to hold out
        holdout_space = state.choice(train_period_space,
                                     int(space_holdout_percentage * len(train_period_space)),
                                     replace=False)

    train_is_held_out = train_period[space_column].isin(holdout_space)
    train_set = train_period[~train_is_held_out]
    intime_outspace_hdout = train_period[train_is_held_out]

    # Keep only IDs that actually made it into the training set. Returning the
    # whole holdout period here would leak held-out IDs into the "in-sample ID"
    # dataset and make it overlap with `outime_outspace_hdout`.
    in_sample_space = train_set[space_column].unique()
    outime_inspace_hdout = holdout_period[holdout_period[space_column].isin(in_sample_space)]

    outime_outspace_hdout = holdout_period[holdout_period[space_column].isin(holdout_space)]

    return train_set, intime_outspace_hdout, outime_inspace_hdout, outime_outspace_hdout