Source code for fklearn.preprocessing.splitting

from typing import Optional, Tuple

import numpy as np
from numpy.random import RandomState
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from toolz import curry

from fklearn.types import DateType


@curry
def time_split_dataset(dataset: pd.DataFrame,
                       train_start_date: DateType,
                       train_end_date: DateType,
                       holdout_end_date: DateType,
                       time_column: str,
                       holdout_start_date: Optional[DateType] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Splits temporal data into a training and a holdout dataset using two
    half-open time windows over `time_column`.

    The training window is ``[train_start_date, train_end_date)`` and the
    holdout window is ``[holdout_start_date, holdout_end_date)``. The relative
    order of the boundaries is not validated: the training data only comes
    strictly before the holdout data when `holdout_start_date` is not earlier
    than `train_end_date` (which is always the case when it is left as `None`).

    Parameters
    ----------
    dataset : pandas.DataFrame
        A Pandas' DataFrame containing the column named by `time_column`.

    train_start_date : str
        The (inclusive) starting time of the training data.
        It must be comparable with the values of `time_column`.

    train_end_date : str
        The (exclusive) ending time of the training data.
        This will also be used as the start date of the holdout period if no
        `holdout_start_date` is given.
        It must be comparable with the values of `time_column`.

    holdout_end_date : str
        The (exclusive) ending time of the holdout data.
        It must be comparable with the values of `time_column`.

    time_column : str
        The name of the Date column of `dataset`.

    holdout_start_date : str, optional (default=None)
        The (inclusive) starting time of the holdout data.
        If `None` is given it will be equal to `train_end_date`.
        It must be comparable with the values of `time_column`.

    Returns
    -------
    train_set : pandas.DataFrame
        The rows of `dataset` that fall inside the training window.

    test_set : pandas.DataFrame
        The rows of `dataset` that fall inside the holdout window.
    """
    # Fall back only when the argument was omitted. A truthiness test would
    # also discard valid-but-falsy boundaries such as 0 on an integer time axis.
    if holdout_start_date is None:
        holdout_start_date = train_end_date

    time_values = dataset[time_column]

    train_set = dataset[(time_values >= train_start_date) & (time_values < train_end_date)]
    test_set = dataset[(time_values >= holdout_start_date) & (time_values < holdout_end_date)]

    return train_set, test_set
@curry
def space_time_split_dataset(dataset: pd.DataFrame,
                             train_start_date: DateType,
                             train_end_date: DateType,
                             holdout_end_date: DateType,
                             split_seed: int,
                             space_holdout_percentage: float,
                             space_column: str,
                             time_column: str,
                             holdout_space: Optional[np.ndarray] = None,
                             holdout_start_date: Optional[DateType] = None) -> Tuple[pd.DataFrame, ...]:
    """
    Splits panel data using both ID and Time columns, resulting in four datasets

    1. A training set;
    2. An in training time, but out of sample id hold out dataset;
    3. An out of training time, but in sample id hold out dataset;
    4. An out of training time and out of sample id hold out dataset.

    The training window is ``[train_start_date, train_end_date)`` and the
    holdout window is ``[holdout_start_date, holdout_end_date)``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        A Pandas' DataFrame with an Identifier Column and a Date Column.

    train_start_date : str
        The (inclusive) starting time of the training data.
        It must be comparable with the values of `time_column`.

    train_end_date : str
        The (exclusive) ending time of the training data.
        This will also be used as the start date of the holdout period if no
        `holdout_start_date` is given.
        It must be comparable with the values of `time_column`.

    holdout_end_date : str
        The (exclusive) ending time of the holdout data.
        It must be comparable with the values of `time_column`.

    split_seed : int
        A seed used by the random number generator. Only used when
        `holdout_space` is not given.

    space_holdout_percentage : float
        The fraction of the distinct IDs seen in the training window that is
        randomly held out (the count is truncated to an integer). Only used
        when `holdout_space` is not given.

    space_column : str
        The name of the Identifier column of `dataset`.

    time_column : str
        The name of the Date column of `dataset`.

    holdout_space : np.array, optional (default=None)
        An array containing the hold out IDs. If not specified,
        a random subset of IDs will be selected for holdout.

    holdout_start_date : str, optional (default=None)
        The (inclusive) starting time of the holdout data.
        If `None` is given it will be equal to `train_end_date`.
        It must be comparable with the values of `time_column`.

    Returns
    -------
    train_set : pandas.DataFrame
        The in ID sample and in time training set.

    intime_outspace_hdout : pandas.DataFrame
        The out of ID sample and in time hold out set.

    outtime_inspace_hdout : pandas.DataFrame
        The in ID sample and out of time hold out set.

    outtime_outspace_hdout : pandas.DataFrame
        The out of ID sample and out of time hold out set. IDs that never
        appear in the training window are also counted as out of sample.
    """
    # Fall back only when the argument was omitted. A truthiness test would
    # also discard valid-but-falsy boundaries such as 0 on an integer time axis.
    if holdout_start_date is None:
        holdout_start_date = train_end_date

    time_values = dataset[time_column]
    in_time_mask = (time_values >= train_start_date) & (time_values < train_end_date)
    out_time_mask = (time_values >= holdout_start_date) & (time_values < holdout_end_date)

    # Only IDs observed during the training window are candidates for "in space".
    all_space_in_time = dataset.loc[in_time_mask, space_column].unique()

    if holdout_space is None:
        # for repeatability: seed the generator and sample from a sorted array so
        # the draw does not depend on the row order of `dataset`
        state = RandomState(split_seed)
        train_period_space = np.sort(all_space_in_time)

        # randomly sample accounts from the train period to hold out
        partial_holdout_space = state.choice(train_period_space,
                                             int(space_holdout_percentage * len(train_period_space)),
                                             replace=False)

        in_space = pd.Index(all_space_in_time).difference(pd.Index(partial_holdout_space)).values
    else:
        in_space = pd.Index(all_space_in_time).difference(pd.Index(holdout_space)).values

    in_space_mask = dataset[space_column].isin(in_space)

    train_set = dataset[in_space_mask & in_time_mask]
    intime_outspace_hdout = dataset[~in_space_mask & in_time_mask]
    outtime_outspace_hdout = dataset[~in_space_mask & out_time_mask]
    outtime_inspace_hdout = dataset[in_space_mask & out_time_mask]

    return train_set, intime_outspace_hdout, outtime_inspace_hdout, outtime_outspace_hdout
@curry
def stratified_split_dataset(dataset: pd.DataFrame,
                             target_column: str,
                             test_size: float,
                             random_state: Optional[int] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Splits data into a training and a testing dataset whose class ratios
    follow the class ratio of `target_column` in the original dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        A Pandas' DataFrame with the target column.

    target_column : str
        The name of the target column of `dataset`; its values drive the
        stratification.

    test_size : float
        The proportion of the dataset to include in the test split.
        Should be between 0.0 and 1.0.

    random_state : int or None, optional (default=None)
        Forwarded unchanged to `StratifiedShuffleSplit`.
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    train_set : pandas.DataFrame
        The train dataset sampled from the full dataset.

    test_set : pandas.DataFrame
        The test dataset sampled from the full dataset.
    """
    labels = dataset[target_column]

    # The splitter is driven by the labels; a zero array of matching length
    # stands in for the feature matrix it expects as first argument.
    dummy_features = np.zeros(len(dataset))

    shuffle_split = StratifiedShuffleSplit(n_splits=1,
                                           test_size=test_size,
                                           random_state=random_state)

    # n_splits=1, so the first (and only) pair of positional index arrays is taken.
    train_positions, test_positions = next(shuffle_split.split(dummy_features, labels))

    return dataset.iloc[train_positions], dataset.iloc[test_positions]