from typing import Optional, Tuple
import numpy as np
from numpy.random import RandomState
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from toolz import curry
from fklearn.types import DateType
@curry
def time_split_dataset(dataset: pd.DataFrame,
                       train_start_date: DateType,
                       train_end_date: DateType,
                       holdout_end_date: DateType,
                       time_column: str,
                       holdout_start_date: Optional[DateType] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Splits temporal data into a training and a holdout dataset by
    filtering `time_column` on two half-open date ranges.

    Parameters
    ----------
    dataset : pandas.DataFrame
        A Pandas' DataFrame containing the column named by `time_column`.

    train_start_date : str
        The (inclusive) start of the training period.
        It must be comparable with the values in `time_column`.

    train_end_date : str
        The (exclusive) end of the training period.
        This will also be used as the start date of the holdout period
        if no `holdout_start_date` is given.
        It must be comparable with the values in `time_column`.

    holdout_end_date : str
        The (exclusive) end of the holdout period.
        It must be comparable with the values in `time_column`.

    time_column : str
        The name of the Date column of `dataset`.

    holdout_start_date : str, optional (default=None)
        The (inclusive) start of the holdout period.
        If `None` is given it will be equal to `train_end_date`.
        It must be comparable with the values in `time_column`.

    Returns
    -------
    train_set : pandas.DataFrame
        Rows with `train_start_date <= time_column < train_end_date`.

    test_set : pandas.DataFrame
        Rows with `holdout_start_date <= time_column < holdout_end_date`.
    """
    # Test against None explicitly: a supplied boundary that happens to be
    # falsy (e.g. 0 on an integer time column) must not be silently replaced.
    if holdout_start_date is None:
        holdout_start_date = train_end_date

    time_values = dataset[time_column]

    train_set = dataset[(time_values >= train_start_date) & (time_values < train_end_date)]
    test_set = dataset[(time_values >= holdout_start_date) & (time_values < holdout_end_date)]

    return train_set, test_set
@curry
def space_time_split_dataset(dataset: pd.DataFrame,
                             train_start_date: DateType,
                             train_end_date: DateType,
                             holdout_end_date: DateType,
                             split_seed: int,
                             space_holdout_percentage: float,
                             space_column: str,
                             time_column: str,
                             holdout_space: Optional[np.ndarray] = None,
                             holdout_start_date: Optional[DateType] = None) -> Tuple[pd.DataFrame, ...]:
    """
    Splits panel data using both ID and Time columns, resulting in four datasets

    1. A training set;
    2. An in training time, but out sample id hold out dataset;
    3. An out of training time, but in sample id hold out dataset;
    4. An out of training time and out of sample id hold out dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        A Pandas' DataFrame containing the columns named by `space_column`
        and `time_column`.

    train_start_date : str
        The (inclusive) start of the training period.
        It must be comparable with the values in `time_column`.

    train_end_date : str
        The (exclusive) end of the training period.
        This will also be used as the start date of the holdout period
        if no `holdout_start_date` is given.
        It must be comparable with the values in `time_column`.

    holdout_end_date : str
        The (exclusive) end of the holdout period.
        It must be comparable with the values in `time_column`.

    split_seed : int
        Seed for the random number generator that samples the held out IDs.
        Only used when `holdout_space` is not given.

    space_holdout_percentage : float
        Fraction of the distinct IDs present in the training period that is
        held out (the count is truncated to an integer).
        Only used when `holdout_space` is not given.

    space_column : str
        The name of the Identifier column of `dataset`.

    time_column : str
        The name of the Date column of `dataset`.

    holdout_space : np.array, optional (default=None)
        An array containing the hold out IDs. If not specified,
        a random subset of IDs will be selected for holdout.

    holdout_start_date : str, optional (default=None)
        The (inclusive) start of the holdout period.
        If `None` is given it will be equal to `train_end_date`.
        It must be comparable with the values in `time_column`.

    Returns
    -------
    train_set : pandas.DataFrame
        The in ID sample and in time training set.

    intime_outspace_hdout : pandas.DataFrame
        The out of ID sample and in time hold out set.

    outtime_inspace_hdout : pandas.DataFrame
        The in ID sample and out of time hold out set.

    outtime_outspace_hdout : pandas.DataFrame
        The out of ID sample and out of time hold out set.

    Notes
    -----
    The in-sample IDs are the IDs seen in the training period minus the held
    out ones. Every other ID, including IDs that never appear in the training
    period, ends up in the out of ID sample sets.
    """
    # Test against None explicitly: a supplied boundary that happens to be
    # falsy (e.g. 0 on an integer time column) must not be silently replaced.
    if holdout_start_date is None:
        holdout_start_date = train_end_date

    time_values = dataset[time_column]
    in_time_mask = (time_values >= train_start_date) & (time_values < train_end_date)
    out_time_mask = (time_values >= holdout_start_date) & (time_values < holdout_end_date)

    all_space_in_time = dataset.loc[in_time_mask, space_column].unique()

    if holdout_space is None:
        # Sort before sampling so the draw depends only on the seed and the
        # set of IDs, not on the row order of `dataset`.
        state = RandomState(split_seed)
        train_period_space = np.sort(all_space_in_time)

        # randomly sample IDs from the train period to hold out
        holdout_space = state.choice(train_period_space,
                                     int(space_holdout_percentage * len(train_period_space)),
                                     replace=False)

    in_space = pd.Index(all_space_in_time).difference(pd.Index(holdout_space)).values
    in_space_mask = dataset[space_column].isin(in_space)

    train_set = dataset[in_space_mask & in_time_mask]
    intime_outspace_hdout = dataset[~in_space_mask & in_time_mask]
    outtime_outspace_hdout = dataset[~in_space_mask & out_time_mask]
    outtime_inspace_hdout = dataset[in_space_mask & out_time_mask]

    return train_set, intime_outspace_hdout, outtime_inspace_hdout, outtime_outspace_hdout
@curry
def stratified_split_dataset(dataset: pd.DataFrame, target_column: str, test_size: float,
                             random_state: Optional[int] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Splits data into a training and testing datasets such that
    they maintain the same class ratio of the original dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        A Pandas' DataFrame with the target column.

    target_column : str
        The name of the target column of `dataset`; its values are the
        classes used for stratification.

    test_size : float
        Represent the proportion of the dataset to include in the test split.
        Should be between 0.0 and 1.0.

    random_state : int or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    train_set : pandas.DataFrame
        The train dataset sampled from the full dataset.

    test_set : pandas.DataFrame
        The test dataset sampled from the full dataset.
    """
    # The splitter stratifies on the target alone, so a zero vector of the
    # right length stands in for the feature matrix.
    train_placeholder = np.zeros(len(dataset))
    target = dataset[target_column]

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    # n_splits=1, so the first (and only) split is the one we want.
    train_indices, test_indices = next(splitter.split(train_placeholder, target))

    # The splitter yields positional indices, hence iloc rather than loc.
    train_set = dataset.iloc[train_indices]
    test_set = dataset.iloc[test_indices]

    return train_set, test_set