Source code for fklearn.preprocessing.rebalancing

import pandas as pd
from toolz import curry, partial


@curry
def rebalance_by_categorical(dataset: pd.DataFrame,
                             categ_column: str,
                             max_lines_by_categ: int = None,
                             seed: int = 1) -> pd.DataFrame:
    """
    Resample dataset so that the result contains the same number of lines per
    category in categ_column.

    Parameters
    ----------
    dataset: pandas.DataFrame
        A Pandas' DataFrame with a categ_column column

    categ_column: str
        The name of the categorical column

    max_lines_by_categ: int (default None)
        The maximum number of lines by category. If None (or 0) it will be set
        to the number of lines for the smallest non-empty category. A category
        with fewer lines than this maximum is kept in full instead of raising.

    seed: int (default 1)
        Random state for consistency. The same seed is used to sample every
        category.

    Returns
    ----------
    rebalanced_dataset : pandas.DataFrame
        A dataset with at most max_lines_by_categ lines per category in
        categ_column. Categories appear in the order given by
        ``value_counts`` (most frequent first). Lines whose category is
        missing (NaN) are not counted by ``value_counts`` and therefore are
        not part of the result. If there is no category with at least one
        line, an empty slice of dataset is returned.
    """
    counts = dataset[categ_column].value_counts()
    # A categorical column reports its unobserved levels with a count of 0.
    # Keeping them would make the "smallest category" size 0 and silently
    # produce an empty result, so only observed categories are considered.
    categs = counts[counts > 0].to_dict()

    if not categs:
        # Nothing to sample from; pd.concat would raise on an empty list.
        return dataset.iloc[0:0]

    # Falsy check kept on purpose: 0 has always behaved like None here.
    lines_by_categ = max_lines_by_categ if max_lines_by_categ else min(categs.values())

    samples = [
        dataset
        .loc[dataset[categ_column] == categ, :]
        # Cap at the category size: sampling without replacement cannot
        # return more lines than the category holds.
        .sample(min(lines_by_categ, categ_size), random_state=seed)
        for categ, categ_size in categs.items()
    ]
    return pd.concat(samples)
@curry
def rebalance_by_continuous(dataset: pd.DataFrame,
                            continuous_column: str,
                            buckets: int,
                            max_lines_by_categ: int = None,
                            by_quantile: bool = False,
                            seed: int = 1) -> pd.DataFrame:
    """
    Resample dataset so that the result contains the same number of lines per
    bucket in a continuous column.

    Parameters
    ----------
    dataset: pandas.DataFrame
        A Pandas' DataFrame with a continuous_column column

    continuous_column: str
        The name of the continuous column

    buckets: int
        The number of buckets to split the continuous column into

    max_lines_by_categ: int (default None)
        The maximum number of lines by bucket. It is forwarded unchanged to
        rebalance_by_categorical, which picks the size of the smallest bucket
        when it is None.

    by_quantile: bool (default False)
        If True, uses pd.qcut instead of pd.cut to get the buckets from the
        continuous column

    seed: int (default 1)
        Random state for consistency.

    Returns
    ----------
    rebalanced_dataset : pandas.DataFrame
        A dataset with fewer lines than dataset, but with the same number of
        lines per bucket of continuous_column. The temporary bucket column is
        removed before returning, so the columns are those of dataset.
    """
    values = dataset[continuous_column]
    if by_quantile:
        # duplicates="drop" merges repeated quantile edges instead of raising.
        bins = pd.qcut(values, q=buckets, duplicates="drop")
    else:
        bins = pd.cut(values, bins=buckets)

    # Equal-width buckets can be empty. They would remain as unobserved
    # categories with a count of 0 and drag the per-bucket sample size to 0.
    bins = bins.cat.remove_unused_categories()

    # Pick a helper column name that cannot overwrite (and later drop) a
    # column that already exists in the caller's dataset.
    bins_column = "bins"
    while bins_column in dataset.columns:
        bins_column = "_" + bins_column

    rebalanced = rebalance_by_categorical(dataset.assign(**{bins_column: bins}),
                                          categ_column=bins_column,
                                          max_lines_by_categ=max_lines_by_categ,
                                          seed=seed)
    return rebalanced.drop(columns=[bins_column])