Source code for fklearn.training.imputation

from typing import Any, List, Optional

import pandas as pd
from sklearn.impute import SimpleImputer
from toolz import curry, identity

from fklearn.common_docstrings import learner_return_docstring, learner_pred_fn_docstring
from fklearn.types import LearnerReturnType
from fklearn.training.utils import log_learner_time


[docs]@curry @log_learner_time(learner_name='imputer') def imputer(df: pd.DataFrame, columns_to_impute: List[str], impute_strategy: str = 'median', placeholder_value: Optional[Any] = None) -> LearnerReturnType: """ Fits a missing value imputer to the dataset. Parameters ---------- df : pandas.DataFrame A Pandas' DataFrame with columns to impute missing values. It must contain all columns listed in `columns_to_impute` columns_to_impute : List of strings A list of names of the columns for missing value imputation. impute_strategy : String, (default="median") The imputation strategy. - If "mean", then replace missing values using the mean along the axis. - If "median", then replace missing values using the median along the axis. - If "most_frequent", then replace missing using the most frequent value along the axis. placeholder_value : Any, (default=None) if not None, use this as default value when some features only contains NA values on training. For transformation, NA values on those features will be replaced by `fill_value`. """ if placeholder_value is not None: mask_feat_is_na = df[columns_to_impute].isna().all(axis=0) columns_to_fill = mask_feat_is_na[mask_feat_is_na].index.values columns_imputable = mask_feat_is_na[~mask_feat_is_na].index.values fill_fn, _, fill_logs = placeholder_imputer( df, columns_to_impute=columns_to_fill, placeholder_value=placeholder_value) else: columns_to_fill = list() columns_imputable = columns_to_impute fill_fn, _, fill_logs = identity, None, dict() imp = SimpleImputer(strategy=impute_strategy) imp.fit(df[columns_imputable].values) def p(new_data_set: pd.DataFrame) -> pd.DataFrame: new_data = imp.transform(new_data_set[columns_imputable]) new_cols = pd.DataFrame(data=new_data, columns=columns_imputable).to_dict('list') return fill_fn(new_data_set.assign(**new_cols)) p.__doc__ = learner_pred_fn_docstring("imputer") log = { 'imputer': { 'impute_strategy': impute_strategy, 'placeholder_value': placeholder_value, 'columns_to_impute': columns_to_impute, 'columns_to_fill': columns_to_fill, 'columns_imputable': columns_imputable, 'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(), 'statistics': imp.statistics_, 'placeholder_imputer_fn': fill_fn, 'placeholder_imputer_logs': fill_logs, } } return p, p(df), log
imputer.__doc__ += learner_return_docstring("SimpleImputer")
[docs]@curry @log_learner_time(learner_name='placeholder_imputer') def placeholder_imputer(df: pd.DataFrame, columns_to_impute: List[str], placeholder_value: Any = -999) -> LearnerReturnType: """ Fills missing values with a fixed value. Parameters ---------- df : pandas.DataFrame A Pandas' DataFrame with columns to fill missing values. It must contain all columns listed in `columns_to_impute` columns_to_impute : List of strings A list of names of the columns for filling missing value. placeholder_value : Any, (default=-999) The value used to fill in missing values. """ def p(new_data_set: pd.DataFrame) -> pd.DataFrame: new_cols = new_data_set[columns_to_impute].fillna(placeholder_value).to_dict('list') return new_data_set.assign(**new_cols) p.__doc__ = learner_pred_fn_docstring("placeholder_imputer") log = { 'placeholder_imputer': { 'columns_to_impute': columns_to_impute, 'training_proportion_of_nulls': df[columns_to_impute].isnull().mean(axis=0).to_dict(), 'placeholder_value': placeholder_value } } return p, p(df), log
placeholder_imputer.__doc__ += learner_return_docstring("Placeholder SimpleImputer")