# Source code for fklearn.preprocessing.schema

import functools
import inspect
import pandas as pd
import toolz

from typing import Any, Callable, Dict, List, Optional, Union
from fklearn.types import LearnerLogType, LearnerReturnType


@toolz.curry
def feature_duplicator(
    df: pd.DataFrame,
    columns_to_duplicate: Optional[List[str]] = None,
    columns_mapping: Optional[Dict[str, str]] = None,
    prefix: Optional[str] = None,
    suffix: Optional[str] = None
) -> LearnerReturnType:
    """
    Duplicates some columns in the dataframe.

    When encoding features, a good practice is to save the encoded
    version in a different column rather than replacing the
    original values. The purpose of this function is to duplicate
    the column to be encoded, to be later replaced by the encoded
    values.

    The duplication method is used to preserve the original
    behaviour (replace).

    Parameters
    ----------
    df: pandas.DataFrame
        A Pandas' DataFrame with columns_to_duplicate columns

    columns_to_duplicate: list of str (default None)
        List of columns names. Only used when `columns_mapping` is None.

    columns_mapping: dict of str to str (default None)
        Mapping of source columns to destination columns. When given,
        it is used as-is and `columns_to_duplicate`, `prefix` and
        `suffix` do not affect the resulting mapping.

    prefix: str (default None)
        Prefix to add to columns to duplicate

    suffix: str (default None)
        Suffix to add to columns to duplicate

    Returns
    -------
    p : function pandas.DataFrame -> pandas.DataFrame
        Function that adds the duplicated columns to a dataframe.

    increased_dataset : pandas.DataFrame
        A dataset with repeated columns

    log : dict
        The received arguments plus the resolved source -> destination
        mapping, stored under the 'feature_duplicator' key.
    """

    # Resolve the source -> destination column names once; an explicit
    # mapping wins over the prefix/suffix naming scheme.
    columns_final_mapping: Dict[str, str]
    if columns_mapping is not None:
        columns_final_mapping = columns_mapping
    elif columns_to_duplicate:
        head = prefix or ''
        tail = suffix or ''
        columns_final_mapping = {
            col: head + str(col) + tail
            for col in columns_to_duplicate
        }
    else:
        columns_final_mapping = dict()

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        categ_columns = {
            dest_col: new_df[src_col]
            for src_col, dest_col in columns_final_mapping.items()
        }
        return new_df.assign(**categ_columns)

    p.__doc__ = feature_duplicator.__doc__

    log: LearnerLogType = {
        'feature_duplicator': {
            'columns_to_duplicate': columns_to_duplicate,
            'columns_mapping': columns_mapping,
            'prefix': prefix,
            'suffix': suffix,
            'columns_final_mapping': columns_final_mapping,
        }
    }

    # `DataFrame.assign` already returns a new frame, so the input is left
    # untouched without copying it beforehand.
    return p, p(df), log
def column_duplicatable(columns_to_bind: str) -> Callable:
    """
    Decorator to prepend the feature_duplicator learner.

    Identifies the columns to be duplicated and applies duplicator.

    Once the decorated learner has received enough arguments, the
    duplicator runs on the first positional argument, the decorated learner
    runs on the duplicator's output, and the result is the composition of
    both functions, the learner's dataframe and both logs merged. Until
    then, each call returns a partial application that keeps collecting
    arguments.

    Parameters
    ----------
    columns_to_bind: str
        Sets feature_duplicator's "columns_to_duplicate" parameter equal to
        the `columns_to_bind` parameter from the decorated learner
    """

    def _decorator(child: toolz.curry) -> Callable:
        mixin = feature_duplicator

        def _accepted_kwargs(
                fn: Callable,
                candidates: Dict[str, Any]
        ) -> Dict[str, Any]:
            # Keep only the keyword arguments that `fn` declares by name.
            spec = inspect.getfullargspec(fn)
            accepted = set(spec.args) | set(spec.kwonlyargs)
            return {
                name: value
                for name, value in candidates.items()
                if name in accepted
            }

        def _init(
            *args: List[Any],
            **kwargs: Dict[str, Any]
        ) -> Union[Callable, LearnerReturnType]:
            # Route each keyword argument to whichever function declares it
            # (an argument declared by both is passed to both).
            mixin_kwargs = _accepted_kwargs(mixin, kwargs)
            child_kwargs: Dict[str, Any] = _accepted_kwargs(child, kwargs)

            # Position of the bound parameter in the learner's signature,
            # used when the columns are passed positionally.
            child_arg_names = list(inspect.signature(child).parameters)
            columns_to_bind_idx = child_arg_names.index(columns_to_bind)

            if child._should_curry(args, child_kwargs):
                # Not enough arguments yet: hand back a partial of this
                # wrapper, carrying over the metadata of the partially
                # applied learner.
                pending = functools.partial(_init, *args, **kwargs)
                template = child(*args[1:], **child_kwargs)
                return functools.update_wrapper(pending, template)

            if columns_to_bind in kwargs:
                columns_to_duplicate = kwargs[columns_to_bind]
            else:
                columns_to_duplicate = args[columns_to_bind_idx]

            # NOTE(review): the dataframe is read from args[0], so passing
            # it by keyword is not handled here - confirm this is intended.
            mixin_fn, mixin_df, mixin_log = mixin(
                args[0],
                **mixin_kwargs,
                columns_to_duplicate=columns_to_duplicate
            )
            child_fn, child_df, child_log = child(
                mixin_df,
                *args[1:],
                **child_kwargs
            )

            composed_fn = toolz.compose(child_fn, mixin_fn)
            merged_log = {**mixin_log, **child_log}
            return composed_fn, child_df, merged_log

        return functools.update_wrapper(_init, child)

    return _decorator