Source code for fklearn.training.unsupervised

from typing import Any, Dict, List

import pandas as pd
from sklearn.ensemble import IsolationForest
import sklearn
from toolz import curry, merge

from fklearn.common_docstrings import learner_pred_fn_docstring, learner_return_docstring
from fklearn.types import LearnerReturnType
from fklearn.training.utils import log_learner_time, expand_features_encoded


[docs]@curry
@log_learner_time(learner_name='isolation_forest_learner')
def isolation_forest_learner(df: pd.DataFrame,
                             features: List[str],
                             params: Dict[str, Any] = None,
                             prediction_column: str = "prediction",
                             encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits an anomaly detection algorithm (Isolation Forest) to the dataset

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list os column names that are used as features for the model. All this names
        should be in `df`.

    params : dict
        The IsolationForest parameters in the format {"par_name": param}. See:
        http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html

    prediction_column : str
        The name of the column with the predictions from the model.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern fklearn_feat__col==val` as feature columns.
    """

    model = IsolationForest()

    default_params: Dict[str, Any] = {"n_jobs": -1, "random_state": 1729, "contamination": 0.1}
    # Remove this when we stop supporting scikit-learn<0.24 as this param is deprecated
    if "behaviour" in model.get_params():
        default_params["behaviour"] = "new"
    params = default_params if not params else merge(default_params, params)
    model.set_params(**params)

    features = features if not encode_extra_cols else expand_features_encoded(df, features)

    model.fit(df[features].values)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        output_col = {prediction_column: model.decision_function(
            new_df[features])}

        return new_df.assign(**output_col)

    p.__doc__ = learner_pred_fn_docstring("isolation_forest_learner")

    log = {'isolation_forest_learner': {
        'features': features,
        'parameters': params,
        'prediction_column': prediction_column,
        'package': "sklearn",
        'package_version': sklearn.__version__,
        'training_samples': len(df)}}

    return p, p(df), log


isolation_forest_learner.__doc__ += learner_return_docstring("Isolation Forest")