NLP Classification
In this example we will use the Consumer Complaint Database to predict whether the consumer is asking a question about “Credit reporting, credit repair services, or other personal consumer reports”, using the question as a feature.
Since this is a binary classification task, we will use TF-IDF and Logistic Regression, the baseline of any NLP classification task, as our model (available as `nlp_logistic_classification_learner` in fklearn).
import pandas as pd
from fklearn.preprocessing.splitting import time_split_dataset
from fklearn.training.classification import nlp_logistic_classification_learner
from fklearn.validation.evaluators import fbeta_score_evaluator
# Load consumer complaints data
def load_data(path):
    """Load the consumer complaints CSV and attach the binary target.

    Only the four columns used by the example are read. They are renamed to
    ``product``, ``text``, ``time`` and ``id``, and ``Date received`` is
    parsed as a date while reading.

    Args:
        path: Location of the CSV; passed unchanged to ``pandas.read_csv``.

    Returns:
        A DataFrame with the renamed columns plus an integer ``target``
        column that is 1 when ``product`` is the credit-reporting category
        and 0 otherwise. Rows with a missing value in any column are dropped.
    """
    # Source column name -> name used in the rest of the example.
    column_names = {
        "Product": "product",
        "Consumer complaint narrative": "text",
        "Date received": "time",
        "Complaint ID": "id",
    }
    df = (
        pd.read_csv(path, usecols=list(column_names), parse_dates=["Date received"])
        .rename(columns=column_names)
    )
    df["target"] = (
        df["product"]
        == "Credit reporting, credit repair services, or other personal consumer reports"
    ).astype(int)
    # The target is computed before dropping, so rows with a missing value
    # (e.g. no narrative) are removed only after they have been labelled.
    return df.dropna()
df = load_data("Consumer_Complaints.csv")

# Time-based split on the `time` column: 2017 is used to train, 2018 to test
train, holdout = time_split_dataset(
    df,
    train_start_date="2017-01-01",
    train_end_date="2018-01-01",
    holdout_end_date="2019-01-01",
    time_column="time",
)

# Fit the learner on the training set, then apply the returned
# prediction function to the holdout set
predict_fn, train_pred, logs = nlp_logistic_classification_learner(
    train,
    text_feature_cols=["text"],
    target="target",
)
holdout_pred = predict_fn(holdout)

# Measure the F1-Score on the holdout predictions
f1_score = fbeta_score_evaluator(holdout_pred)
# {'fbeta_evaluator__target': 0.7611906547172731}