Handling Categorical Data

This tutorial demonstrates how Perpetual handles categorical features automatically, with no manual encoding step (one-hot, label encoding, etc.). We use the UCI Adult dataset to compare Perpetual's out-of-the-box performance against an Optuna-tuned LightGBM.
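
The core pattern, before the full benchmark: cast string columns to the pandas category dtype and fit directly, with no encoding step. A minimal sketch on a toy frame (not the tutorial's dataset, just to illustrate the intended usage):

[ ]:
# minimal sketch: Perpetual consumes pandas "category" columns directly
import numpy as np
import pandas as pd
from perpetual import PerpetualBooster

toy = pd.DataFrame(
    {
        "color": pd.Categorical(["red", "blue", "red", "green", "blue", "red"]),
        "size": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
    }
)
toy_y = np.array([0, 1, 0, 1, 1, 0])

booster = PerpetualBooster(objective="LogLoss")
booster.fit(toy, toy_y)  # the category column needs no manual encoding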

[ ]:
import sys
from importlib.metadata import version

import numpy as np
import optuna
import pandas as pd
from lightgbm import LGBMClassifier
from perpetual import PerpetualBooster
from scipy.special import expit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import cross_validate, train_test_split
from ucimlrepo import fetch_ucirepo
[ ]:
pd.set_option("display.max_rows", 1000)
[ ]:
print(sys.version)
[ ]:
print(f"numpy: {version('numpy')}")
print(f"optuna: {version('optuna')}")
print(f"lightgbm: {version('lightgbm')}")
print(f"scikit-learn: {version('scikit-learn')}")
print(f"perpetual: {version('perpetual')}")
[ ]:
# fetch dataset: https://archive.ics.uci.edu/dataset/2/adult
adult = fetch_ucirepo(id=2)
[ ]:
adult.data.features.head()
[ ]:
data = adult.data.features.copy()
# binary-encode "sex" (drop_first leaves a single 1 = Male indicator column)
data["sex"] = pd.get_dummies(
    adult.data.features["sex"], drop_first=True, dtype=float
).to_numpy().ravel()
# cast the remaining string columns to the pandas "category" dtype;
# both LightGBM and Perpetual consume this dtype natively
cols = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "native-country",
]
data[cols] = data[cols].astype("category")
data.head()
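Worth confirming the dtypes, since the category cast is what both libraries key off:

[ ]:
# the seven columns listed above should now show the "category" dtype
data.dtypes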
[ ]:
y = adult.data.targets["income"].str.contains("<").to_numpy().astype(int)
y
[ ]:
set(adult.data.targets["income"])
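A quick cross-tabulation (added as a sanity check) confirms that both spellings of each label map to the intended class:

[ ]:
# both "<=50K" variants should map to 1 and both ">50K" variants to 0
pd.crosstab(adult.data.targets["income"], y)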
[ ]:
seed = 0
n_estimators = 50
n_trials = 1  # a single Optuna trial keeps the notebook fast; increase for a real search
[ ]:
scoring = "neg_log_loss"
metric_function = log_loss
metric_name = "log_loss"
LGBMBooster = LGBMClassifier
objective_type = "LogLoss"
[ ]:
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=seed
)

print(f"len(X_train): {len(X_train)}")
print(f"len(X_test): {len(X_test)}")
[ ]:
X_train.head()
[ ]:
best_cv_results = None
cv_results = None


def save_best_cv_results(study, trial):
    # Optuna callback: stash the cross_validate results of the best trial so far
    global best_cv_results
    if study.best_trial.number == trial.number:
        best_cv_results = cv_results
[ ]:
def objective_function(trial):
    global cv_results
    params = {
        "seed": seed,
        "verbosity": -1,
        "n_jobs": 1,
        "n_estimators": n_estimators,
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5, log=True),
        "min_split_gain": trial.suggest_float("min_split_gain", 1e-6, 1.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-6, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-6, 1.0, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "max_depth": trial.suggest_int("max_depth", 3, 33),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 100),
    }
    model = LGBMBooster(**params)
    cv_results = cross_validate(
        model,
        X_train,
        y_train,
        cv=5,
        scoring=scoring,
        return_train_score=True,
        return_estimator=True,
    )
    # scoring is neg_log_loss, so flip the sign to give Optuna a loss to minimize
    return -1 * np.mean(cv_results["test_score"])
[ ]:
sampler = optuna.samplers.TPESampler(seed=seed)
study = optuna.create_study(direction="minimize", sampler=sampler)
[ ]:
study.optimize(objective_function, n_trials=n_trials, callbacks=[save_best_cv_results])
[ ]:
print(f"Number of finished trials: {len(study.trials)}")
print("Best trial:")
print(f"  Number: {study.best_trial.number}")
print(f"  Value: {study.best_trial.value}")
print("  Params: ")
for key, value in study.best_trial.params.items():
    print(f"    {key}: {value}")
[ ]:
print(f"CV train scores: {-1 * best_cv_results['train_score']}")
print(
    f"CV train scores average : {round(np.mean(-1 * best_cv_results['train_score']), 6)}"
)
print(f"CV test scores: {-1 * best_cv_results['test_score']}")
print(
    f"CV test scores average : {round(np.mean(-1 * best_cv_results['test_score']), 6)}"
)
[ ]:
models = best_cv_results["estimator"]
[ ]:
for i, model in enumerate(models):
    y_pred = (
        model.predict_proba(X_train)
        if metric_name == "log_loss"
        else model.predict(X_train)
    )
    print(
        f"Model {i}, train {metric_name}: {round(metric_function(y_train, y_pred), 6)}"
    )
[ ]:
for i, model in enumerate(models):
    y_pred = (
        model.predict_proba(X_test)
        if metric_name == "log_loss"
        else model.predict(X_test)
    )
    print(f"Model {i}, test {metric_name}: {round(metric_function(y_test, y_pred), 6)}")
[ ]:
if metric_name == "log_loss":
    y_pred = np.mean([model.predict_proba(X_train) for model in models], axis=0)
else:
    y_pred = np.mean([model.predict(X_train) for model in models], axis=0)
print(f"Train {metric_name}: {round(metric_function(y_train, y_pred), 6)}")
[ ]:
if metric_name == "log_loss":
    y_pred = np.mean([model.predict_proba(X_test) for model in models], axis=0)
else:
    y_pred = np.mean([model.predict(X_test) for model in models], axis=0)
print(f"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}")
[ ]:
# majority vote across the CV fold models
y_pred = np.round(np.mean([model.predict(X_test) for model in models], axis=0))
print(f"Test accuracy (LightGBM CV ensemble): {accuracy_score(y_test, y_pred)}")
[ ]:
# a single fit with no hyperparameter search; budget controls how hard Perpetual fits
model = PerpetualBooster(budget=0.35, objective=objective_type)
model.fit(X_train, y_train)
[ ]:
# predict returns raw log-odds under the LogLoss objective; expit maps them to probabilities
y_pred = np.round(expit(model.predict(X_test)))
print(f"Test accuracy (Perpetual): {accuracy_score(y_test, y_pred)}")
[ ]:
y_pred = np.round(expit(model.predict(X_train)))
print(f"Train accuracy (Perpetual): {accuracy_score(y_train, y_pred)}")
[ ]:
if metric_name == "log_loss":
    y_pred = expit(model.predict(X_test))
else:
    y_pred = np.round(expit(model.predict(X_test)))
print(f"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}")
[ ]:
# inspect Perpetual's fitted trees as a dataframe
df_trees = model.trees_to_dataframe()
[ ]:
df_trees.head(10)
[ ]:
# the equivalent view for the first LightGBM CV model, for comparison
models[0].booster_.trees_to_dataframe().head(10000)