Handling Categorical Data
This tutorial demonstrates how Perpetual automatically handles categorical features without the need for manual encoding (such as one-hot or label encoding). We use the UCI Adult dataset to compare Perpetual’s performance with an Optuna-tuned LightGBM.
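Before the full walkthrough, here is a minimal sketch of the core idea: columns only need to carry the pandas category dtype, and the booster is fitted on them directly. The toy frame, column names, and labels below are illustrative placeholders, not the Adult data; the complete, runnable workflow follows.

import numpy as np
import pandas as pd
from perpetual import PerpetualBooster

# toy frame with a raw string column (illustrative data, not the Adult dataset)
df = pd.DataFrame(
    {
        "age": [25, 38, 52, 41, 29, 60],
        "occupation": ["Sales", "Tech-support", "Sales", "Exec-managerial", "Tech-support", "Sales"],
    }
)
y = np.array([0, 1, 0, 1, 1, 0])

# cast the string column to the pandas "category" dtype; no one-hot or label encoding
df["occupation"] = df["occupation"].astype("category")

# Perpetual consumes the categorical column as-is
model = PerpetualBooster(objective="LogLoss")
model.fit(df, y)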
[ ]:
import sys
from importlib.metadata import version
import numpy as np
import optuna
import pandas as pd
from lightgbm import LGBMClassifier
from perpetual import PerpetualBooster
from scipy.special import expit
from ucimlrepo import fetch_ucirepo
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import cross_validate, train_test_split
[ ]:
pd.set_option("display.max_rows", 1000)
[ ]:
print(sys.version)
[ ]:
print(f"numpy: {version('numpy')}")
print(f"optuna: {version('optuna')}")
print(f"lightgbm: {version('lightgbm')}")
print(f"scikit-learn: {version('scikit-learn')}")
print(f"perpetual: {version('perpetual')}")
[ ]:
# fetch dataset: https://archive.ics.uci.edu/dataset/2/adult
adult = fetch_ucirepo(id=2)
[ ]:
adult.data.features.head()
[ ]:
data = adult.data.features.copy()
# encode the binary "sex" column as a single 0/1 indicator
data["sex"] = pd.get_dummies(
    adult.data.features["sex"], drop_first=True, dtype=float
).to_numpy()
# cast the remaining string columns to the pandas "category" dtype;
# no one-hot or label encoding is applied to them
cols = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "native-country",
]
data[cols] = data[cols].astype("category")
data.head()
[ ]:
# binary target: 1 when the income label contains "<" (i.e. "<=50K"), else 0
y = adult.data.targets["income"].str.contains("<").to_numpy().astype(int)
y
[ ]:
set(adult.data.targets["income"])
[ ]:
seed = 0
n_estimators = 50
n_trials = 1
[ ]:
scoring = "neg_log_loss"
metric_function = log_loss
metric_name = "log_loss"
LGBMBooster = LGBMClassifier
objective_type = "LogLoss"
[ ]:
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=seed
)
print(f"len(X_train): {len(X_train)}")
print(f"len(X_test): {len(X_test)}")
[ ]:
X_train.head()
[ ]:
best_cv_results = None
cv_results = None
# Optuna callback: keep the cross-validation results of the best trial so far
def save_best_cv_results(study, trial):
    global best_cv_results
    if study.best_trial.number == trial.number:
        best_cv_results = cv_results
[ ]:
# Optuna objective: 5-fold cross-validation of LightGBM with the sampled
# hyperparameters; returns the mean validation log loss to be minimized
def objective_function(trial):
    global cv_results
    params = {
        "seed": seed,
        "verbosity": -1,
        "n_jobs": 1,
        "n_estimators": n_estimators,
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5, log=True),
        "min_split_gain": trial.suggest_float("min_split_gain", 1e-6, 1.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-6, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-6, 1.0, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "max_depth": trial.suggest_int("max_depth", 3, 33),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 100),
    }
    model = LGBMBooster(**params)
    cv_results = cross_validate(
        model,
        X_train,
        y_train,
        cv=5,
        scoring=scoring,
        return_train_score=True,
        return_estimator=True,
    )
    return -1 * np.mean(cv_results["test_score"])
[ ]:
sampler = optuna.samplers.TPESampler(seed=seed)
study = optuna.create_study(direction="minimize", sampler=sampler)
[ ]:
study.optimize(objective_function, n_trials=n_trials, callbacks=[save_best_cv_results])
[ ]:
print(f"Number of finished trials: {len(study.trials)}")
print("Best trial:")
print(f" Number: {study.best_trial.number}")
print(f" Value: {study.best_trial.value}")
print(" Params: ")
for key, value in study.best_trial.params.items():
    print(f" {key}: {value}")
[ ]:
print(f"CV train scores: {-1 * best_cv_results['train_score']}")
print(
    f"CV train scores average : {round(np.mean(-1 * best_cv_results['train_score']), 6)}"
)
print(f"CV test scores: {-1 * best_cv_results['test_score']}")
print(
    f"CV test scores average : {round(np.mean(-1 * best_cv_results['test_score']), 6)}"
)
[ ]:
models = best_cv_results["estimator"]
[ ]:
for i, model in enumerate(models):
    y_pred = (
        model.predict_proba(X_train)
        if metric_name == "log_loss"
        else model.predict(X_train)
    )
    print(
        f"Model {i}, train {metric_name}: {round(metric_function(y_train, y_pred), 6)}"
    )
[ ]:
for i, model in enumerate(models):
    y_pred = (
        model.predict_proba(X_test)
        if metric_name == "log_loss"
        else model.predict(X_test)
    )
    print(f"Model {i}, test {metric_name}: {round(metric_function(y_test, y_pred), 6)}")
[ ]:
if metric_name == "log_loss":
    y_pred = np.mean([model.predict_proba(X_train) for model in models], axis=0)
else:
    y_pred = np.mean([model.predict(X_train) for model in models], axis=0)
print(f"Train {metric_name}: {round(metric_function(y_train, y_pred), 6)}")
[ ]:
if metric_name == "log_loss":
    y_pred = np.mean([model.predict_proba(X_test) for model in models], axis=0)
else:
    y_pred = np.mean([model.predict(X_test) for model in models], axis=0)
print(f"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}")
[ ]:
# majority vote over the class labels predicted by the five CV models
y_pred = np.round(np.mean([model.predict(X_test) for model in models], axis=0))
print(accuracy_score(y_test, y_pred))
[ ]:
# fit Perpetual directly on the frame with pandas categorical columns;
# no manual encoding and no hyperparameter search are needed
model = PerpetualBooster(budget=0.35, objective="LogLoss")
model.fit(X_train, y_train)
[ ]:
# model.predict returns raw scores; expit maps them to probabilities,
# which are rounded to class labels for the accuracy computation
y_pred = np.round(expit(model.predict(X_test)))
print(accuracy_score(y_test, y_pred))
[ ]:
y_pred = np.round(expit(model.predict(X_train)))
print(accuracy_score(y_train, y_pred))
[ ]:
if metric_name == "log_loss":
    y_pred = expit(model.predict(X_test))
else:
    y_pred = np.round(expit(model.predict(X_test)))
print(f"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}")
[ ]:
df_trees = model.trees_to_dataframe()
[ ]:
df_trees.head(10)
[ ]:
models[0].booster_.trees_to_dataframe().head(10000)