{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Handling Categorical Data\n", "\n", "This tutorial demonstrates how Perpetual handles categorical features automatically, with no need for manual encoding (such as one-hot or label encoding). We use the UCI Adult dataset to compare Perpetual's out-of-the-box performance against a hyperparameter-tuned LightGBM." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sys\n", "from importlib.metadata import version\n", "\n", "import numpy as np\n", "import optuna\n", "import pandas as pd\n", "from lightgbm import LGBMClassifier\n", "from perpetual import PerpetualBooster\n", "from scipy.special import expit\n", "from sklearn.metrics import accuracy_score, log_loss\n", "from sklearn.model_selection import cross_validate, train_test_split\n", "from ucimlrepo import fetch_ucirepo" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pd.set_option(\"display.max_rows\", 1000)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(sys.version)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"numpy: {version('numpy')}\")\n", "print(f\"optuna: {version('optuna')}\")\n", "print(f\"lightgbm: {version('lightgbm')}\")\n", "print(f\"scikit-learn: {version('scikit-learn')}\")\n", "print(f\"perpetual: {version('perpetual')}\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# fetch dataset: https://archive.ics.uci.edu/dataset/2/adult\n", "adult = fetch_ucirepo(id=2)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adult.data.features.head()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data = adult.data.features.copy()\n", "# binary-encode \"sex\" as a single 0/1 column (drop_first keeps one dummy)\n", "data[\"sex\"] = pd.get_dummies(\n", "    adult.data.features[\"sex\"], drop_first=True, dtype=float\n", ").to_numpy().ravel()\n", "cols = [\n", "    \"workclass\",\n", "    \"education\",\n", "    \"marital-status\",\n", "    \"occupation\",\n", "    \"relationship\",\n", "    \"race\",\n", "    \"native-country\",\n", "]\n", "# cast the string columns to pandas category dtype; both boosters consume\n", "# this dtype natively, so no one-hot or label encoding is needed\n", "data[cols] = data[cols].astype(\"category\")\n", "data.head()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# map income to a binary target: \"<=50K\" variants -> 1, \">50K\" variants -> 0\n", "# (str.contains is robust to the trailing-period labels shown in the next cell)\n", "y = adult.data.targets[\"income\"].str.contains(\"<\").to_numpy().astype(int)\n", "y" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "set(adult.data.targets[\"income\"])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "seed = 0\n", "n_estimators = 50\n", "n_trials = 1  # a single Optuna trial keeps the notebook fast; increase for a real search" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "scoring = \"neg_log_loss\"\n", "metric_function = log_loss\n", "metric_name = \"log_loss\"\n", "LGBMBooster = LGBMClassifier\n", "objective_type = \"LogLoss\"" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(\n", "    data, y, test_size=0.2, random_state=seed\n", ")\n", "\n", "print(f\"len(X_train): {len(X_train)}\")\n", "print(f\"len(X_test): {len(X_test)}\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train.head()" ] },
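{ "cell_type": "markdown", "metadata": {}, "source": [ "Both boosters receive the `category`-dtype columns as-is. As a quick illustration (this sanity check is an addition, not part of the original benchmark), the next cell lists the dtype and cardinality of each categorical feature that the models will consume directly:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# illustrative sanity check: dtype and number of distinct categories per column\n", "pd.DataFrame(\n", "    {\n", "        \"dtype\": X_train[cols].dtypes.astype(str),\n", "        \"n_categories\": X_train[cols].nunique(),\n", "    }\n", ")" ] },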
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "best_cv_results = None\n", "cv_results = None\n", "\n", "\n", "def save_best_cv_results(study, trial):\n", "    # keep the cross-validation results of the best trial seen so far\n", "    global best_cv_results\n", "    if study.best_trial.number == trial.number:\n", "        best_cv_results = cv_results" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def objective_function(trial):\n", "    global cv_results\n", "    params = {\n", "        \"seed\": seed,\n", "        \"verbosity\": -1,\n", "        \"n_jobs\": 1,\n", "        \"n_estimators\": n_estimators,\n", "        \"learning_rate\": trial.suggest_float(\"learning_rate\", 0.001, 0.5, log=True),\n", "        \"min_split_gain\": trial.suggest_float(\"min_split_gain\", 1e-6, 1.0, log=True),\n", "        \"reg_alpha\": trial.suggest_float(\"reg_alpha\", 1e-6, 1.0, log=True),\n", "        \"reg_lambda\": trial.suggest_float(\"reg_lambda\", 1e-6, 1.0, log=True),\n", "        \"colsample_bytree\": trial.suggest_float(\"colsample_bytree\", 0.2, 1.0),\n", "        \"subsample\": trial.suggest_float(\"subsample\", 0.2, 1.0),\n", "        \"subsample_freq\": trial.suggest_int(\"subsample_freq\", 1, 10),\n", "        \"max_depth\": trial.suggest_int(\"max_depth\", 3, 33),\n", "        \"num_leaves\": trial.suggest_int(\"num_leaves\", 2, 256),\n", "        \"min_child_samples\": trial.suggest_int(\"min_child_samples\", 1, 100),\n", "    }\n", "    model = LGBMBooster(**params)\n", "    cv_results = cross_validate(\n", "        model,\n", "        X_train,\n", "        y_train,\n", "        cv=5,\n", "        scoring=scoring,\n", "        return_train_score=True,\n", "        return_estimator=True,\n", "    )\n", "    # scikit-learn reports neg_log_loss, so negate it to obtain a loss to minimize\n", "    return -1 * np.mean(cv_results[\"test_score\"])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sampler = optuna.samplers.TPESampler(seed=seed)\n", "study = optuna.create_study(direction=\"minimize\", sampler=sampler)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "study.optimize(objective_function, n_trials=n_trials, callbacks=[save_best_cv_results])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"Number of finished trials: {len(study.trials)}\")\n", "print(\"Best trial:\")\n", "print(f\"  Number: {study.best_trial.number}\")\n", "print(f\"  Value: {study.best_trial.value}\")\n", "print(\"  Params: \")\n", "for key, value in study.best_trial.params.items():\n", "    print(f\"    {key}: {value}\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"CV train scores: {-1 * best_cv_results['train_score']}\")\n", "print(\n", "    f\"CV train scores average: {round(np.mean(-1 * best_cv_results['train_score']), 6)}\"\n", ")\n", "print(f\"CV test scores: {-1 * best_cv_results['test_score']}\")\n", "print(\n", "    f\"CV test scores average: {round(np.mean(-1 * best_cv_results['test_score']), 6)}\"\n", ")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "models = best_cv_results[\"estimator\"]" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for i, model in enumerate(models):\n", "    y_pred = (\n", "        model.predict_proba(X_train)\n", "        if metric_name == \"log_loss\"\n", "        else model.predict(X_train)\n", "    )\n", "    print(\n", "        f\"Model {i}, train {metric_name}: {round(metric_function(y_train, y_pred), 6)}\"\n", "    )" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for i, model in enumerate(models):\n", "    y_pred = (\n", "        model.predict_proba(X_test)\n", "        if metric_name == \"log_loss\"\n", "        else model.predict(X_test)\n", "    )\n", "    print(f\"Model {i}, test {metric_name}: {round(metric_function(y_test, y_pred), 6)}\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# average the fold models' predictions to get an ensemble estimate on the train set\n", "if metric_name == \"log_loss\":\n", "    y_pred = np.mean([model.predict_proba(X_train) for model in models], axis=0)\n", "else:\n", "    y_pred = np.mean([model.predict(X_train) for model in models], axis=0)\n", "print(f\"Train {metric_name}: {round(metric_function(y_train, y_pred), 6)}\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if metric_name == \"log_loss\":\n", "    y_pred = np.mean([model.predict_proba(X_test) for model in models], axis=0)\n", "else:\n", "    y_pred = np.mean([model.predict(X_test) for model in models], axis=0)\n", "print(f\"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# majority vote of the fold models' class predictions\n", "y_pred = np.round(np.mean([model.predict(X_test) for model in models], axis=0))\n", "print(accuracy_score(y_test, y_pred))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# budget trades training time for accuracy; no other hyperparameters are tuned\n", "model = PerpetualBooster(budget=0.35, objective=objective_type)\n", "model.fit(X_train, y_train)" ] },
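{ "cell_type": "markdown", "metadata": {}, "source": [ "With the `LogLoss` objective, `predict` returns raw log-odds, which is why the following cells pass the scores through `expit`. As a small illustrative check (an addition to the original notebook), the next cell shows the raw scores and the implied probabilities side by side for the first few training rows:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# illustrative: raw margin output vs. sigmoid-transformed probability\n", "raw_scores = model.predict(X_train.head())\n", "pd.DataFrame({\"raw_log_odds\": raw_scores, \"probability\": expit(raw_scores)})" ] },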
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_pred = np.round(expit(model.predict(X_test)))\n", "print(accuracy_score(y_test, y_pred))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_pred = np.round(expit(model.predict(X_train)))\n", "print(accuracy_score(y_train, y_pred))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if metric_name == \"log_loss\":\n", "    y_pred = expit(model.predict(X_test))\n", "else:\n", "    y_pred = np.round(expit(model.predict(X_test)))\n", "print(f\"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_trees = model.trees_to_dataframe()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_trees.head(10)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# the corresponding tree table for the first LightGBM fold model\n", "models[0].booster_.trees_to_dataframe().head(10000)" ] } ], "metadata": { "kernelspec": { "display_name": "py311", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }