{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Performance Benchmarking\n", "\n", "This tutorial compares Perpetual with a heavily tuned LightGBM model across different datasets (California Housing and Cover Type). Perpetual achieves state-of-the-art results without any hyperparameter tuning." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sys\n", "from importlib.metadata import version\n", "\n", "import numpy as np\n", "import optuna\n", "from lightgbm import LGBMClassifier, LGBMRegressor\n", "from perpetual import PerpetualBooster\n", "from sklearn.datasets import fetch_california_housing, fetch_covtype\n", "from sklearn.metrics import log_loss, mean_squared_error\n", "from sklearn.model_selection import cross_validate, train_test_split" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(sys.version)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"numpy: {version('numpy')}\")\n", "print(f\"optuna: {version('optuna')}\")\n", "print(f\"lightgbm: {version('lightgbm')}\")\n", "print(f\"scikit-learn: {version('scikit-learn')}\")\n", "print(f\"perpetual: {version('perpetual')}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "task_is_cal_housing = False # change to False for Cover Types task." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "seed = 0 # average results are reported for 5 seeds -> [0, 1, 2, 3, 4]\n", "n_estimators = 1 # results are reported for 100, 300, 1000 n_estimators.\n", "n_trials = 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if task_is_cal_housing:\n", " data, target = fetch_california_housing(return_X_y=True, as_frame=True)\n", " scoring = \"neg_mean_squared_error\"\n", " metric_function = mean_squared_error\n", " metric_name = \"mse\"\n", " LGBMBooster = LGBMRegressor\n", " objective_type = \"SquaredLoss\"\n", "else:\n", " data, target = fetch_covtype(return_X_y=True, as_frame=True)\n", " scoring = \"neg_log_loss\"\n", " metric_function = log_loss\n", " metric_name = \"log_loss\"\n", " LGBMBooster = LGBMClassifier\n", " objective_type = \"LogLoss\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(\n", " data, target, test_size=0.2248, random_state=seed\n", ")\n", "\n", "print(f\"len(X_train): {len(X_train)}\")\n", "print(f\"len(X_test): {len(X_test)}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "best_cv_results = None\n", "cv_results = None\n", "\n", "\n", "def save_best_cv_results(study, trial):\n", " global best_cv_results\n", " if study.best_trial.number == trial.number:\n", " best_cv_results = cv_results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def objective_function(trial):\n", " global cv_results\n", " params = {\n", " \"seed\": seed,\n", " \"verbosity\": -1,\n", " \"n_estimators\": n_estimators,\n", " \"learning_rate\": trial.suggest_float(\"learning_rate\", 0.001, 0.5, log=True),\n", " \"min_split_gain\": trial.suggest_float(\"min_split_gain\", 1e-6, 1.0, log=True),\n", " \"reg_alpha\": trial.suggest_float(\"reg_alpha\", 1e-6, 1.0, log=True),\n", " \"reg_lambda\": trial.suggest_float(\"reg_lambda\", 1e-6, 1.0, log=True),\n", " \"colsample_bytree\": trial.suggest_float(\"colsample_bytree\", 0.2, 1.0),\n", " \"subsample\": trial.suggest_float(\"subsample\", 0.2, 1.0),\n", " \"subsample_freq\": trial.suggest_int(\"subsample_freq\", 1, 10),\n", " \"max_depth\": trial.suggest_int(\"max_depth\", 3, 33),\n", " \"num_leaves\": trial.suggest_int(\"num_leaves\", 2, 1024),\n", " \"min_child_samples\": trial.suggest_int(\"min_child_samples\", 1, 100),\n", " }\n", " model = LGBMBooster(**params)\n", " cv_results = cross_validate(\n", " model,\n", " X_train,\n", " y_train,\n", " cv=5,\n", " scoring=scoring,\n", " return_train_score=True,\n", " return_estimator=True,\n", " )\n", " return -1 * np.mean(cv_results[\"test_score\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sampler = optuna.samplers.TPESampler(seed=seed)\n", "study = optuna.create_study(direction=\"minimize\", sampler=sampler)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "study.optimize(objective_function, n_trials=n_trials, callbacks=[save_best_cv_results])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"Number of finished trials: {len(study.trials)}\")\n", "print(\"Best trial:\")\n", "print(f\" Number: {study.best_trial.number}\")\n", "print(f\" Value: {study.best_trial.value}\")\n", "print(\" Params: \")\n", "for key, value in study.best_trial.params.items():\n", " print(f\" {key}: {value}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"CV train scores: {-1 * best_cv_results['train_score']}\")\n", "print(\n", " f\"CV train scores average : {round(np.mean(-1 * best_cv_results['train_score']), 6)}\"\n", ")\n", "print(f\"CV valid scores: {-1 * best_cv_results['test_score']}\")\n", "print(\n", " f\"CV valid scores average : {round(np.mean(-1 * best_cv_results['test_score']), 6)}\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "models = best_cv_results[\"estimator\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for i, model in enumerate(models):\n", " y_pred = (\n", " model.predict_proba(X_train)\n", " if metric_name == \"log_loss\"\n", " else model.predict(X_train)\n", " )\n", " print(\n", " f\"Model {i}, train {metric_name}: {round(metric_function(y_train, y_pred), 6)}\"\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for i, model in enumerate(models):\n", " y_pred = (\n", " model.predict_proba(X_test)\n", " if metric_name == \"log_loss\"\n", " else model.predict(X_test)\n", " )\n", " print(f\"Model {i}, test {metric_name}: {round(metric_function(y_test, y_pred), 6)}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if metric_name == \"log_loss\":\n", " y_pred = np.mean([model.predict_proba(X_train) for model in models], axis=0)\n", "else:\n", " y_pred = np.mean([model.predict(X_train) for model in models], axis=0)\n", "print(f\"Train {metric_name}: {round(metric_function(y_train, y_pred), 6)}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if metric_name == \"log_loss\":\n", " y_pred = np.mean([model.predict_proba(X_test) for model in models], axis=0)\n", "else:\n", " y_pred = np.mean([model.predict(X_test) for model in models], axis=0)\n", "print(f\"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "\n", "| LightGBM n_estimators | Seed | LightGBM mse | LightGBM cpu time |\n", "| --------------------- | ---- | ------------ | ----------------- |\n", "| 100 | 0 | 0.186588 | 729 |\n", "| 100 | 1 | 0.194348 | 1294 |\n", "| 100 | 2 | 0.197862 | 990 |\n", "| 100 | 3 | 0.188629 | 1143 |\n", "| 100 | 4 | 0.194338 | 860 |\n", "| 100 | avg | 0.192196 | 978 |\n", "| 300 | 0 | 0.185100 | 2282 |\n", "| 300 | 1 | 0.192767 | 3650 |\n", "| 300 | 2 | 0.190481 | 2746 |\n", "| 300 | 3 | 0.182359 | 2782 |\n", "| 300 | 4 | 0.191614 | 3871 |\n", "| 300 | avg | 0.188464 | 3066 |\n", "| 1000 | 0 | 0.179158 | 9615 |\n", "| 1000 | 1 | 0.190866 | 7258 |\n", "| 1000 | 2 | 0.188030 | 10997 |\n", "| 1000 | 3 | 0.179903 | 7636 |\n", "| 1000 | 4 | 0.190033 | 8095 |\n", "| 1000 | avg | 0.185598 | 8720 |" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = PerpetualBooster(objective=objective_type)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.budget = 1.0\n", "model.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if metric_name == \"log_loss\":\n", " y_pred = model.predict_proba(X_test)\n", "else:\n", " y_pred = model.predict(X_test)\n", "print(f\"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.number_of_trees" ] } ], "metadata": { "kernelspec": { "display_name": "py310", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.20" } }, "nbformat": 4, "nbformat_minor": 2 }