{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Performance Benchmarking\n",
    "\n",
    "This tutorial compares Perpetual with a heavily tuned LightGBM model across different datasets (California Housing and Cover Type). Perpetual achieves state-of-the-art results without any hyperparameter tuning."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "from importlib.metadata import version\n",
    "\n",
    "import numpy as np\n",
    "import optuna\n",
    "from lightgbm import LGBMClassifier, LGBMRegressor\n",
    "from perpetual import PerpetualBooster\n",
    "from sklearn.datasets import fetch_california_housing, fetch_covtype\n",
    "from sklearn.metrics import log_loss, mean_squared_error\n",
    "from sklearn.model_selection import cross_validate, train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(sys.version)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"numpy: {version('numpy')}\")\n",
    "print(f\"optuna: {version('optuna')}\")\n",
    "print(f\"lightgbm: {version('lightgbm')}\")\n",
    "print(f\"scikit-learn: {version('scikit-learn')}\")\n",
    "print(f\"perpetual: {version('perpetual')}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "task_is_cal_housing = False  # change to False for Cover Types task."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "seed = 0  # average results are reported for 5 seeds -> [0, 1, 2, 3, 4]\n",
    "n_estimators = 1  # results are reported for 100, 300, 1000 n_estimators.\n",
    "n_trials = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if task_is_cal_housing:\n",
    "    data, target = fetch_california_housing(return_X_y=True, as_frame=True)\n",
    "    scoring = \"neg_mean_squared_error\"\n",
    "    metric_function = mean_squared_error\n",
    "    metric_name = \"mse\"\n",
    "    LGBMBooster = LGBMRegressor\n",
    "    objective_type = \"SquaredLoss\"\n",
    "else:\n",
    "    data, target = fetch_covtype(return_X_y=True, as_frame=True)\n",
    "    scoring = \"neg_log_loss\"\n",
    "    metric_function = log_loss\n",
    "    metric_name = \"log_loss\"\n",
    "    LGBMBooster = LGBMClassifier\n",
    "    objective_type = \"LogLoss\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    data, target, test_size=0.2248, random_state=seed\n",
    ")\n",
    "\n",
    "print(f\"len(X_train): {len(X_train)}\")\n",
    "print(f\"len(X_test): {len(X_test)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_cv_results = None\n",
    "cv_results = None\n",
    "\n",
    "\n",
    "def save_best_cv_results(study, trial):\n",
    "    global best_cv_results\n",
    "    if study.best_trial.number == trial.number:\n",
    "        best_cv_results = cv_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def objective_function(trial):\n",
    "    global cv_results\n",
    "    params = {\n",
    "        \"seed\": seed,\n",
    "        \"verbosity\": -1,\n",
    "        \"n_estimators\": n_estimators,\n",
    "        \"learning_rate\": trial.suggest_float(\"learning_rate\", 0.001, 0.5, log=True),\n",
    "        \"min_split_gain\": trial.suggest_float(\"min_split_gain\", 1e-6, 1.0, log=True),\n",
    "        \"reg_alpha\": trial.suggest_float(\"reg_alpha\", 1e-6, 1.0, log=True),\n",
    "        \"reg_lambda\": trial.suggest_float(\"reg_lambda\", 1e-6, 1.0, log=True),\n",
    "        \"colsample_bytree\": trial.suggest_float(\"colsample_bytree\", 0.2, 1.0),\n",
    "        \"subsample\": trial.suggest_float(\"subsample\", 0.2, 1.0),\n",
    "        \"subsample_freq\": trial.suggest_int(\"subsample_freq\", 1, 10),\n",
    "        \"max_depth\": trial.suggest_int(\"max_depth\", 3, 33),\n",
    "        \"num_leaves\": trial.suggest_int(\"num_leaves\", 2, 1024),\n",
    "        \"min_child_samples\": trial.suggest_int(\"min_child_samples\", 1, 100),\n",
    "    }\n",
    "    model = LGBMBooster(**params)\n",
    "    cv_results = cross_validate(\n",
    "        model,\n",
    "        X_train,\n",
    "        y_train,\n",
    "        cv=5,\n",
    "        scoring=scoring,\n",
    "        return_train_score=True,\n",
    "        return_estimator=True,\n",
    "    )\n",
    "    return -1 * np.mean(cv_results[\"test_score\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sampler = optuna.samplers.TPESampler(seed=seed)\n",
    "study = optuna.create_study(direction=\"minimize\", sampler=sampler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "study.optimize(objective_function, n_trials=n_trials, callbacks=[save_best_cv_results])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Number of finished trials: {len(study.trials)}\")\n",
    "print(\"Best trial:\")\n",
    "print(f\"  Number: {study.best_trial.number}\")\n",
    "print(f\"  Value: {study.best_trial.value}\")\n",
    "print(\"  Params: \")\n",
    "for key, value in study.best_trial.params.items():\n",
    "    print(f\"    {key}: {value}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"CV train scores: {-1 * best_cv_results['train_score']}\")\n",
    "print(\n",
    "    f\"CV train scores average : {round(np.mean(-1 * best_cv_results['train_score']), 6)}\"\n",
    ")\n",
    "print(f\"CV valid scores: {-1 * best_cv_results['test_score']}\")\n",
    "print(\n",
    "    f\"CV valid scores average : {round(np.mean(-1 * best_cv_results['test_score']), 6)}\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "models = best_cv_results[\"estimator\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i, model in enumerate(models):\n",
    "    y_pred = (\n",
    "        model.predict_proba(X_train)\n",
    "        if metric_name == \"log_loss\"\n",
    "        else model.predict(X_train)\n",
    "    )\n",
    "    print(\n",
    "        f\"Model {i}, train {metric_name}: {round(metric_function(y_train, y_pred), 6)}\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i, model in enumerate(models):\n",
    "    y_pred = (\n",
    "        model.predict_proba(X_test)\n",
    "        if metric_name == \"log_loss\"\n",
    "        else model.predict(X_test)\n",
    "    )\n",
    "    print(f\"Model {i}, test {metric_name}: {round(metric_function(y_test, y_pred), 6)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if metric_name == \"log_loss\":\n",
    "    y_pred = np.mean([model.predict_proba(X_train) for model in models], axis=0)\n",
    "else:\n",
    "    y_pred = np.mean([model.predict(X_train) for model in models], axis=0)\n",
    "print(f\"Train {metric_name}: {round(metric_function(y_train, y_pred), 6)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if metric_name == \"log_loss\":\n",
    "    y_pred = np.mean([model.predict_proba(X_test) for model in models], axis=0)\n",
    "else:\n",
    "    y_pred = np.mean([model.predict(X_test) for model in models], axis=0)\n",
    "print(f\"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<style scoped>\n",
    "table {\n",
    "  font-size: 12px;\n",
    "}\n",
    "</style>\n",
    "\n",
    "| LightGBM n_estimators | Seed | LightGBM mse | LightGBM cpu time |\n",
    "| --------------------- | ---- | ------------ | ----------------- |\n",
    "| 100                   | 0    | 0.186588     | 729               |\n",
    "| 100                   | 1    | 0.194348     | 1294              |\n",
    "| 100                   | 2    | 0.197862     | 990               |\n",
    "| 100                   | 3    | 0.188629     | 1143              |\n",
    "| 100                   | 4    | 0.194338     | 860               |\n",
    "| 100                   | avg  | 0.192196     | 978               |\n",
    "| 300                   | 0    | 0.185100     | 2282              |\n",
    "| 300                   | 1    | 0.192767     | 3650              |\n",
    "| 300                   | 2    | 0.190481     | 2746              |\n",
    "| 300                   | 3    | 0.182359     | 2782              |\n",
    "| 300                   | 4    | 0.191614     | 3871              |\n",
    "| 300                   | avg  | 0.188464     | 3066              |\n",
    "| 1000                  | 0    | 0.179158     | 9615              |\n",
    "| 1000                  | 1    | 0.190866     | 7258              |\n",
    "| 1000                  | 2    | 0.188030     | 10997             |\n",
    "| 1000                  | 3    | 0.179903     | 7636              |\n",
    "| 1000                  | 4    | 0.190033     | 8095              |\n",
    "| 1000                  | avg  | 0.185598     | 8720              |"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = PerpetualBooster(objective=objective_type)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.budget = 1.0\n",
    "model.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if metric_name == \"log_loss\":\n",
    "    y_pred = model.predict_proba(X_test)\n",
    "else:\n",
    "    y_pred = model.predict(X_test)\n",
    "print(f\"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.number_of_trees"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py310",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}