{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Handling Categorical Data\n", "\n", "This tutorial demonstrates how Perpetual handles categorical features automatically, with no need for manual encoding (such as one-hot or label encoding). We use the UCI Adult dataset to compare Perpetual's out-of-the-box performance against a hyperparameter-tuned LightGBM." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sys\n", "from importlib.metadata import version\n", "\n", "import numpy as np\n", "import optuna\n", "import pandas as pd\n", "from lightgbm import LGBMClassifier\n", "from perpetual import PerpetualBooster\n", "from scipy.special import expit\n", "from sklearn.metrics import accuracy_score, log_loss\n", "from sklearn.model_selection import cross_validate, train_test_split\n", "from ucimlrepo import fetch_ucirepo" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pd.set_option(\"display.max_rows\", 1000)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(sys.version)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"numpy: {version('numpy')}\")\n", "print(f\"optuna: {version('optuna')}\")\n", "print(f\"lightgbm: {version('lightgbm')}\")\n", "print(f\"scikit-learn: {version('scikit-learn')}\")\n", "print(f\"perpetual: {version('perpetual')}\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# fetch dataset: https://archive.ics.uci.edu/dataset/2/adult\n", "adult = fetch_ucirepo(id=2)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adult.data.features.head()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data = adult.data.features.copy()\n", "# binary-encode \"sex\" as a single 0/1 column (drop_first keeps one dummy)\n", "data[\"sex\"] = pd.get_dummies(\n", "    adult.data.features[\"sex\"], drop_first=True, dtype=float\n", ").to_numpy().ravel()\n", "cols = [\n", "    \"workclass\",\n", "    \"education\",\n", "    \"marital-status\",\n", "    \"occupation\",\n", "    \"relationship\",\n", "    \"race\",\n", "    \"native-country\",\n", "]\n", "# cast the string columns to pandas category dtype; both boosters consume\n", "# this dtype natively, so no one-hot or label encoding is needed\n", "data[cols] = data[cols].astype(\"category\")\n", "data.head()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# map income to a binary target: \"<=50K\" variants -> 1, \">50K\" variants -> 0\n", "# (str.contains is robust to the trailing-period labels shown in the next cell)\n", "y = adult.data.targets[\"income\"].str.contains(\"<\").to_numpy().astype(int)\n", "y" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "set(adult.data.targets[\"income\"])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "seed = 0\n", "n_estimators = 50\n", "n_trials = 1  # a single Optuna trial keeps the notebook fast; increase for a real search" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "scoring = \"neg_log_loss\"\n", "metric_function = log_loss\n", "metric_name = \"log_loss\"\n", "LGBMBooster = LGBMClassifier\n", "objective_type = \"LogLoss\"" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(\n", "    data, y, test_size=0.2, random_state=seed\n", ")\n", "\n", "print(f\"len(X_train): {len(X_train)}\")\n", "print(f\"len(X_test): {len(X_test)}\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train.head()" ] },
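{ "cell_type": "markdown", "metadata": {}, "source": [ "Both boosters receive the `category`-dtype columns as-is. As a quick illustration (this sanity check is an addition, not part of the original benchmark), the next cell lists the dtype and cardinality of each categorical feature that the models will consume directly:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# illustrative sanity check: dtype and number of distinct categories per column\n", "pd.DataFrame(\n", "    {\n", "        \"dtype\": X_train[cols].dtypes.astype(str),\n", "        \"n_categories\": X_train[cols].nunique(),\n", "    }\n", ")" ] },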
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "best_cv_results = None\n", "cv_results = None\n", "\n", "\n", "def save_best_cv_results(study, trial):\n", "    # keep the cross-validation results of the best trial seen so far\n", "    global best_cv_results\n", "    if study.best_trial.number == trial.number:\n", "        best_cv_results = cv_results" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def objective_function(trial):\n", "    global cv_results\n", "    params = {\n", "        \"seed\": seed,\n", "        \"verbosity\": -1,\n", "        \"n_jobs\": 1,\n", "        \"n_estimators\": n_estimators,\n", "        \"learning_rate\": trial.suggest_float(\"learning_rate\", 0.001, 0.5, log=True),\n", "        \"min_split_gain\": trial.suggest_float(\"min_split_gain\", 1e-6, 1.0, log=True),\n", "        \"reg_alpha\": trial.suggest_float(\"reg_alpha\", 1e-6, 1.0, log=True),\n", "        \"reg_lambda\": trial.suggest_float(\"reg_lambda\", 1e-6, 1.0, log=True),\n", "        \"colsample_bytree\": trial.suggest_float(\"colsample_bytree\", 0.2, 1.0),\n", "        \"subsample\": trial.suggest_float(\"subsample\", 0.2, 1.0),\n", "        \"subsample_freq\": trial.suggest_int(\"subsample_freq\", 1, 10),\n", "        \"max_depth\": trial.suggest_int(\"max_depth\", 3, 33),\n", "        \"num_leaves\": trial.suggest_int(\"num_leaves\", 2, 256),\n", "        \"min_child_samples\": trial.suggest_int(\"min_child_samples\", 1, 100),\n", "    }\n", "    model = LGBMBooster(**params)\n", "    cv_results = cross_validate(\n", "        model,\n", "        X_train,\n", "        y_train,\n", "        cv=5,\n", "        scoring=scoring,\n", "        return_train_score=True,\n", "        return_estimator=True,\n", "    )\n", "    # scikit-learn reports neg_log_loss, so negate it to obtain a loss to minimize\n", "    return -1 * np.mean(cv_results[\"test_score\"])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sampler = optuna.samplers.TPESampler(seed=seed)\n", "study = optuna.create_study(direction=\"minimize\", sampler=sampler)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "study.optimize(objective_function, n_trials=n_trials, callbacks=[save_best_cv_results])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"Number of finished trials: {len(study.trials)}\")\n", "print(\"Best trial:\")\n", "print(f\"  Number: {study.best_trial.number}\")\n", "print(f\"  Value: {study.best_trial.value}\")\n", "print(\"  Params: \")\n", "for key, value in study.best_trial.params.items():\n", "    print(f\"    {key}: {value}\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"CV train scores: {-1 * best_cv_results['train_score']}\")\n", "print(\n", "    f\"CV train scores average: {round(np.mean(-1 * best_cv_results['train_score']), 6)}\"\n", ")\n", "print(f\"CV test scores: {-1 * best_cv_results['test_score']}\")\n", "print(\n", "    f\"CV test scores average: {round(np.mean(-1 * best_cv_results['test_score']), 6)}\"\n", ")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "models = best_cv_results[\"estimator\"]" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for i, model in enumerate(models):\n", "    y_pred = (\n", "        model.predict_proba(X_train)\n", "        if metric_name == \"log_loss\"\n", "        else model.predict(X_train)\n", "    )\n", "    print(\n", "        f\"Model {i}, train {metric_name}: {round(metric_function(y_train, y_pred), 6)}\"\n", "    )" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for i, model in enumerate(models):\n", "    y_pred = (\n", "        model.predict_proba(X_test)\n", "        if metric_name == \"log_loss\"\n", "        else model.predict(X_test)\n", "    )\n", "    print(f\"Model {i}, test {metric_name}: {round(metric_function(y_test, y_pred), 6)}\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# average the fold models' predictions to get an ensemble estimate on the train set\n", "if metric_name == \"log_loss\":\n", "    y_pred = np.mean([model.predict_proba(X_train) for model in models], axis=0)\n", "else:\n", "    y_pred = np.mean([model.predict(X_train) for model in models], axis=0)\n", "print(f\"Train {metric_name}: {round(metric_function(y_train, y_pred), 6)}\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if metric_name == \"log_loss\":\n", "    y_pred = np.mean([model.predict_proba(X_test) for model in models], axis=0)\n", "else:\n", "    y_pred = np.mean([model.predict(X_test) for model in models], axis=0)\n", "print(f\"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# majority vote of the fold models' class predictions\n", "y_pred = np.round(np.mean([model.predict(X_test) for model in models], axis=0))\n", "print(accuracy_score(y_test, y_pred))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# budget trades training time for accuracy; no other hyperparameters are tuned\n", "model = PerpetualBooster(budget=0.35, objective=objective_type)\n", "model.fit(X_train, y_train)" ] },
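{ "cell_type": "markdown", "metadata": {}, "source": [ "With the `LogLoss` objective, `predict` returns raw log-odds, which is why the following cells pass the scores through `expit`. As a small illustrative check (an addition to the original notebook), the next cell shows the raw scores and the implied probabilities side by side for the first few training rows:" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# illustrative: raw margin output vs. sigmoid-transformed probability\n", "raw_scores = model.predict(X_train.head())\n", "pd.DataFrame({\"raw_log_odds\": raw_scores, \"probability\": expit(raw_scores)})" ] },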
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_pred = np.round(expit(model.predict(X_test)))\n", "print(accuracy_score(y_test, y_pred))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_pred = np.round(expit(model.predict(X_train)))\n", "print(accuracy_score(y_train, y_pred))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if metric_name == \"log_loss\":\n", "    y_pred = expit(model.predict(X_test))\n", "else:\n", "    y_pred = np.round(expit(model.predict(X_test)))\n", "print(f\"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_trees = model.trees_to_dataframe()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_trees.head(10)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# the corresponding tree table for the first LightGBM fold model\n", "models[0].booster_.trees_to_dataframe().head(10000)" ] } ], "metadata": { "kernelspec": { "display_name": "py311", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }