{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Quick Start with Toy Datasets\n",
    "\n",
    "This tutorial provides a quick start guide using classic toy datasets from scikit-learn. You will see how to train a Perpetual model and evaluate its performance in just a few lines of code."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import time\n",
    "from importlib.metadata import version\n",
    "\n",
    "import pandas as pd\n",
    "from perpetual import PerpetualBooster\n",
    "from sklearn.datasets import load_breast_cancer, load_iris\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score, log_loss\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the installed scikit-learn and perpetual versions as reported by\n",
    "# importlib.metadata.\n",
    "print(f\"scikit-learn: {version('scikit-learn')}\")\n",
    "print(f\"perpetual: {version('perpetual')}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure the root logger to emit records at INFO level and above.\n",
    "logging.basicConfig(level=logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate(model, X_train, y_train, X_test, y_test, budget=None):\n",
    "    \"\"\"Fit ``model`` on the training split and score it on the test split.\n",
    "\n",
    "    Args:\n",
    "        model: Estimator providing ``fit``, ``predict`` and ``predict_proba``.\n",
    "        X_train: Training features passed to ``model.fit``.\n",
    "        y_train: Training labels passed to ``model.fit``.\n",
    "        X_test: Held-out features used for prediction.\n",
    "        y_test: Held-out labels the predictions are scored against.\n",
    "        budget: Optional value assigned to ``model.budget`` before fitting.\n",
    "            When given, ``model.number_of_trees`` is printed after the fit.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(fit_seconds, accuracy, log_loss)``.\n",
    "    \"\"\"\n",
    "    # Compare against None so an explicit budget of 0 is not silently ignored.\n",
    "    has_budget = budget is not None\n",
    "    if has_budget:\n",
    "        model.budget = budget\n",
    "    # perf_counter is monotonic, so the measurement cannot be skewed by system\n",
    "    # clock adjustments the way time.time() can.\n",
    "    start = time.perf_counter()\n",
    "    model.fit(X_train, y_train)\n",
    "    duration = time.perf_counter() - start\n",
    "    if has_budget:\n",
    "        # Printed after the timer stops so the output does not inflate the timing.\n",
    "        print(model.number_of_trees)\n",
    "    return (\n",
    "        duration,\n",
    "        accuracy_score(y_test, model.predict(X_test)),\n",
    "        log_loss(y_test, model.predict_proba(X_test)),\n",
    "    )\n",
    "\n",
    "\n",
    "BUDGET = 0.1\n",
    "RESULT_COLUMNS = [\"Dataset\", \"Model\", \"Budget\", \"Time\", \"Accuracy\", \"Log Loss\"]\n",
    "\n",
    "# Load iris once and drop class 2 so the task becomes binary.\n",
    "iris_X, iris_y = load_iris(return_X_y=True)\n",
    "is_binary_class = iris_y != 2\n",
    "\n",
    "datasets = {\n",
    "    \"Breast Cancer\": load_breast_cancer(return_X_y=True),\n",
    "    \"Binary Iris\": (iris_X[is_binary_class], iris_y[is_binary_class]),\n",
    "}\n",
    "\n",
    "# Collect one row per run and build the frame once at the end instead of\n",
    "# re-copying it with pd.concat on every iteration.\n",
    "rows = []\n",
    "for name, (X, y) in datasets.items():\n",
    "    X_train, X_test, y_train, y_test = train_test_split(\n",
    "        X, y, test_size=0.2, random_state=42\n",
    "    )\n",
    "    pb = PerpetualBooster(\n",
    "        objective=\"LogLoss\", log_iterations=0, stopping_rounds=1, iteration_limit=1000\n",
    "    )\n",
    "    rows.append(\n",
    "        [\n",
    "            name,\n",
    "            \"Perpetual\",\n",
    "            str(BUDGET),\n",
    "            *evaluate(pb, X_train, y_train, X_test, y_test, budget=BUDGET),\n",
    "        ]\n",
    "    )\n",
    "\n",
    "results = pd.DataFrame(rows, columns=RESULT_COLUMNS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py311",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}