{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Quick Start with Toy Datasets\n",
    "\n",
    "This tutorial is a quick-start guide that uses classic toy datasets from scikit-learn. It shows how to train a Perpetual model, evaluate its accuracy and log loss, and compare it with a random-forest baseline in just a few lines of code."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import time\n",
    "from importlib.metadata import version\n",
    "\n",
    "import pandas as pd\n",
    "from perpetual import PerpetualBooster\n",
    "from sklearn.datasets import load_breast_cancer, load_iris\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score, log_loss\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup\n",
    "\n",
    "Print the library versions used for this run, enable logging, and define the constants shared by the cells below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"scikit-learn: {version('scikit-learn')}\")\n",
    "print(f\"perpetual: {version('perpetual')}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "logging.basicConfig(level=logging.INFO)\n",
    "\n",
    "SEED = 42  # seeds the train/test split and the random-forest baseline\n",
    "BUDGET = 0.1  # value assigned to PerpetualBooster.budget before fitting\n",
    "TEST_SIZE = 0.2  # fraction of each dataset held out for evaluation\n",
    "RESULT_COLUMNS = [\"Dataset\", \"Model\", \"Budget\", \"Time\", \"Accuracy\", \"Log Loss\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluation helper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate(model, X_train, y_train, X_test, y_test, budget=None):\n",
    "    \"\"\"Fit ``model`` on the training split and score it on the test split.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    model\n",
    "        Estimator exposing ``fit``, ``predict`` and ``predict_proba``.\n",
    "    X_train, y_train\n",
    "        Training features and labels.\n",
    "    X_test, y_test\n",
    "        Held-out features and labels.\n",
    "    budget\n",
    "        Optional value assigned to ``model.budget`` before fitting. When it is\n",
    "        given, ``model.number_of_trees`` is printed after the fit. Leave as\n",
    "        ``None`` for models that have no ``budget`` attribute.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        ``(fit_seconds, accuracy, log_loss)`` measured on the test split.\n",
    "    \"\"\"\n",
    "    has_budget = budget is not None\n",
    "    if has_budget:\n",
    "        model.budget = budget\n",
    "\n",
    "    # Time only the fit; perf_counter is monotonic, unlike time.time().\n",
    "    start = time.perf_counter()\n",
    "    model.fit(X_train, y_train)\n",
    "    duration = time.perf_counter() - start\n",
    "\n",
    "    if has_budget:\n",
    "        # Printed after the timer stops so console I/O does not inflate the timing.\n",
    "        print(model.number_of_trees)\n",
    "\n",
    "    return (\n",
    "        duration,\n",
    "        accuracy_score(y_test, model.predict(X_test)),\n",
    "        log_loss(y_test, model.predict_proba(X_test)),\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Datasets\n",
    "\n",
    "Both problems are binary: the breast-cancer dataset as is, and Iris with class `2` removed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Iris once and keep only classes 0 and 1.\n",
    "iris_X, iris_y = load_iris(return_X_y=True)\n",
    "iris_is_binary = iris_y != 2\n",
    "\n",
    "datasets = {\n",
    "    \"Breast Cancer\": load_breast_cancer(return_X_y=True),\n",
    "    \"Binary Iris\": (iris_X[iris_is_binary], iris_y[iris_is_binary]),\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run the benchmark\n",
    "\n",
    "Each dataset is split once, then a Perpetual model and a random-forest baseline are fitted on the same split."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect one record per (dataset, model) and build the frame once at the end;\n",
    "# growing a DataFrame with pd.concat inside the loop is quadratic and warns\n",
    "# on recent pandas when the initial frame is empty.\n",
    "records = []\n",
    "\n",
    "for name, (X, y) in datasets.items():\n",
    "    X_train, X_test, y_train, y_test = train_test_split(\n",
    "        X, y, test_size=TEST_SIZE, random_state=SEED\n",
    "    )\n",
    "    models = [\n",
    "        (\n",
    "            \"Perpetual\",\n",
    "            PerpetualBooster(\n",
    "                objective=\"LogLoss\",\n",
    "                log_iterations=0,\n",
    "                stopping_rounds=1,\n",
    "                iteration_limit=1000,\n",
    "            ),\n",
    "            BUDGET,\n",
    "        ),\n",
    "        # The baseline has no budget, so evaluate() leaves it untouched.\n",
    "        (\"Random Forest\", RandomForestClassifier(random_state=SEED), None),\n",
    "    ]\n",
    "    for model_name, model, budget in models:\n",
    "        duration, accuracy, loss = evaluate(\n",
    "            model, X_train, y_train, X_test, y_test, budget=budget\n",
    "        )\n",
    "        records.append(\n",
    "            {\n",
    "                \"Dataset\": name,\n",
    "                \"Model\": model_name,\n",
    "                \"Budget\": budget,\n",
    "                \"Time\": duration,\n",
    "                \"Accuracy\": accuracy,\n",
    "                \"Log Loss\": loss,\n",
    "            }\n",
    "        )\n",
    "\n",
    "results = pd.DataFrame(records, columns=RESULT_COLUMNS)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results\n",
    "\n",
    "`Time` is the fit duration in seconds; `Budget` is empty for the baseline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py311",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}