Quick Start with Toy Datasets
This tutorial provides a quick start guide using classic toy datasets from scikit-learn. You will see how to train a Perpetual model and evaluate its performance in just a few lines of code.
[ ]:
import logging
import time
from importlib.metadata import version
import pandas as pd
from perpetual import PerpetualBooster
from sklearn.datasets import load_breast_cancer, load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
[ ]:
# Report the installed versions of the libraries this tutorial relies on,
# so that results can be reproduced against the same releases.
for package in ("scikit-learn", "perpetual"):
    print(f"{package}: {version(package)}")
[ ]:
# Configure the root logger at INFO level so that INFO-and-above log records
# emitted while the following cells run are shown.
logging.basicConfig(level=logging.INFO)
[ ]:
def evaluate(model, X_train, y_train, X_test, y_test, budget=None):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
            When ``budget`` is given it must also accept a ``budget``
            attribute and expose ``number_of_trees`` after fitting.
        X_train: Training features passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Held-out features used for prediction.
        y_test: Held-out labels the predictions are scored against.
        budget: Optional value assigned to ``model.budget`` before fitting.
            ``None`` (the default) leaves the model's own setting untouched.

    Returns:
        A ``(duration, accuracy, log_loss)`` tuple, where ``duration`` is the
        number of seconds spent inside ``model.fit``.
    """
    # Compare against None explicitly so that an explicit budget of 0 is
    # still applied instead of being silently ignored.
    has_budget = budget is not None
    if has_budget:
        model.budget = budget

    # perf_counter is monotonic and high resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    duration = time.perf_counter() - start

    # Report the ensemble size only after the clock has stopped, so console
    # I/O is not counted as training time.
    if has_budget:
        print(model.number_of_trees)

    return (
        duration,
        accuracy_score(y_test, model.predict(X_test)),
        log_loss(y_test, model.predict_proba(X_test)),
    )
# Budget handed to PerpetualBooster for every dataset; kept in one place so
# the value that is used and the value that is reported cannot drift apart.
BUDGET = 0.1

# Load iris once and drop class 2 to obtain a two-class problem.
iris_X, iris_y = load_iris(return_X_y=True)
binary_mask = iris_y != 2

# Mapping of display name -> (features, labels).
datasets = {
    "Breast Cancer": load_breast_cancer(return_X_y=True),
    "Binary Iris": (iris_X[binary_mask], iris_y[binary_mask]),
}

result_columns = ["Dataset", "Model", "Budget", "Time", "Accuracy", "Log Loss"]

# Collect one row per (dataset, model) and build the frame once at the end.
# Growing a DataFrame with pd.concat inside the loop copies it on every
# iteration, and concatenating onto an empty frame is deprecated in recent
# pandas releases.
rows = []
for name, (X, y) in datasets.items():
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    pb = PerpetualBooster(
        objective="LogLoss", log_iterations=0, stopping_rounds=1, iteration_limit=1000
    )
    # NOTE(review): rf is instantiated but never evaluated in this cell; it is
    # kept in case a later cell relies on it - confirm and either benchmark
    # it here or remove it.
    rf = RandomForestClassifier()
    rows.append(
        [
            name,
            "Perpetual",
            str(BUDGET),
            *evaluate(pb, X_train, y_train, X_test, y_test, budget=BUDGET),
        ]
    )

results = pd.DataFrame(rows, columns=result_columns)
[ ]:
# Bare expression as the last statement of the cell: the notebook renders the
# collected results table.
results