Explainability
Perpetual provides several methods to interpret the model and understand its predictions. These methods include Feature Importance, Partial Dependence, and Prediction Contributions (SHAP-like values).
Feature Importance
Feature importance scores indicate how useful each feature was for the construction of the boosted decision trees. Perpetual supports several importance metrics:
Gain: Average improvement in loss brought by a feature.
Weight: Number of times a feature is used in splits.
Cover: Average number of samples affected by splits on a feature.
TotalGain: Total improvement in loss brought by a feature.
TotalCover: Total number of samples affected by splits on a feature.
importance = model.calculate_feature_importance(method="Gain", normalize=True)
print(importance)
Partial Dependence
Partial dependence plots (PDP) show the dependence between the target response and a set of input features, marginalizing over the values of all other features.
pd_values = model.partial_dependence(X, feature="feature_name", samples=100)
# pd_values is an array where col 0 is feature value and col 1 is the predicted value
Prediction Contributions
Perpetual can calculate the contribution of each feature to a specific prediction. This is often referred to as SHAP (SHapley Additive exPlanations) values. The sum of the contributions plus the bias term equals the model’s raw prediction (e.g., log-odds for classification).
contributions = model.predict_contributions(X_sample, method="Average")
# contributions[:, :-1] are the feature contributions
# contributions[:, -1] is the bias (expected value)
Example
Here is a complete example demonstrating these explainability methods:
1import numpy as np
2import pandas as pd
3from perpetual import PerpetualBooster
4from sklearn.datasets import make_classification, make_regression
5from sklearn.model_selection import train_test_split
6
7
def explain_classification():
    """Walk through Perpetual's explainability APIs on a synthetic
    binary-classification problem: feature importance, partial dependence,
    and SHAP-like prediction contributions.
    """
    print("\n--- Classification Explainability ---")

    # Synthetic data: 10 features, 5 of them informative.
    features, labels = make_classification(
        n_samples=1000, n_features=10, n_informative=5, random_state=42
    )
    names = [f"feature_{i}" for i in range(features.shape[1])]
    frame = pd.DataFrame(features, columns=names)

    X_train, X_test, y_train, y_test = train_test_split(
        frame, labels, test_size=0.2, random_state=42
    )

    # Fit a classifier with the LogLoss objective.
    booster = PerpetualBooster(objective="LogLoss", budget=1.0)
    booster.fit(X_train, y_train)

    # 1. Normalized gain-based importance, highest first.
    print("\n1. Feature Importance (Gain):")
    gains = booster.calculate_feature_importance(method="Gain", normalize=True)
    ranked = sorted(gains.items(), key=lambda kv: kv[1], reverse=True)
    for feat, imp in ranked[:5]:  # top 5 only
        print(f"{feat}: {imp:.4f}")

    # 2. Partial dependence curve for the most important feature.
    print("\n2. Partial Dependence for top feature:")
    top_feature, _ = ranked[0]
    curve = booster.partial_dependence(X_train, feature=top_feature, samples=10)
    print(f"Values for {top_feature}:")
    print(curve)

    # 3. SHAP-like per-feature contributions for two held-out rows.
    print("\n3. Prediction Contributions (SHAP-like) for first 2 samples:")
    contribs = booster.predict_contributions(X_test.iloc[:2], method="Average")

    # Shape is (n_samples, n_features + 1); the final column is the bias term.
    bias = contribs[:, -1]
    per_feature = contribs[:, :-1]

    for i in range(2):
        print(f"\nSample {i}:")
        print(f"Bias: {bias[i]:.4f}")
        print("Top 3 contributing features:")
        # Indices of the three largest contributions by absolute magnitude.
        order = np.argsort(np.abs(per_feature[i]))[-3:][::-1]
        for idx in order:
            print(f"{names[idx]}: {per_feature[i, idx]:.4f}")

        prediction = booster.predict(X_test.iloc[[i]])[0]
        # For LogLoss, bias + contributions reconstruct the log-odds output.
        sum_contribs = bias[i] + per_feature[i].sum()
        print(f"Sum of contributions: {sum_contribs:.4f}")
        print(f"Model prediction: {prediction:.4f}")
64
def explain_regression():
    """Show cover-based feature importance on a synthetic regression task."""
    print("\n--- Regression Explainability ---")

    # Synthetic regression data: 10 features, 5 informative.
    data, target = make_regression(
        n_samples=1000, n_features=10, n_informative=5, random_state=42
    )
    columns = [f"feature_{i}" for i in range(data.shape[1])]
    table = pd.DataFrame(data, columns=columns)

    X_train, X_test, y_train, y_test = train_test_split(
        table, target, test_size=0.2, random_state=42
    )

    # Fit a regressor with squared-error loss.
    regressor = PerpetualBooster(objective="SquaredLoss", budget=1.0)
    regressor.fit(X_train, y_train)

    # Normalized cover-based importance, highest first; show top 5.
    print("\n1. Feature Importance (Cover):")
    scores = regressor.calculate_feature_importance(method="Cover", normalize=True)
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    for feat, imp in ranked[:5]:
        print(f"{feat}: {imp:.4f}")
86
if __name__ == "__main__":
    # Run both explainability walkthroughs when executed as a script.
    explain_classification()
    explain_regression()