Scikit-learn Compatible Estimators

Slides

The core class plqERM_Ridge serves as the base implementation for both classification and regression tasks. Its subclasses, plq_Ridge_Classifier and plq_Ridge_Regressor, provide task-specific functionality while integrating seamlessly with scikit-learn utilities such as Pipeline, cross_val_score, and GridSearchCV. In addition, these models support common evaluation methods, allowing users to compute metrics such as accuracy scores for classification or R² values for regression.

Classification Example with GridSearchCV and Pipeline

Here we show a classification example that uses Pipeline, cross_val_score, and GridSearchCV.

[2]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
[3]:
# Build a synthetic binary classification problem with a 70/30 class
# imbalance and a small amount of label noise (flip_y).
X, y = make_classification(
    n_samples=2000,
    n_classes=2,
    weights=[0.7, 0.3],  # imbalance
    n_features=20,
    n_informative=8,
    n_redundant=4,
    n_repeated=0,
    class_sep=1.2,
    flip_y=0.01,
    random_state=42,
)

# Hold out 25% of the samples for testing; stratify on y so that the
# train and test splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)
[7]:
from rehline import plq_Ridge_Classifier

# Two-step pipeline: standardize the features, then fit the classifier.
# The step names ("scaler", "clf") are what the "clf__..." keys of the
# parameter grid refer to.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", plq_Ridge_Classifier(loss={"name": "svm"})),
    ]
)
[11]:
# Parameter grid for the "clf" pipeline step. scikit-learn addresses the
# parameters of a pipeline step as "<step name>__<parameter>", so the
# "clf__" prefix is added to every entry here.
param_grid = {
    f"clf__{name}": values
    for name, values in {
        "loss": [{"name": "svm"}, {"name": "sSVM"}],
        "C": [0.1, 1.0, 3.0],
        "fit_intercept": [True, False],
        "intercept_scaling": [0.5, 1.0, 2.0],
        "max_iter": [5000, 10000],
        "class_weight": [None, "balanced", {0: 1.0, 1: 2.0}],
        "constraint": [
            [],  # no constraint
            [{"name": "nonnegative"}],
            [{"name": "fair", "sen_idx": [0], "tol_sen": 0.1}],
        ],
    }.items()
}
[12]:
# Baseline before tuning: 5-fold cross-validated accuracy of the pipeline
# as configured above (n_jobs=-1 runs the folds in parallel).
cv_scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
print("CV scores:", cv_scores)
CV scores: [0.79333333 0.82       0.82333333 0.81       0.80666667]
[13]:
# Exhaustive search over param_grid: each candidate is scored by 5-fold
# cross-validated accuracy, and refit=True retrains the best candidate on
# the full training split once the search finishes.
grid = GridSearchCV(
    pipe,
    param_grid,
    cv=5,
    scoring="accuracy",
    refit=True,
    n_jobs=-1,
    verbose=1,
)

grid.fit(X_train, y_train)
Fitting 5 folds for each of 648 candidates, totalling 3240 fits
[13]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('clf',
                                        plq_Ridge_Classifier(loss={'name': 'svm'}))]),
             n_jobs=-1,
             param_grid={'clf__C': [0.1, 1.0, 3.0],
                         'clf__class_weight': [None, 'balanced',
                                               {0: 1.0, 1: 2.0}],
                         'clf__constraint': [[], [{'name': 'nonnegative'}],
                                             [{'name': 'fair', 'sen_idx': [0],
                                               'tol_sen': 0.1}]],
                         'clf__fit_intercept': [True, False],
                         'clf__intercept_scaling': [0.5, 1.0, 2.0],
                         'clf__loss': [{'name': 'svm'}, {'name': 'sSVM'}],
                         'clf__max_iter': [5000, 10000]},
             scoring='accuracy', verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
[14]:
# Report the best hyperparameter combination found by the search and its
# cross-validated accuracy (the search was run with scoring="accuracy").
print("Best params:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)
Best params: {'clf__C': 0.1, 'clf__class_weight': None, 'clf__constraint': [{'name': 'fair', 'sen_idx': [0], 'tol_sen': 0.1}], 'clf__fit_intercept': True, 'clf__intercept_scaling': 1.0, 'clf__loss': {'name': 'svm'}, 'clf__max_iter': 5000}
Best CV accuracy: 0.8146666666666667
[15]:
# Evaluate the refit best pipeline on the held-out test split
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
test_acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Test accuracy:", test_acc)
print(
    "\nClassification report:\n",
    classification_report(y_true=y_test, y_pred=y_pred, digits=4),
)
print("Confusion matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))
Test accuracy: 0.802

Classification report:
               precision    recall  f1-score   support

           0     0.8094    0.9370    0.8685       349
           1     0.7708    0.4901    0.5992       151

    accuracy                         0.8020       500
   macro avg     0.7901    0.7135    0.7339       500
weighted avg     0.7978    0.8020    0.7872       500

Confusion matrix:
 [[327  22]
 [ 77  74]]

Regression Example

Here we show a regression example that uses Pipeline, cross_val_score, and GridSearchCV.

[12]:
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
[13]:
# Build a synthetic regression problem: 15 features, 10 of them informative,
# with Gaussian noise added to the target.
X, y = make_regression(
    n_samples=1500,
    n_features=15,
    n_informative=10,
    noise=10.0,
    random_state=42,
)

# Hold out 25% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
[14]:
from rehline import plq_Ridge_Regressor

# Two-step pipeline: standardize the features, then fit the regressor with
# the "QR" loss at qt=0.5. The step names ("scaler", "reg") are what the
# "reg__..." keys of the parameter grid refer to.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("reg", plq_Ridge_Regressor(loss={"name": "QR", "qt": 0.5})),
    ]
)
[18]:
# Parameter grid for the "reg" pipeline step. scikit-learn addresses the
# parameters of a pipeline step as "<step name>__<parameter>", so the
# "reg__" prefix is added to every entry here.
param_grid = {
    f"reg__{name}": values
    for name, values in {
        "loss": [
            {"name": "QR", "qt": 0.5},
            {"name": "huber", "tau": 1.0},  # Huber needs tau
            {"name": "SVR", "epsilon": 0.1},  # SVR needs epsilon
        ],
        "C": [0.1, 1.0, 10.0],
        "fit_intercept": [True, False],
        "intercept_scaling": [0.5, 1.0],
        "max_iter": [5000, 8000],
        "constraint": [
            [],  # no constraint
            [{"name": "nonnegative"}],
            [{"name": "fair", "sen_idx": [0], "tol_sen": 0.1}],
        ],
    }.items()
}
[19]:
# Baseline before tuning: 5-fold cross-validated R^2 of the pipeline as
# configured above (n_jobs=-1 runs the folds in parallel).
cv_scores = cross_val_score(
    pipe,
    X_train,
    y_train,
    cv=5,
    scoring="r2",
    n_jobs=-1,
)
print("CV R^2 scores:", cv_scores)
# cross_val_score returns a NumPy array, so use its own .mean(): the imports
# of this regression example do not bring in `np`, and calling np.mean here
# would fail when the example is run on its own.
print("Mean CV R^2:", cv_scores.mean())
CV R^2 scores: [0.99578266 0.99573973 0.99608371 0.99525645 0.9949942 ]
Mean CV R^2: 0.9955713512215377
[20]:
# Exhaustive search over param_grid: each candidate is scored by 5-fold
# cross-validated R^2, and refit=True retrains the best candidate on the
# full training split once the search finishes.
grid = GridSearchCV(
    pipe,
    param_grid,
    cv=5,
    scoring="r2",
    refit=True,
    n_jobs=-1,
    verbose=1,
)

grid.fit(X_train, y_train)
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
/usr/local/lib/python3.12/dist-packages/rehline/_class.py:419: ConvergenceWarning: ReHLine failed to converge, increase the number of iterations: `max_iter`.
  warnings.warn(
[20]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('reg', plq_Ridge_Regressor())]),
             n_jobs=-1,
             param_grid={'reg__C': [0.1, 1.0, 10.0],
                         'reg__constraint': [[], [{'name': 'nonnegative'}],
                                             [{'name': 'fair', 'sen_idx': [0],
                                               'tol_sen': 0.1}]],
                         'reg__fit_intercept': [True, False],
                         'reg__intercept_scaling': [0.5, 1.0],
                         'reg__loss': [{'name': 'QR', 'qt': 0.5},
                                       {'name': 'huber', 'tau': 1.0},
                                       {'epsilon': 0.1, 'name': 'SVR'}],
                         'reg__max_iter': [5000, 8000]},
             scoring='r2', verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
[22]:
# Report the best hyperparameter combination found by the search and its
# cross-validated R^2 (the search was run with scoring="r2").
print("Best params:", grid.best_params_)
print("Best CV R^2:", grid.best_score_)
Best params: {'reg__C': 10.0, 'reg__constraint': [{'name': 'nonnegative'}], 'reg__fit_intercept': True, 'reg__intercept_scaling': 1.0, 'reg__loss': {'name': 'SVR', 'epsilon': 0.1}, 'reg__max_iter': 8000}
Best CV R^2: 0.9967851378070526
[23]:
# Evaluate the refit best pipeline on the held-out test split
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

print("Test R^2:", r2_score(y_true=y_test, y_pred=y_pred))
print("Test MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred))
Test R^2: 0.9968147697852413
Test MSE: 103.43336817904354