{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "Vv6noLRoaqh0" }, "source": [ "# Scikit-learn Compatible Estimators" ] }, { "cell_type": "markdown", "metadata": { "id": "Y9pEv59la5CV" }, "source": [ "[ReHLine documentation](https://rehline-python.readthedocs.io/en/latest/)\n", "\n", "The core class `plqERM_Ridge` serves as a base implementation for both classification and regression tasks. Its subclasses, `plq_Ridge_Classifier` and `plq_Ridge_Regressor`, provide task-specific functionality while integrating seamlessly with scikit-learn utilities such as `Pipeline`, `cross_val_score`, and `GridSearchCV`. In addition, these models support common evaluation methods, allowing users to compute metrics such as accuracy scores for classification or R² values for regression." ] }, { "cell_type": "markdown", "metadata": { "id": "chXqSvec7yqI" }, "source": [ "#### Classification Example with GridSearchCV and Pipeline\n", "\n", "Here we show a classification example that combines `Pipeline`, `cross_val_score`, and `GridSearchCV`." 
] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "umXH0TZG9Zsl" }, "outputs": [], "source": [ "import numpy as np\n", "from sklearn.datasets import make_classification\n", "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", "from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "Qh3tOux-9gZ9" }, "outputs": [], "source": [ "# generate the dataset\n", "X, y = make_classification(\n", " n_samples=2000,\n", " n_features=20,\n", " n_informative=8,\n", " n_redundant=4,\n", " n_repeated=0,\n", " n_classes=2,\n", " weights=[0.7, 0.3], # imbalance\n", " class_sep=1.2,\n", " flip_y=0.01,\n", " random_state=42,\n", ")\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "_MR1vTRc93xM" }, "outputs": [], "source": [ "from rehline import plq_Ridge_Classifier\n", "\n", "# set the pipeline\n", "pipe = Pipeline(\n", " [\n", " (\"scaler\", StandardScaler()),\n", " (\"clf\", plq_Ridge_Classifier(loss={\"name\": \"svm\"})),\n", " ]\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "id": "tfGUG7auABMG" }, "outputs": [], "source": [ "# set the parameter grid\n", "param_grid = {\n", " \"clf__loss\": [{\"name\": \"svm\"}, {\"name\": \"sSVM\"}],\n", " \"clf__C\": [0.1, 1.0, 3.0],\n", " \"clf__fit_intercept\": [True, False],\n", " \"clf__intercept_scaling\": [0.5, 1.0, 2.0],\n", " \"clf__max_iter\": [5000, 10000],\n", " \"clf__class_weight\": [None, \"balanced\", {0: 1.0, 1: 2.0}],\n", " \"clf__constraint\": [\n", " [], # no constraint\n", " [{\"name\": \"nonnegative\"}],\n", " [{\"name\": \"fair\", \"sen_idx\": [0], \"tol_sen\": 0.1}],\n", " ],\n", "}" ] }, { "cell_type": "code", 
"execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LBsUuv6bBW00", "outputId": "fbd50af6-a23a-4eb9-c0dc-0c1e538c9574" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CV scores: [0.79333333 0.82 0.82333333 0.81 0.80666667]\n" ] } ], "source": [ "# cross_val_score function\n", "cv_scores = cross_val_score(\n", " pipe,\n", " X_train,\n", " y_train,\n", " cv=5,\n", " scoring=\"accuracy\",\n", " n_jobs=-1,\n", ")\n", "print(\"CV scores:\", cv_scores)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 207 }, "id": "s0Ur4GgIAGET", "outputId": "f2fee1ba-d348-472f-9cb2-8c5d62a776ba" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 5 folds for each of 648 candidates, totalling 3240 fits\n" ] }, { "data": { "text/html": [ "
GridSearchCV(cv=5,\n",
" estimator=Pipeline(steps=[('scaler', StandardScaler()),\n",
" ('clf',\n",
" plq_Ridge_Classifier(loss={'name': 'svm'}))]),\n",
" n_jobs=-1,\n",
" param_grid={'clf__C': [0.1, 1.0, 3.0],\n",
" 'clf__class_weight': [None, 'balanced',\n",
" {0: 1.0, 1: 2.0}],\n",
" 'clf__constraint': [[], [{'name': 'nonnegative'}],\n",
" [{'name': 'fair', 'sen_idx': [0],\n",
" 'tol_sen': 0.1}]],\n",
" 'clf__fit_intercept': [True, False],\n",
" 'clf__intercept_scaling': [0.5, 1.0, 2.0],\n",
" 'clf__loss': [{'name': 'svm'}, {'name': 'sSVM'}],\n",
" 'clf__max_iter': [5000, 10000]},\n",
" scoring='accuracy', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,\n",
" estimator=Pipeline(steps=[('scaler', StandardScaler()),\n",
" ('clf',\n",
" plq_Ridge_Classifier(loss={'name': 'svm'}))]),\n",
" n_jobs=-1,\n",
" param_grid={'clf__C': [0.1, 1.0, 3.0],\n",
" 'clf__class_weight': [None, 'balanced',\n",
" {0: 1.0, 1: 2.0}],\n",
" 'clf__constraint': [[], [{'name': 'nonnegative'}],\n",
" [{'name': 'fair', 'sen_idx': [0],\n",
" 'tol_sen': 0.1}]],\n",
" 'clf__fit_intercept': [True, False],\n",
" 'clf__intercept_scaling': [0.5, 1.0, 2.0],\n",
" 'clf__loss': [{'name': 'svm'}, {'name': 'sSVM'}],\n",
" 'clf__max_iter': [5000, 10000]},\n",
" scoring='accuracy', verbose=1)Pipeline(steps=[('scaler', StandardScaler()),\n",
" ('clf',\n",
" plq_Ridge_Classifier(C=0.1,\n",
" constraint=[{'name': 'fair',\n",
" 'sen_idx': [0],\n",
" 'tol_sen': 0.1}],\n",
" loss={'name': 'svm'}, max_iter=5000))])StandardScaler()
plq_Ridge_Classifier(C=0.1,\n",
" constraint=[{'name': 'fair', 'sen_idx': [0],\n",
" 'tol_sen': 0.1}],\n",
" loss={'name': 'svm'}, max_iter=5000)GridSearchCV(cv=5,\n",
" estimator=Pipeline(steps=[('scaler', StandardScaler()),\n",
" ('reg', plq_Ridge_Regressor())]),\n",
" n_jobs=-1,\n",
" param_grid={'reg__C': [0.1, 1.0, 10.0],\n",
" 'reg__constraint': [[], [{'name': 'nonnegative'}],\n",
" [{'name': 'fair', 'sen_idx': [0],\n",
" 'tol_sen': 0.1}]],\n",
" 'reg__fit_intercept': [True, False],\n",
" 'reg__intercept_scaling': [0.5, 1.0],\n",
" 'reg__loss': [{'name': 'QR', 'qt': 0.5},\n",
" {'name': 'huber', 'tau': 1.0},\n",
" {'epsilon': 0.1, 'name': 'SVR'}],\n",
" 'reg__max_iter': [5000, 8000]},\n",
" scoring='r2', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,\n",
" estimator=Pipeline(steps=[('scaler', StandardScaler()),\n",
" ('reg', plq_Ridge_Regressor())]),\n",
" n_jobs=-1,\n",
" param_grid={'reg__C': [0.1, 1.0, 10.0],\n",
" 'reg__constraint': [[], [{'name': 'nonnegative'}],\n",
" [{'name': 'fair', 'sen_idx': [0],\n",
" 'tol_sen': 0.1}]],\n",
" 'reg__fit_intercept': [True, False],\n",
" 'reg__intercept_scaling': [0.5, 1.0],\n",
" 'reg__loss': [{'name': 'QR', 'qt': 0.5},\n",
" {'name': 'huber', 'tau': 1.0},\n",
" {'epsilon': 0.1, 'name': 'SVR'}],\n",
" 'reg__max_iter': [5000, 8000]},\n",
" scoring='r2', verbose=1)Pipeline(steps=[('scaler', StandardScaler()),\n",
" ('reg',\n",
" plq_Ridge_Regressor(C=10.0,\n",
" constraint=[{'name': 'nonnegative'}],\n",
" loss={'epsilon': 0.1, 'name': 'SVR'},\n",
" max_iter=8000))])StandardScaler()
plq_Ridge_Regressor(C=10.0, constraint=[{'name': 'nonnegative'}],\n",
" loss={'epsilon': 0.1, 'name': 'SVR'}, max_iter=8000)