Note

Click here to download the full example code or to run this example in your browser via Binder

Extending Auto-Sklearn with Preprocessor Component¶

The following example demonstrates how to create a wrapper around the linear discriminant analysis (LDA) algorithm from sklearn and use it as a preprocessor in auto-sklearn.

from typing import Optional
from pprint import pprint

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    UniformFloatHyperparameter,
    CategoricalHyperparameter,
)
from ConfigSpace.conditions import InCondition

import sklearn.metrics

from autosklearn.askl_typing import FEAT_TYPE_TYPE
import autosklearn.classification
import autosklearn.pipeline.components.feature_preprocessing
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import DENSE, SIGNED_DATA, UNSIGNED_DATA
from autosklearn.util.common import check_none

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

Create LDA component for auto-sklearn¶

class LDA(AutoSklearnPreprocessingAlgorithm):
    """Auto-sklearn feature preprocessor backed by linear discriminant analysis.

    The wrapped estimator is
    ``sklearn.discriminant_analysis.LinearDiscriminantAnalysis``; it is only
    instantiated inside :meth:`fit`, so a freshly constructed component holds
    nothing but its hyperparameters.

    NOTE(review): scikit-learn's documentation indicates that
    ``LinearDiscriminantAnalysis.transform`` is not available for
    ``solver="lsqr"`` -- confirm whether "lsqr" should stay in the search
    space of a component that is used through ``transform``.
    """

    def __init__(self, solver, tol, shrinkage=None, random_state=None):
        """Record the hyperparameters without building the estimator.

        Parameters
        ----------
        solver
            Solver name forwarded to the wrapped estimator.
        tol
            Tolerance forwarded to the wrapped estimator (cast to ``float``
            in :meth:`fit`).
        shrinkage
            Shrinkage forwarded to the wrapped estimator; values recognised
            by ``check_none`` are replaced with ``None`` in :meth:`fit`.
        random_state
            Stored on the instance; not forwarded to the wrapped estimator.
        """
        self.solver = solver
        self.shrinkage = shrinkage
        self.tol = tol
        self.random_state = random_state
        # Stays None until fit() has been called; transform() relies on this.
        self.preprocessor = None

    def fit(self, X, y=None):
        """Normalise the hyperparameters, then build and fit the estimator.

        ``self.shrinkage`` and ``self.tol`` are overwritten in place with
        their normalised values. Returns ``self``.
        """
        # Shrinkage may arrive in a "none-like" form (as judged by
        # check_none); everything else is coerced to float.
        self.shrinkage = (
            None if check_none(self.shrinkage) else float(self.shrinkage)
        )
        self.tol = float(self.tol)

        # Imported lazily, inside fit, rather than at module import time.
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

        estimator = LinearDiscriminantAnalysis(
            solver=self.solver,
            shrinkage=self.shrinkage,
            tol=self.tol,
        )
        # Publish the estimator before fitting it, so the attribute is set
        # even if the underlying fit raises.
        self.preprocessor = estimator
        estimator.fit(X, y)
        return self

    def transform(self, X):
        """Apply the fitted estimator's ``transform`` to ``X``.

        Raises
        ------
        NotImplementedError
            If :meth:`fit` has not produced an estimator yet.
        """
        fitted = self.preprocessor
        if fitted is None:
            raise NotImplementedError()
        return fitted.transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static capability description of this component.

        ``dataset_properties`` is accepted but not consulted.
        """
        # The same data kinds are declared for both input and output.
        data_kinds = (DENSE, UNSIGNED_DATA, SIGNED_DATA)
        properties = {
            "shortname": "LDA",
            "name": "Linear Discriminant Analysis",
            "handles_regression": False,
            "handles_classification": True,
            "handles_multiclass": False,
            "handles_multilabel": False,
            "handles_multioutput": False,
            "is_deterministic": True,
            "input": data_kinds,
            "output": data_kinds,
        }
        return properties

    @staticmethod
    def get_hyperparameter_search_space(
        feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None
    ):
        """Build the configuration space searched for this component.

        The space contains ``solver`` (categorical), ``shrinkage`` and
        ``tol`` (uniform floats). ``shrinkage`` is conditional: it is only
        active when ``solver`` is "lsqr" or "eigen". Neither ``feat_type``
        nor ``dataset_properties`` is consulted.
        """
        solver_hp = CategoricalHyperparameter(
            name="solver", choices=["svd", "lsqr", "eigen"], default_value="svd"
        )
        shrinkage_hp = UniformFloatHyperparameter(
            name="shrinkage", lower=0.0, upper=1.0, default_value=0.5
        )
        tol_hp = UniformFloatHyperparameter(
            name="tol", lower=0.0001, upper=1, default_value=0.0001
        )

        space = ConfigurationSpace()
        space.add_hyperparameters([solver_hp, shrinkage_hp, tol_hp])
        # Only expose shrinkage for the solvers listed here.
        space.add_condition(
            InCondition(shrinkage_hp, solver_hp, ["lsqr", "eigen"])
        )
        return space


# Add LDA component to auto-sklearn: register the class as a feature
# preprocessor so that later code can refer to it (the classifier below
# selects it through its ``include`` argument).
autosklearn.pipeline.components.feature_preprocessing.add_preprocessor(LDA)

Create dataset¶

# Load the breast-cancer data as plain (features, target) arrays.
X, y = load_breast_cancer(return_X_y=True)
# Fix the seed of the train/test split so that repeated runs of the example
# evaluate on the same held-out samples and report comparable scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

Configuration space¶

# Build the LDA search space with its default arguments and print it, to
# inspect the hyperparameters and the condition attached to ``shrinkage``.
cs = LDA.get_hyperparameter_search_space()
print(cs)

Configuration space object:
  Hyperparameters:
    shrinkage, Type: UniformFloat, Range: [0.0, 1.0], Default: 0.5
    solver, Type: Categorical, Choices: {svd, lsqr, eigen}, Default: svd
    tol, Type: UniformFloat, Range: [0.0001, 1.0], Default: 0.0001
  Conditions:
    shrinkage | solver in {'lsqr', 'eigen'}

Fit the model using LDA as preprocessor¶

# Restrict the feature-preprocessing step to the custom "LDA" component,
# then fit on the training split.
clf = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=30,
    include={"feature_preprocessor": ["LDA"]},
    # The two flags below are provided to speed up calculations.
    # They are not recommended for a real implementation.
    initial_configurations_via_metalearning=0,
    smac_scenario_args={"runcount_limit": 5},
)
clf.fit(X_train, y_train)

AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                      include={'feature_preprocessor': ['LDA']},
                      initial_configurations_via_metalearning=0,
                      per_run_time_limit=3,
                      smac_scenario_args={'runcount_limit': 5},
                      time_left_for_this_task=30)

Print prediction score and statistics¶

# Predict on the held-out split.
y_pred = clf.predict(X_test)
# accuracy_score is documented as accuracy_score(y_true, y_pred). Accuracy
# itself is symmetric, so the printed value is unchanged, but passing the
# arguments in the documented order keeps the example correct if a reader
# swaps in an asymmetric metric.
print("accuracy: ", sklearn.metrics.accuracy_score(y_test, y_pred))
# Pretty-print the description returned by show_models().
pprint(clf.show_models(), indent=4)

accuracy:  0.9440559440559441
{   2: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f05ce64b280>,
           'cost': 0.049645390070921946,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f05d45bd250>,
           'ensemble_weight': 0.28,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f05d237ec40>,
           'model_id': 2,
           'rank': 1,
           'sklearn_classifier': RandomForestClassifier(max_features=1, n_estimators=512, n_jobs=1,
                       random_state=1, warm_start=True)},
    4: {   'balancing': Balancing(random_state=1, strategy='weighting'),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f05d3ff3fd0>,
           'cost': 0.049645390070921946,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f05ccb9f4c0>,
           'ensemble_weight': 0.32,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f05d3ff35b0>,
           'model_id': 4,
           'rank': 2,
           'sklearn_classifier': BernoulliNB(alpha=23.955206843654683, fit_prior=False)},
    5: {   'balancing': Balancing(random_state=1, strategy='weighting'),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f05d1ad46d0>,
           'cost': 0.12056737588652477,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f05cc27dc40>,
           'ensemble_weight': 0.4,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f05d0fc0d60>,
           'model_id': 5,
           'rank': 3,
           'sklearn_classifier': AdaBoostClassifier(algorithm='SAMME',
                   base_estimator=DecisionTreeClassifier(max_depth=7),
                   learning_rate=1.371412136822841, n_estimators=135,
                   random_state=1)}}

Total running time of the script: ( 0 minutes 20.324 seconds)

Gallery generated by Sphinx-Gallery