Note

Click here to download the full example code or to run this example in your browser via Binder

Extending Auto-Sklearn with Data Preprocessor Component¶

The following example demonstrates how to turn off data preprocessing step in auto-skearn.

from typing import Optional
from pprint import pprint

import autosklearn.classification
import autosklearn.pipeline.components.data_preprocessing
import sklearn.metrics
from ConfigSpace.configuration_space import ConfigurationSpace

from autosklearn.askl_typing import FEAT_TYPE_TYPE
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, INPUT
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

Create NoPreprocessing component for auto-sklearn¶

class NoPreprocessing(AutoSklearnPreprocessingAlgorithm):
    def __init__(self, **kwargs):
        """This preprocessors does not change the data"""
        # Some internal checks makes sure parameters are set
        for key, val in kwargs.items():
            setattr(self, key, val)

    def fit(self, X, Y=None):
        return self

    def transform(self, X):
        return X

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            "shortname": "NoPreprocessing",
            "name": "NoPreprocessing",
            "handles_regression": True,
            "handles_classification": True,
            "handles_multiclass": True,
            "handles_multilabel": True,
            "handles_multioutput": True,
            "is_deterministic": True,
            "input": (SPARSE, DENSE, UNSIGNED_DATA),
            "output": (INPUT,),
        }

    @staticmethod
    def get_hyperparameter_search_space(
        feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None
    ):
        return ConfigurationSpace()  # Return an empty configuration as there is None


# Add NoPreprocessing component to auto-sklearn.
autosklearn.pipeline.components.data_preprocessing.add_preprocessor(NoPreprocessing)

Create dataset¶

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

Fit the model without performing data preprocessing¶

clf = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    include={"data_preprocessor": ["NoPreprocessing"]},
    # Bellow two flags are provided to speed up calculations
    # Not recommended for a real implementation
    initial_configurations_via_metalearning=0,
    smac_scenario_args={"runcount_limit": 5},
)
clf.fit(X_train, y_train)

# To check that models were found without issue when running examples
assert len(clf.get_models_with_weights()) > 0
print(clf.sprint_statistics())

auto-sklearn results:
  Dataset name: 0f572a45-38c5-11ed-8830-892d16569fbe
  Metric: accuracy
  Best validation score: 0.950355
  Number of target algorithm runs: 5
  Number of successful target algorithm runs: 5
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0

Print prediction score and statistics¶

y_pred = clf.predict(X_test)
print("accuracy: ", sklearn.metrics.accuracy_score(y_pred, y_test))
pprint(clf.show_models(), indent=4)

accuracy:  0.972027972027972
{   2: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f05cb269400>,
           'cost': 0.049645390070921946,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f05cb269220>,
           'ensemble_weight': 0.38,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f05cb269130>,
           'model_id': 2,
           'rank': 1,
           'sklearn_classifier': RandomForestClassifier(max_features=5, n_estimators=512, n_jobs=1,
                       random_state=1, warm_start=True)},
    3: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f05ce642040>,
           'cost': 0.11347517730496459,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f05ce642400>,
           'ensemble_weight': 0.02,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f05ce642370>,
           'model_id': 3,
           'rank': 2,
           'sklearn_classifier': AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=9),
                   learning_rate=1.077525751047041, n_estimators=208,
                   random_state=1)},
    4: {   'balancing': Balancing(random_state=1, strategy='weighting'),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f05d462ee20>,
           'cost': 0.375886524822695,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f05d45afa30>,
           'ensemble_weight': 0.46,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f05d462e9d0>,
           'model_id': 4,
           'rank': 3,
           'sklearn_classifier': PassiveAggressiveClassifier(C=7.328568999403367e-05, max_iter=16,
                            random_state=1, tol=0.02258105767523384,
                            warm_start=True)},
    5: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f05d6854100>,
           'cost': 0.375886524822695,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f05d462e670>,
           'ensemble_weight': 0.02,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f05d462edc0>,
           'model_id': 5,
           'rank': 4,
           'sklearn_classifier': SVC(C=5.6217653025966605, cache_size=1806.2005208333333,
    coef0=-0.011192631375838147, degree=5, gamma=0.00016239358614064282,
    kernel='poly', max_iter=-1.0, random_state=1, tol=0.0001346237047084913)},
    6: {   'balancing': Balancing(random_state=1),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f05ca916610>,
           'cost': 0.12765957446808507,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f05d6854040>,
           'ensemble_weight': 0.12,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f05d6854910>,
           'model_id': 6,
           'rank': 5,
           'sklearn_classifier': LinearDiscriminantAnalysis(shrinkage='auto', solver='lsqr',
                           tol=0.00010495766342828794)}}

Total running time of the script: ( 0 minutes 19.979 seconds)

Gallery generated by Sphinx-Gallery