Extending Auto-Sklearn with Data Preprocessor Component

The following example demonstrates how to turn off data preprocessing step in auto-skearn.

from typing import Optional
from pprint import pprint

import autosklearn.classification
import autosklearn.pipeline.components.data_preprocessing
import sklearn.metrics
from ConfigSpace.configuration_space import ConfigurationSpace

from autosklearn.askl_typing import FEAT_TYPE_TYPE
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, INPUT
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

Create NoPreprocessing component for auto-sklearn

class NoPreprocessing(AutoSklearnPreprocessingAlgorithm):
    def __init__(self, **kwargs):
        """This preprocessors does not change the data"""
        # Some internal checks makes sure parameters are set
        for key, val in kwargs.items():
            setattr(self, key, val)

    def fit(self, X, Y=None):
        return self

    def transform(self, X):
        return X

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            "shortname": "NoPreprocessing",
            "name": "NoPreprocessing",
            "handles_regression": True,
            "handles_classification": True,
            "handles_multiclass": True,
            "handles_multilabel": True,
            "handles_multioutput": True,
            "is_deterministic": True,
            "input": (SPARSE, DENSE, UNSIGNED_DATA),
            "output": (INPUT,),
        }

    @staticmethod
    def get_hyperparameter_search_space(
        feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None
    ):
        return ConfigurationSpace()  # Return an empty configuration as there is None


# Add NoPreprocessing component to auto-sklearn.
autosklearn.pipeline.components.data_preprocessing.add_preprocessor(NoPreprocessing)

Create dataset

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

Fit the model without performing data preprocessing

clf = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    include={"data_preprocessor": ["NoPreprocessing"]},
    # Bellow two flags are provided to speed up calculations
    # Not recommended for a real implementation
    initial_configurations_via_metalearning=0,
    smac_scenario_args={"runcount_limit": 5},
)
clf.fit(X_train, y_train)

# To check that models were found without issue when running examples
assert len(clf.get_models_with_weights()) > 0
print(clf.sprint_statistics())
Fitting to the training data:   0%|          | 0/120 [00:00<?, ?it/s, The total time budget for this task is 0:02:00]
Fitting to the training data:   1%|          | 1/120 [00:01<01:59,  1.00s/it, The total time budget for this task is 0:02:00]
Fitting to the training data:   2%|1         | 2/120 [00:02<01:58,  1.00s/it, The total time budget for this task is 0:02:00]
Fitting to the training data:   2%|2         | 3/120 [00:03<01:58,  1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data:   3%|3         | 4/120 [00:04<01:57,  1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data:   4%|4         | 5/120 [00:05<01:56,  1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data:   5%|5         | 6/120 [00:06<01:55,  1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data:   6%|5         | 7/120 [00:07<01:54,  1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data:   7%|6         | 8/120 [00:08<01:53,  1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data:   8%|7         | 9/120 [00:09<01:51,  1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data:   8%|8         | 10/120 [00:10<01:50,  1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data:   9%|9         | 11/120 [00:11<01:50,  1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data:  10%|#         | 12/120 [00:12<01:48,  1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data:  11%|#         | 13/120 [00:13<01:49,  1.03s/it, The total time budget for this task is 0:02:00]
Fitting to the training data:  12%|#1        | 14/120 [00:14<01:47,  1.02s/it, The total time budget for this task is 0:02:00]
Fitting to the training data: 100%|##########| 120/120 [00:14<00:00,  8.47it/s, The total time budget for this task is 0:02:00]
auto-sklearn results:
  Dataset name: 285e74f6-6bf8-11ed-87b6-77edb579fc6c
  Metric: accuracy
  Best validation score: 0.929078
  Number of target algorithm runs: 5
  Number of successful target algorithm runs: 4
  Number of crashed target algorithm runs: 1
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0