Source code for autosklearn.pipeline.components.feature_preprocessing

from typing import Optional, Type

import os
from collections import OrderedDict

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter

from autosklearn.askl_typing import FEAT_TYPE_TYPE

from ..base import (
    AutoSklearnChoice,
    AutoSklearnPreprocessingAlgorithm,
    ThirdPartyComponents,
    _addons,
    find_components,
)

classifier_directory = os.path.split(__file__)[0]
_preprocessors = find_components(
    __package__, classifier_directory, AutoSklearnPreprocessingAlgorithm
)
additional_components = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm)
_addons["feature_preprocessing"] = additional_components


[docs]def add_preprocessor(preprocessor: Type[AutoSklearnPreprocessingAlgorithm]) -> None: additional_components.add_component(preprocessor)
class FeaturePreprocessorChoice(AutoSklearnChoice): @classmethod def get_components(cls): components = OrderedDict() components.update(_preprocessors) components.update(additional_components.components) return components def get_available_components( self, dataset_properties=None, include=None, exclude=None ): if dataset_properties is None: dataset_properties = {} if include is not None and exclude is not None: raise ValueError( "The argument include and exclude cannot be used together." ) available_comp = self.get_components() if include is not None: for incl in include: if incl not in available_comp: raise ValueError( "Trying to include unknown component: " "%s" % incl ) # TODO check for task type classification and/or regression! components_dict = OrderedDict() for name in available_comp: if include is not None and name not in include: continue elif exclude is not None and name in exclude: continue entry = available_comp[name] # Exclude itself to avoid infinite loop if entry == FeaturePreprocessorChoice or hasattr(entry, "get_components"): continue target_type = dataset_properties["target_type"] if target_type == "classification": if entry.get_properties()["handles_classification"] is False: continue if ( dataset_properties.get("multiclass") is True and entry.get_properties()["handles_multiclass"] is False ): continue if ( dataset_properties.get("multilabel") is True and entry.get_properties()["handles_multilabel"] is False ): continue elif target_type == "regression": if entry.get_properties()["handles_regression"] is False: continue if ( dataset_properties.get("multioutput") is True and entry.get_properties()["handles_multioutput"] is False ): continue else: raise ValueError("Unknown target type %s" % target_type) components_dict[name] = entry return components_dict def get_hyperparameter_search_space( self, feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None, default=None, include=None, exclude=None, ): cs = ConfigurationSpace() if dataset_properties is None: dataset_properties = {} # Compile a list of legal preprocessors for this problem available_preprocessors = self.get_available_components( dataset_properties=dataset_properties, include=include, exclude=exclude ) if len(available_preprocessors) == 0: raise ValueError("No preprocessors found, please add NoPreprocessing") if default is None: defaults = ["no_preprocessing", "select_percentile", "pca", "truncatedSVD"] for default_ in defaults: if default_ in available_preprocessors: default = default_ break preprocessor = CategoricalHyperparameter( "__choice__", list(available_preprocessors.keys()), default_value=default ) cs.add_hyperparameter(preprocessor) for name in available_preprocessors: preprocessor_configuration_space = available_preprocessors[ name ].get_hyperparameter_search_space(dataset_properties=dataset_properties) parent_hyperparameter = {"parent": preprocessor, "value": name} cs.add_configuration_space( name, preprocessor_configuration_space, parent_hyperparameter=parent_hyperparameter, ) return cs def transform(self, X): return self.choice.transform(X)