Note
Click here to download the full example code or to run this example in your browser via Binder
Extending Auto-Sklearn with a Preprocessor Component
The following example demonstrates how to create a wrapper around the linear discriminant analysis (LDA) algorithm from sklearn and use it as a preprocessor in auto-sklearn.
from typing import Optional
from pprint import pprint
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
UniformFloatHyperparameter,
CategoricalHyperparameter,
)
from ConfigSpace.conditions import InCondition
import sklearn.metrics
from autosklearn.askl_typing import FEAT_TYPE_TYPE
import autosklearn.classification
import autosklearn.pipeline.components.feature_preprocessing
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import DENSE, SIGNED_DATA, UNSIGNED_DATA
from autosklearn.util.common import check_none
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
Create LDA component for auto-sklearn
class LDA(AutoSklearnPreprocessingAlgorithm):
    """Feature preprocessor wrapping sklearn's ``LinearDiscriminantAnalysis``.

    The constructor only records the hyperparameters; the underlying sklearn
    estimator is created and fitted in :meth:`fit`.
    """

    def __init__(self, solver, tol, shrinkage=None, random_state=None):
        """Record the hyperparameters without building an estimator.

        Parameters
        ----------
        solver
            Forwarded as ``solver`` to ``LinearDiscriminantAnalysis``.
        tol
            Converted with ``float`` in :meth:`fit` and forwarded as ``tol``.
        shrinkage
            Forwarded as ``shrinkage`` after normalisation in :meth:`fit`.
        random_state
            Stored on the instance; this class does not forward it to the
            sklearn estimator.
        """
        self.preprocessor = None
        self.solver = solver
        self.tol = tol
        self.shrinkage = shrinkage
        self.random_state = random_state

    def fit(self, X, y=None):
        """Build the sklearn estimator from the stored settings and fit it.

        ``shrinkage`` and ``tol`` are normalised in place on the instance
        before the estimator is constructed. Returns ``self``.
        """
        # ``check_none`` decides whether the stored value means "no shrinkage";
        # any other value is coerced to a float.
        self.shrinkage = (
            None if check_none(self.shrinkage) else float(self.shrinkage)
        )
        self.tol = float(self.tol)

        # Imported here, at fit time, rather than at module import time.
        import sklearn.discriminant_analysis

        lda_cls = sklearn.discriminant_analysis.LinearDiscriminantAnalysis
        # Assign before fitting so the attribute is set even if fitting fails.
        self.preprocessor = lda_cls(
            shrinkage=self.shrinkage,
            solver=self.solver,
            tol=self.tol,
        )
        self.preprocessor.fit(X, y)
        return self

    def transform(self, X):
        """Transform ``X`` with the fitted estimator.

        Raises
        ------
        NotImplementedError
            If :meth:`fit` has not created the estimator yet.
        """
        fitted = self.preprocessor
        if fitted is None:
            raise NotImplementedError()
        return fitted.transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static property dictionary describing this component."""
        # The same data kinds are declared for both input and output.
        data_kinds = (DENSE, UNSIGNED_DATA, SIGNED_DATA)
        return {
            "shortname": "LDA",
            "name": "Linear Discriminant Analysis",
            "handles_regression": False,
            "handles_classification": True,
            "handles_multiclass": False,
            "handles_multilabel": False,
            "handles_multioutput": False,
            "is_deterministic": True,
            "input": data_kinds,
            "output": data_kinds,
        }

    @staticmethod
    def get_hyperparameter_search_space(
        feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None
    ):
        """Return the ``ConfigurationSpace`` for ``solver``, ``shrinkage``, ``tol``.

        Neither ``feat_type`` nor ``dataset_properties`` is consulted; the
        same space is returned for every call.
        """
        solver = CategoricalHyperparameter(
            name="solver", choices=["svd", "lsqr", "eigen"], default_value="svd"
        )
        shrinkage = UniformFloatHyperparameter(
            name="shrinkage", lower=0.0, upper=1.0, default_value=0.5
        )
        tol = UniformFloatHyperparameter(
            name="tol", lower=0.0001, upper=1, default_value=0.0001
        )

        cs = ConfigurationSpace()
        cs.add_hyperparameters([solver, shrinkage, tol])
        # ``shrinkage`` is only active when the solver is "lsqr" or "eigen".
        cs.add_condition(InCondition(shrinkage, solver, ["lsqr", "eigen"]))
        return cs
# Add LDA component to auto-sklearn.
# After registration the component can be selected by its class name, as the
# ``include={"feature_preprocessor": ["LDA"]}`` filter further down does.
autosklearn.pipeline.components.feature_preprocessing.add_preprocessor(LDA)
Create dataset
# Load the breast cancer dataset directly as an (X, y) pair.
X, y = load_breast_cancer(return_X_y=True)
# Split into train and test parts. No ``random_state`` is passed, so the
# split -- and the scores reported below -- may differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)
Configuration space
# Build the search space declared by the LDA component and print it so the
# hyperparameters and the shrinkage/solver condition can be inspected.
cs = LDA.get_hyperparameter_search_space()
print(cs)
Configuration space object:
Hyperparameters:
shrinkage, Type: UniformFloat, Range: [0.0, 1.0], Default: 0.5
solver, Type: Categorical, Choices: {svd, lsqr, eigen}, Default: svd
tol, Type: UniformFloat, Range: [0.0001, 1.0], Default: 0.0001
Conditions:
shrinkage | solver in {'lsqr', 'eigen'}
Fit the model using LDA as preprocessor
# Configure a classifier whose feature preprocessing step is restricted to
# the LDA component, then fit it on the training split.
clf = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=30,
    include={"feature_preprocessor": ["LDA"]},
    # The two flags below are provided to speed up calculations.
    # Not recommended for a real implementation.
    initial_configurations_via_metalearning=0,
    smac_scenario_args={"runcount_limit": 5},
)
clf.fit(X_train, y_train)
Fitting to the training data: 0%| | 0/30 [00:00<?, ?it/s, The total time budget for this task is 0:00:30]
Fitting to the training data: 3%|3 | 1/30 [00:01<00:29, 1.00s/it, The total time budget for this task is 0:00:30]
Fitting to the training data: 7%|6 | 2/30 [00:02<00:28, 1.00s/it, The total time budget for this task is 0:00:30]
Fitting to the training data: 10%|# | 3/30 [00:03<00:27, 1.02s/it, The total time budget for this task is 0:00:30]
Fitting to the training data: 13%|#3 | 4/30 [00:04<00:26, 1.02s/it, The total time budget for this task is 0:00:30]
Fitting to the training data: 17%|#6 | 5/30 [00:05<00:25, 1.01s/it, The total time budget for this task is 0:00:30]
Fitting to the training data: 20%|## | 6/30 [00:06<00:24, 1.01s/it, The total time budget for this task is 0:00:30]
Fitting to the training data: 23%|##3 | 7/30 [00:07<00:23, 1.01s/it, The total time budget for this task is 0:00:30]
Fitting to the training data: 27%|##6 | 8/30 [00:08<00:22, 1.01s/it, The total time budget for this task is 0:00:30]
Fitting to the training data: 30%|### | 9/30 [00:09<00:21, 1.01s/it, The total time budget for this task is 0:00:30]
Fitting to the training data: 33%|###3 | 10/30 [00:10<00:20, 1.01s/it, The total time budget for this task is 0:00:30]
Fitting to the training data: 37%|###6 | 11/30 [00:11<00:19, 1.01s/it, The total time budget for this task is 0:00:30]
Fitting to the training data: 40%|#### | 12/30 [00:12<00:18, 1.01s/it, The total time budget for this task is 0:00:30]
Fitting to the training data: 43%|####3 | 13/30 [00:13<00:17, 1.05s/it, The total time budget for this task is 0:00:30]
Fitting to the training data: 47%|####6 | 14/30 [00:14<00:16, 1.03s/it, The total time budget for this task is 0:00:30]
Fitting to the training data: 100%|##########| 30/30 [00:14<00:00, 2.11it/s, The total time budget for this task is 0:00:30]
AutoSklearnClassifier(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
include={'feature_preprocessor': ['LDA']},
initial_configurations_via_metalearning=0,
per_run_time_limit=3,
smac_scenario_args={'runcount_limit': 5},
time_left_for_this_task=30)
Print prediction score and statistics
# Predict on the test split and report the accuracy.
y_pred = clf.predict(X_test)
# accuracy_score takes (y_true, y_pred) in that order. Accuracy is symmetric,
# so the printed value is unchanged, but the call now matches the documented
# signature and stays correct if swapped for an asymmetric metric.
print("accuracy: ", sklearn.metrics.accuracy_score(y_test, y_pred))
# Show the models that make up the final ensemble.
pprint(clf.show_models(), indent=4)
accuracy: 0.9790209790209791
{ 2: { 'balancing': Balancing(random_state=1),
'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f2afd3e1400>,
'cost': 0.04255319148936165,
'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f2afd3e12b0>,
'ensemble_weight': 0.38,
'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f2afd3e1490>,
'model_id': 2,
'rank': 1,
'sklearn_classifier': RandomForestClassifier(max_features=1, n_estimators=512, n_jobs=1,
random_state=1, warm_start=True)},
4: { 'balancing': Balancing(random_state=1, strategy='weighting'),
'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f2af0710d30>,
'cost': 0.08510638297872342,
'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f2b10e20220>,
'ensemble_weight': 0.3,
'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f2af0710400>,
'model_id': 4,
'rank': 3,
'sklearn_classifier': BernoulliNB(alpha=23.955206843654683, fit_prior=False)},
5: { 'balancing': Balancing(random_state=1, strategy='weighting'),
'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f2af752cdc0>,
'cost': 0.07092198581560283,
'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f2aee5d9f40>,
'ensemble_weight': 0.32,
'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f2af752c850>,
'model_id': 5,
'rank': 2,
'sklearn_classifier': AdaBoostClassifier(algorithm='SAMME',
base_estimator=DecisionTreeClassifier(max_depth=7),
learning_rate=1.371412136822841, n_estimators=135,
random_state=1)}}
Total running time of the script: ( 0 minutes 23.434 seconds)