Note
Click here to download the full example code or to run this example in your browser via Binder
Extending Auto-Sklearn with Data Preprocessor Component¶
The following example demonstrates how to turn off the data preprocessing step in auto-sklearn.
from typing import Optional
from pprint import pprint
import autosklearn.classification
import autosklearn.pipeline.components.data_preprocessing
import sklearn.metrics
from ConfigSpace.configuration_space import ConfigurationSpace
from autosklearn.askl_typing import FEAT_TYPE_TYPE
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import SPARSE, DENSE, UNSIGNED_DATA, INPUT
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
Create NoPreprocessing component for auto-sklearn¶
class NoPreprocessing(AutoSklearnPreprocessingAlgorithm):
    """Data preprocessor component that hands the data through untouched.

    ``fit`` does nothing and ``transform`` returns its input as-is, so
    selecting this component effectively disables data preprocessing.
    """

    def __init__(self, **kwargs):
        """Store every keyword argument as an attribute of the instance."""
        # Some internal checks make sure parameters are set, so each
        # received keyword argument is kept as an attribute.
        for attr_name, attr_value in kwargs.items():
            setattr(self, attr_name, attr_value)

    def fit(self, X, Y=None):
        """Nothing to learn; return the component itself."""
        return self

    def transform(self, X):
        """Return ``X`` unchanged."""
        return X

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static properties dictionary describing this component."""
        component_name = "NoPreprocessing"
        properties = {
            "shortname": component_name,
            "name": component_name,
            "handles_regression": True,
            "handles_classification": True,
            "handles_multiclass": True,
            "handles_multilabel": True,
            "handles_multioutput": True,
            "is_deterministic": True,
            "input": (SPARSE, DENSE, UNSIGNED_DATA),
            "output": (INPUT,),
        }
        return properties

    @staticmethod
    def get_hyperparameter_search_space(
        feat_type: Optional[FEAT_TYPE_TYPE] = None, dataset_properties=None
    ):
        """Return an empty configuration space: there is nothing to tune."""
        empty_space = ConfigurationSpace()
        return empty_space
# Add the NoPreprocessing component to auto-sklearn so that it can be
# selected by name through the classifier's `include` argument.
autosklearn.pipeline.components.data_preprocessing.add_preprocessor(NoPreprocessing)
Create dataset¶
# Load the breast cancer dataset directly as a (features, target) pair.
X, y = load_breast_cancer(return_X_y=True)
# Hold out part of the data for testing. No random_state is passed, so the
# split (and therefore the scores printed later) can differ between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y)
Fit the model without performing data preprocessing¶
# Build a classifier whose data preprocessing step is restricted to the
# NoPreprocessing component defined in this example.
clf = autosklearn.classification.AutoSklearnClassifier(
# Total time budget for the whole search, in seconds.
time_left_for_this_task=120,
# Only allow the custom "NoPreprocessing" data preprocessor.
include={"data_preprocessor": ["NoPreprocessing"]},
# The two flags below are provided to speed up calculations.
# Not recommended for a real implementation.
initial_configurations_via_metalearning=0,
smac_scenario_args={"runcount_limit": 5},
)
clf.fit(X_train, y_train)
# To check that models were found without issue when running examples
assert len(clf.get_models_with_weights()) > 0
# Print a summary of the search.
print(clf.sprint_statistics())
Fitting to the training data: 0%| | 0/120 [00:00<?, ?it/s, The total time budget for this task is 0:02:00]
Fitting to the training data: 1%| | 1/120 [00:01<01:59, 1.00s/it, The total time budget for this task is 0:02:00]
Fitting to the training data: 2%|1 | 2/120 [00:02<01:58, 1.00s/it, The total time budget for this task is 0:02:00]
Fitting to the training data: 2%|2 | 3/120 [00:03<01:58, 1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data: 3%|3 | 4/120 [00:04<01:57, 1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data: 4%|4 | 5/120 [00:05<01:56, 1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data: 5%|5 | 6/120 [00:06<01:55, 1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data: 6%|5 | 7/120 [00:07<01:54, 1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data: 7%|6 | 8/120 [00:08<01:53, 1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data: 8%|7 | 9/120 [00:09<01:51, 1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data: 8%|8 | 10/120 [00:10<01:50, 1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data: 9%|9 | 11/120 [00:11<01:50, 1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data: 10%|# | 12/120 [00:12<01:48, 1.01s/it, The total time budget for this task is 0:02:00]
Fitting to the training data: 11%|# | 13/120 [00:13<01:49, 1.03s/it, The total time budget for this task is 0:02:00]
Fitting to the training data: 12%|#1 | 14/120 [00:14<01:47, 1.02s/it, The total time budget for this task is 0:02:00]
Fitting to the training data: 100%|##########| 120/120 [00:14<00:00, 8.47it/s, The total time budget for this task is 0:02:00]
auto-sklearn results:
Dataset name: 285e74f6-6bf8-11ed-87b6-77edb579fc6c
Metric: accuracy
Best validation score: 0.929078
Number of target algorithm runs: 5
Number of successful target algorithm runs: 4
Number of crashed target algorithm runs: 1
Number of target algorithms that exceeded the time limit: 0
Number of target algorithms that exceeded the memory limit: 0
Print prediction score and statistics¶
# Predict labels for the held-out test set.
y_pred = clf.predict(X_test)
# accuracy_score takes (y_true, y_pred) in that order. Accuracy itself is
# symmetric, so the printed value is the same either way, but the documented
# order is used so the call stays correct if the metric is swapped out.
print("accuracy: ", sklearn.metrics.accuracy_score(y_test, y_pred))
# Pretty-print the models reported by the fitted classifier.
pprint(clf.show_models(), indent=4)
accuracy: 0.9370629370629371
{ 2: { 'balancing': Balancing(random_state=1),
'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f2af46fac70>,
'cost': 0.07092198581560283,
'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f2af46faf70>,
'ensemble_weight': 0.2,
'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f2af46fa700>,
'model_id': 2,
'rank': 1,
'sklearn_classifier': RandomForestClassifier(max_features=5, n_estimators=512, n_jobs=1,
random_state=1, warm_start=True)},
4: { 'balancing': Balancing(random_state=1, strategy='weighting'),
'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f2afd64a160>,
'cost': 0.375886524822695,
'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f2afd544f10>,
'ensemble_weight': 0.6,
'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f2afd45fc40>,
'model_id': 4,
'rank': 4,
'sklearn_classifier': PassiveAggressiveClassifier(C=7.328568999403367e-05, max_iter=16,
random_state=1, tol=0.02258105767523384,
warm_start=True)},
6: { 'balancing': Balancing(random_state=1),
'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f2af6f3f520>,
'cost': 0.099290780141844,
'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f2af76b4970>,
'ensemble_weight': 0.2,
'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f2af6f3f400>,
'model_id': 6,
'rank': 3,
'sklearn_classifier': LinearDiscriminantAnalysis(shrinkage='auto', solver='lsqr',
tol=0.00010495766342828794)}}
Total running time of the script: ( 0 minutes 19.473 seconds)