from __future__ import annotations
from typing import Any, Dict, List, Mapping, Optional, Union
import hashlib
import json
import os
import pathlib
import pickle
import dask.distributed
import numpy as np
import pandas as pd
import scipy.sparse
import sklearn
from ConfigSpace import Configuration
import autosklearn
import autosklearn.experimental.selector
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble
from autosklearn.ensembles.ensemble_selection import EnsembleSelection
from autosklearn.metrics import Scorer, accuracy, balanced_accuracy, log_loss, roc_auc
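# Metrics for which pre-trained selector data ships alongside this module;
# in ``fit``, any other metric falls back to the balanced_accuracy selector.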
selector_metrics = (balanced_accuracy, roc_auc, log_loss)
class SmacObjectCallback:
def __init__(self, portfolio):
self.portfolio = portfolio
def __call__(
self,
scenario_dict,
seed,
ta,
ta_kwargs,
metalearning_configurations,
n_jobs,
dask_client,
multi_objective_algorithm,
multi_objective_kwargs,
):
from smac.facade.smac_ac_facade import SMAC4AC
from smac.intensification.simple_intensifier import SimpleIntensifier
from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost
from smac.scenario.scenario import Scenario
scenario = Scenario(scenario_dict)
initial_configurations = []
for member in self.portfolio.values():
try:
hp_names = scenario.cs.get_hyperparameter_names()
_member = {key: member[key] for key in member if key in hp_names}
initial_configurations.append(
Configuration(configuration_space=scenario.cs, values=_member)
)
except ValueError:
pass
rh2EPM = RunHistory2EPM4LogCost
return SMAC4AC(
scenario=scenario,
rng=seed,
runhistory2epm=rh2EPM,
tae_runner=ta,
tae_runner_kwargs=ta_kwargs,
initial_configurations=initial_configurations,
intensifier=SimpleIntensifier,
run_id=seed,
n_jobs=n_jobs,
dask_client=dask_client,
multi_objective_algorithm=multi_objective_algorithm,
multi_objective_kwargs=multi_objective_kwargs,
)
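# A minimal sketch of how this callback is wired up (the portfolio dict and
# its member name below are hypothetical; the real portfolios are the JSON
# files shipped next to this module):
#
#     portfolio = {"member_1": {"classifier:__choice__": "random_forest"}}
#     callback = SmacObjectCallback(portfolio)
#
# ``AutoSklearn2Classifier.fit`` assigns such a callback to
# ``self.get_smac_object_callback``; the AutoML backend then calls it with
# the SMAC scenario, seed, target-algorithm runner, etc. and receives a
# configured ``SMAC4AC`` instance back.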
class SHObjectCallback:
def __init__(self, budget_type, eta, initial_budget, portfolio):
self.budget_type = budget_type
self.eta = eta
self.initial_budget = initial_budget
self.portfolio = portfolio
def __call__(
self,
scenario_dict,
seed,
ta,
ta_kwargs,
metalearning_configurations,
n_jobs,
dask_client,
multi_objective_algorithm,
multi_objective_kwargs,
):
from smac.facade.smac_ac_facade import SMAC4AC
from smac.intensification.successive_halving import SuccessiveHalving
from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost
from smac.scenario.scenario import Scenario
scenario = Scenario(scenario_dict)
initial_configurations = []
for member in self.portfolio.values():
try:
hp_names = scenario.cs.get_hyperparameter_names()
_member = {key: member[key] for key in member if key in hp_names}
initial_configurations.append(
Configuration(configuration_space=scenario.cs, values=_member)
)
except ValueError:
pass
rh2EPM = RunHistory2EPM4LogCost
ta_kwargs["budget_type"] = self.budget_type
smac4ac = SMAC4AC(
scenario=scenario,
rng=seed,
runhistory2epm=rh2EPM,
tae_runner=ta,
tae_runner_kwargs=ta_kwargs,
initial_configurations=initial_configurations,
run_id=seed,
intensifier=SuccessiveHalving,
intensifier_kwargs={
"initial_budget": self.initial_budget,
"max_budget": 100,
"eta": self.eta,
"min_chall": 1,
},
dask_client=dask_client,
n_jobs=n_jobs,
multi_objective_algorithm=multi_objective_algorithm,
multi_objective_kwargs=multi_objective_kwargs,
)
smac4ac.solver.epm_chooser.min_samples_model = int(
len(scenario.cs.get_hyperparameters()) / 2
)
return smac4ac
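# The successive-halving variant is constructed the same way; the arguments
# mirror the values ``fit`` passes below ("iterations" as budget type,
# eta=4, initial budget 5.0):
#
#     callback = SHObjectCallback("iterations", 4, 5.0, portfolio)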
class AutoSklearn2Classifier(AutoSklearnClassifier):
def __init__(
self,
time_left_for_this_task: int = 3600,
per_run_time_limit=None,
ensemble_size: int | None = None,
ensemble_class: AbstractEnsemble | None = EnsembleSelection,
ensemble_kwargs: Dict[str, Any] | None = None,
ensemble_nbest: Union[float, int] = 50,
max_models_on_disc: int = 50,
seed: int = 1,
memory_limit: int = 3072,
tmp_folder: Optional[str] = None,
delete_tmp_folder_after_terminate: bool = True,
n_jobs: Optional[int] = None,
dask_client: Optional[dask.distributed.Client] = None,
disable_evaluator_output: bool = False,
smac_scenario_args: Optional[Dict[str, Any]] = None,
logging_config: Optional[Dict[str, Any]] = None,
metric: Optional[Scorer] = None,
scoring_functions: Optional[List[Scorer]] = None,
load_models: bool = True,
dataset_compression: Union[bool, Mapping[str, Any]] = True,
allow_string_features: bool = True,
disable_progress_bar: bool = False,
):
"""
Parameters
----------
time_left_for_this_task : int, optional (default=3600)
Time limit in seconds for the search of appropriate
models. By increasing this value, *auto-sklearn* has a higher
chance of finding better models.
per_run_time_limit : int, optional (default=1/10 of time_left_for_this_task)
Time limit for a single call to the machine learning model.
Model fitting will be terminated if the machine learning
algorithm runs over the time limit. Set this value high enough so
that typical machine learning algorithms can be fit on the
training data.
ensemble_size : int, optional
Number of models added to the ensemble built by *Ensemble
selection from libraries of models*. Models are drawn with
replacement. If set to ``0`` no ensemble is fit.
Deprecated - will be removed in Auto-sklearn 0.16. Please pass
this argument via ``ensemble_kwargs={"ensemble_size": int}``
if you want to change the ensemble size for ensemble selection.
ensemble_class : Type[AbstractEnsemble], optional (default=EnsembleSelection)
            Class implementing the post-hoc ensemble algorithm. Set to
            ``None`` to disable ensemble building, or use ``SingleBest``
            to obtain only the single best model instead of an
            ensemble.
        ensemble_kwargs : Dict, optional
            Keyword arguments that are passed to the ensemble class upon
            initialization.
        ensemble_nbest : int or float, optional (default=50)
            Only consider the ``ensemble_nbest`` best models when building
            an ensemble: an absolute number of models if an int, a fraction
            of the available models if a float.
        max_models_on_disc : int, optional (default=50)
            Defines the maximum number of models that are kept on disc.
            Additional models are permanently deleted. Because of this, it
            also sets the upper limit on how many models can be used for an
            ensemble. It must be an integer greater than or equal to 1. If
            set to ``None``, all models are kept on the disc.
seed : int, optional (default=1)
Used to seed SMAC. Will determine the output file names.
memory_limit : int, optional (3072)
Memory limit in MB for the machine learning algorithm.
`auto-sklearn` will stop fitting the machine learning algorithm if
it tries to allocate more than ``memory_limit`` MB.
**Important notes:**
* If ``None`` is provided, no memory limit is set.
* In case of multi-processing, ``memory_limit`` will be *per job*, so the total usage is
``n_jobs x memory_limit``.
* The memory limit also applies to the ensemble creation process.
        tmp_folder : string, optional (None)
            Folder to store configuration output and log files; if ``None``,
            ``/tmp/autosklearn_tmp_$pid_$random_number`` is used
            automatically.
        delete_tmp_folder_after_terminate : bool, optional (True)
            Remove ``tmp_folder`` when finished. If ``tmp_folder`` is
            ``None``, the temporary directory will always be deleted.
n_jobs : int, optional, experimental
The number of jobs to run in parallel for ``fit()``. ``-1`` means
using all processors.
**Important notes**:
* By default, Auto-sklearn uses one core.
* Ensemble building is not affected by ``n_jobs`` but can be controlled by the number
of models in the ensemble.
* ``predict()`` is not affected by ``n_jobs`` (in contrast to most scikit-learn models)
* If ``dask_client`` is ``None``, a new dask client is created.
dask_client : dask.distributed.Client, optional
User-created dask client, can be used to start a dask cluster and then
attach auto-sklearn to it.
        disable_evaluator_output : bool or list, optional (False)
If True, disable model and prediction output. Cannot be used
together with ensemble building. ``predict()`` cannot be used when
setting this True. Can also be used as a list to pass more
fine-grained information on what to save. Allowed elements in the
list are:
* ``'y_optimization'`` : do not save the predictions for the
optimization/validation set, which would later on be used to build
an ensemble.
            * ``'model'`` : do not save any model files.
smac_scenario_args : dict, optional (None)
Additional arguments inserted into the scenario of SMAC. See the
`SMAC documentation <https://automl.github.io/SMAC3/main/api/smac.scenario.html#smac.scenario.Scenario>`_
for a list of available arguments.
logging_config : dict, optional (None)
dictionary object specifying the logger configuration. If None,
the default logging.yaml file is used, which can be found in
the directory ``util/logging.yaml`` relative to the installation.
metric : Scorer, optional (None)
An instance of :class:`autosklearn.metrics.Scorer` as created by
:meth:`autosklearn.metrics.make_scorer`. These are the `Built-in
Metrics`_.
If None is provided, a default metric is selected depending on the task.
        scoring_functions : List[Scorer], optional (None)
            List of scorers which will be calculated for each pipeline and
            whose results will be available via ``cv_results_``.
        load_models : bool, optional (True)
            Whether to load the models after fitting Auto-sklearn.
        dataset_compression : bool or Mapping, optional (True)
            Whether (and how) to compress the training dataset so that it
            fits into memory before fitting; see the Auto-sklearn
            documentation for the supported mapping keys.
        allow_string_features : bool, optional (True)
            Whether Auto-sklearn should process string features as text;
            if ``False``, string features are treated as categorical.
        disable_progress_bar : bool, optional (False)
            Whether to disable the progress bar that is displayed in the
            console while fitting to the training data.
Attributes
----------
cv_results_ : dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
imported into a pandas ``DataFrame``.
Not all keys returned by scikit-learn are supported yet.
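
        Examples
        --------
        A minimal usage sketch (``X_train``, ``y_train`` and ``X_test`` are
        assumed to be an ordinary classification dataset; this example is
        illustrative rather than part of the original documentation):

        >>> from autosklearn.experimental.askl2 import AutoSklearn2Classifier
        >>> clf = AutoSklearn2Classifier(time_left_for_this_task=120)  # doctest: +SKIP
        >>> clf.fit(X_train, y_train)  # doctest: +SKIP
        >>> predictions = clf.predict(X_test)  # doctest: +SKIP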
""" # noqa (links are too long)
self.required_training = False # Boolean to indicate if selectors were trained.
include_estimators = [
"extra_trees",
"passive_aggressive",
"random_forest",
"sgd",
"gradient_boosting",
"mlp",
]
include_preprocessors = ["no_preprocessing"]
include = {
"classifier": include_estimators,
"feature_preprocessor": include_preprocessors,
}
self.train_selectors(selected_metric=metric)
super().__init__(
time_left_for_this_task=time_left_for_this_task,
per_run_time_limit=per_run_time_limit,
initial_configurations_via_metalearning=0,
ensemble_size=ensemble_size,
ensemble_class=ensemble_class,
ensemble_kwargs=ensemble_kwargs,
ensemble_nbest=ensemble_nbest,
max_models_on_disc=max_models_on_disc,
seed=seed,
memory_limit=memory_limit,
include=include,
exclude=None,
resampling_strategy=None,
resampling_strategy_arguments=None,
tmp_folder=tmp_folder,
delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
n_jobs=n_jobs,
dask_client=dask_client,
disable_evaluator_output=disable_evaluator_output,
get_smac_object_callback=None,
smac_scenario_args=smac_scenario_args,
logging_config=logging_config,
metadata_directory=None,
metric=metric,
scoring_functions=scoring_functions,
load_models=load_models,
allow_string_features=allow_string_features,
disable_progress_bar=disable_progress_bar,
)
def train_selectors(self, selected_metric=None):
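        """Train the per-metric policy selectors if they are not yet cached.

        For every metric with training data shipped next to this module, an
        OVORF selector (wrapped in a FallbackWrapper) is fitted and pickled
        into ``$XDG_CACHE_HOME/auto-sklearn`` (or a directory under the home
        directory when ``XDG_CACHE_HOME`` is unset). Existing cache files are
        left untouched and later loaded by ``fit``.
        """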
self.selector_metrics = (balanced_accuracy, roc_auc, log_loss)
self.selector_files = {}
self.this_directory = pathlib.Path(__file__).resolve().parent
if selected_metric is not None:
metric_list = [selected_metric]
else:
metric_list = self.selector_metrics
for metric in metric_list:
training_data_file = (
self.this_directory / metric.name / "askl2_training_data.json"
)
with open(training_data_file) as fh:
training_data = json.load(fh)
fh.seek(0)
m = hashlib.md5()
m.update(fh.read().encode("utf8"))
training_data_hash = m.hexdigest()[:10]
selector_filename = "askl2_selector_%s_%s_%s_%s.pkl" % (
autosklearn.__version__,
sklearn.__version__,
metric.name,
training_data_hash,
)
selector_directory = os.environ.get("XDG_CACHE_HOME")
if selector_directory is None:
selector_directory = pathlib.Path.home()
selector_directory = (
pathlib.Path(selector_directory).joinpath("auto-sklearn").expanduser()
)
self.selector_files[metric.name] = selector_directory / selector_filename
metafeatures = pd.DataFrame(training_data["metafeatures"])
self.strategies = training_data["strategies"]
y_values = pd.DataFrame(
training_data["y_values"],
columns=self.strategies,
index=metafeatures.index,
)
minima_for_methods = training_data["minima_for_methods"]
maxima_for_methods = training_data["maxima_for_methods"]
default_strategies = training_data["tie_break_order"]
if not self.selector_files[metric.name].exists():
self.required_training = True
selector = autosklearn.experimental.selector.OVORF(
configuration=training_data["configuration"],
random_state=np.random.RandomState(1),
n_estimators=500,
tie_break_order=default_strategies,
)
selector = autosklearn.experimental.selector.FallbackWrapper(
selector, default_strategies
)
selector.fit(
X=metafeatures,
y=y_values,
minima=minima_for_methods,
maxima=maxima_for_methods,
)
self.selector_files[metric.name].parent.mkdir(
exist_ok=True, parents=True
)
try:
with open(self.selector_files[metric.name], "wb") as fh:
pickle.dump(selector, fh)
                except Exception as e:
                    print(
                        "AutoSklearn2Classifier needs to create a selector "
                        "file under the user's home directory or "
                        "XDG_CACHE_HOME, but the path {} is not "
                        "writable.".format(self.selector_files[metric.name])
                    )
                    raise e
    def fit(
self,
X,
y,
X_test=None,
y_test=None,
metric=None,
feat_type=None,
dataset_name=None,
):
        # TODO: regularly check
        # https://github.com/scikit-learn/scikit-learn/issues/15336 to see
        # whether histogram-based gradient boosting in scikit-learn finally
        # supports sparse data; until then, it is excluded from the search
        # space for sparse inputs below.
is_sparse = scipy.sparse.issparse(X)
if is_sparse:
include_estimators = [
"extra_trees",
"passive_aggressive",
"random_forest",
"sgd",
"mlp",
]
else:
include_estimators = [
"extra_trees",
"passive_aggressive",
"random_forest",
"sgd",
"gradient_boosting",
"mlp",
]
self.include["classifier"] = include_estimators
if self.metric is None:
if len(y.shape) == 1 or y.shape[1] == 1:
self.metric = accuracy
else:
self.metric = log_loss
if self.metric in self.selector_metrics:
metric_name = self.metric.name
selector_file = self.selector_files[metric_name]
else:
metric_name = "balanced_accuracy"
selector_file = self.selector_files[metric_name]
with open(selector_file, "rb") as fh:
selector = pickle.load(fh)
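        # The policy selector sees only two meta-features per dataset:
        # the number of features and the number of samples.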
metafeatures = pd.DataFrame(
{dataset_name: [X.shape[1], X.shape[0]]}
).transpose()
selection = np.argmax(selector.predict(metafeatures))
automl_policy = self.strategies[selection]
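        # Policy names encode the resampling strategy and fidelity schedule:
        # e.g. "RF_SH-eta4-i_3CV_iterative_es_if" means successive halving
        # (eta=4, budget measured in iterations) with 3-fold cross-validation,
        # while "RF_None_holdout_iterative_es_if" means plain holdout with no
        # fidelity schedule.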
setting = {
"RF_None_holdout_iterative_es_if": {
"resampling_strategy": "holdout-iterative-fit",
"fidelity": None,
},
"RF_None_3CV_iterative_es_if": {
"resampling_strategy": "cv-iterative-fit",
"folds": 3,
"fidelity": None,
},
"RF_None_5CV_iterative_es_if": {
"resampling_strategy": "cv-iterative-fit",
"folds": 5,
"fidelity": None,
},
"RF_None_10CV_iterative_es_if": {
"resampling_strategy": "cv-iterative-fit",
"folds": 10,
"fidelity": None,
},
"RF_SH-eta4-i_holdout_iterative_es_if": {
"resampling_strategy": "holdout-iterative-fit",
"fidelity": "SH",
},
"RF_SH-eta4-i_3CV_iterative_es_if": {
"resampling_strategy": "cv-iterative-fit",
"folds": 3,
"fidelity": "SH",
},
"RF_SH-eta4-i_5CV_iterative_es_if": {
"resampling_strategy": "cv-iterative-fit",
"folds": 5,
"fidelity": "SH",
},
"RF_SH-eta4-i_10CV_iterative_es_if": {
"resampling_strategy": "cv-iterative-fit",
"folds": 10,
"fidelity": "SH",
},
}[automl_policy]
resampling_strategy = setting["resampling_strategy"]
if resampling_strategy == "cv-iterative-fit":
resampling_strategy_kwargs = {"folds": setting["folds"]}
else:
resampling_strategy_kwargs = None
portfolio_file = (
self.this_directory
/ metric_name
/ "askl2_portfolios"
/ ("%s.json" % automl_policy)
)
with open(portfolio_file) as fh:
portfolio_json = json.load(fh)
portfolio = portfolio_json["portfolio"]
if setting["fidelity"] == "SH":
smac_callback = SHObjectCallback("iterations", 4, 5.0, portfolio)
else:
smac_callback = SmacObjectCallback(portfolio)
self.resampling_strategy = resampling_strategy
self.resampling_strategy_arguments = resampling_strategy_kwargs
self.get_smac_object_callback = smac_callback
return super().fit(
X=X,
y=y,
X_test=X_test,
y_test=y_test,
feat_type=feat_type,
dataset_name=dataset_name,
)