from __future__ import annotations
from typing import Any, Dict, List, Mapping, Optional, Union
import hashlib
import json
import os
import pathlib
import pickle
import dask.distributed
import numpy as np
import pandas as pd
import scipy.sparse
import sklearn
from ConfigSpace import Configuration
import autosklearn
import autosklearn.experimental.selector
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble
from autosklearn.ensembles.ensemble_selection import EnsembleSelection
from autosklearn.metrics import Scorer, accuracy, balanced_accuracy, log_loss, roc_auc
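# Metrics for which pre-trained selector data ships alongside this module;
# in ``fit``, any other metric falls back to the balanced_accuracy selector.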
selector_metrics = (balanced_accuracy, roc_auc, log_loss)
class SmacObjectCallback:
def __init__(self, portfolio):
self.portfolio = portfolio
def __call__(
self,
scenario_dict,
seed,
ta,
ta_kwargs,
metalearning_configurations,
n_jobs,
dask_client,
multi_objective_algorithm,
multi_objective_kwargs,
):
from smac.facade.smac_ac_facade import SMAC4AC
from smac.intensification.simple_intensifier import SimpleIntensifier
from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost
from smac.scenario.scenario import Scenario
scenario = Scenario(scenario_dict)
initial_configurations = []
for member in self.portfolio.values():
try:
hp_names = scenario.cs.get_hyperparameter_names()
_member = {key: member[key] for key in member if key in hp_names}
initial_configurations.append(
Configuration(configuration_space=scenario.cs, values=_member)
)
except ValueError:
pass
rh2EPM = RunHistory2EPM4LogCost
return SMAC4AC(
scenario=scenario,
rng=seed,
runhistory2epm=rh2EPM,
tae_runner=ta,
tae_runner_kwargs=ta_kwargs,
initial_configurations=initial_configurations,
intensifier=SimpleIntensifier,
run_id=seed,
n_jobs=n_jobs,
dask_client=dask_client,
multi_objective_algorithm=multi_objective_algorithm,
multi_objective_kwargs=multi_objective_kwargs,
)
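# A minimal sketch of how this callback is wired up (the portfolio dict and
# its member name below are hypothetical; the real portfolios are the JSON
# files shipped next to this module):
#
#     portfolio = {"member_1": {"classifier:__choice__": "random_forest"}}
#     callback = SmacObjectCallback(portfolio)
#
# ``AutoSklearn2Classifier.fit`` assigns such a callback to
# ``self.get_smac_object_callback``; the AutoML backend then calls it with
# the SMAC scenario, seed, target-algorithm runner, etc. and receives a
# configured ``SMAC4AC`` instance back.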
class SHObjectCallback:
def __init__(self, budget_type, eta, initial_budget, portfolio):
self.budget_type = budget_type
self.eta = eta
self.initial_budget = initial_budget
self.portfolio = portfolio
def __call__(
self,
scenario_dict,
seed,
ta,
ta_kwargs,
metalearning_configurations,
n_jobs,
dask_client,
multi_objective_algorithm,
multi_objective_kwargs,
):
from smac.facade.smac_ac_facade import SMAC4AC
from smac.intensification.successive_halving import SuccessiveHalving
from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost
from smac.scenario.scenario import Scenario
scenario = Scenario(scenario_dict)
initial_configurations = []
for member in self.portfolio.values():
try:
hp_names = scenario.cs.get_hyperparameter_names()
_member = {key: member[key] for key in member if key in hp_names}
initial_configurations.append(
Configuration(configuration_space=scenario.cs, values=_member)
)
except ValueError:
pass
rh2EPM = RunHistory2EPM4LogCost
ta_kwargs["budget_type"] = self.budget_type
smac4ac = SMAC4AC(
scenario=scenario,
rng=seed,
runhistory2epm=rh2EPM,
tae_runner=ta,
tae_runner_kwargs=ta_kwargs,
initial_configurations=initial_configurations,
run_id=seed,
intensifier=SuccessiveHalving,
intensifier_kwargs={
"initial_budget": self.initial_budget,
"max_budget": 100,
"eta": self.eta,
"min_chall": 1,
},
dask_client=dask_client,
n_jobs=n_jobs,
multi_objective_algorithm=multi_objective_algorithm,
multi_objective_kwargs=multi_objective_kwargs,
)
smac4ac.solver.epm_chooser.min_samples_model = int(
len(scenario.cs.get_hyperparameters()) / 2
)
return smac4ac
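# The successive-halving variant is constructed the same way; the arguments
# mirror the values ``fit`` passes below ("iterations" as budget type,
# eta=4, initial budget 5.0):
#
#     callback = SHObjectCallback("iterations", 4, 5.0, portfolio)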
class AutoSklearn2Classifier(AutoSklearnClassifier):
def __init__(
self,
time_left_for_this_task: int = 3600,
per_run_time_limit=None,
ensemble_size: int | None = None,
ensemble_class: AbstractEnsemble | None = EnsembleSelection,
ensemble_kwargs: Dict[str, Any] | None = None,
ensemble_nbest: Union[float, int] = 50,
max_models_on_disc: int = 50,
seed: int = 1,
memory_limit: int = 3072,
tmp_folder: Optional[str] = None,
delete_tmp_folder_after_terminate: bool = True,
n_jobs: Optional[int] = None,
dask_client: Optional[dask.distributed.Client] = None,
disable_evaluator_output: bool = False,
smac_scenario_args: Optional[Dict[str, Any]] = None,
logging_config: Optional[Dict[str, Any]] = None,
metric: Optional[Scorer] = None,
scoring_functions: Optional[List[Scorer]] = None,
load_models: bool = True,
dataset_compression: Union[bool, Mapping[str, Any]] = True,
allow_string_features: bool = True,
disable_progress_bar: bool = False,
):
"""
Parameters
----------
time_left_for_this_task : int, optional (default=3600)
Time limit in seconds for the search of appropriate
models. By increasing this value, *auto-sklearn* has a higher
chance of finding better models.
per_run_time_limit : int, optional (default=1/10 of time_left_for_this_task)
Time limit for a single call to the machine learning model.
Model fitting will be terminated if the machine learning
algorithm runs over the time limit. Set this value high enough so
that typical machine learning algorithms can be fit on the
training data.
ensemble_size : int, optional
Number of models added to the ensemble built by *Ensemble
selection from libraries of models*. Models are drawn with
replacement. If set to ``0`` no ensemble is fit.
Deprecated - will be removed in Auto-sklearn 0.16. Please pass
this argument via ``ensemble_kwargs={"ensemble_size": int}``
if you want to change the ensemble size for ensemble selection.
ensemble_class : Type[AbstractEnsemble], optional (default=EnsembleSelection)
            Class implementing the post-hoc ensemble algorithm. Set to
            ``None`` to disable ensemble building, or use ``SingleBest``
            to obtain only the single best model instead of an
            ensemble.
        ensemble_kwargs : Dict, optional
            Keyword arguments that are passed to the ensemble class upon
            initialization.
        ensemble_nbest : int or float, optional (default=50)
            Only consider the ``ensemble_nbest`` best models when building
            an ensemble: an absolute number of models if an int, a fraction
            of the available models if a float.
        max_models_on_disc : int, optional (default=50)
            Defines the maximum number of models that are kept on disc.
            Additional models are permanently deleted. Because of this, it
            also sets the upper limit on how many models can be used for an
            ensemble. It must be an integer greater than or equal to 1. If
            set to ``None``, all models are kept on the disc.
seed : int, optional (default=1)
Used to seed SMAC. Will determine the output file names.
memory_limit : int, optional (3072)
Memory limit in MB for the machine learning algorithm.
`auto-sklearn` will stop fitting the machine learning algorithm if
it tries to allocate more than ``memory_limit`` MB.
**Important notes:**
* If ``None`` is provided, no memory limit is set.
* In case of multi-processing, ``memory_limit`` will be *per job*, so the total usage is
``n_jobs x memory_limit``.
* The memory limit also applies to the ensemble creation process.
        tmp_folder : string, optional (None)
            Folder to store configuration output and log files; if ``None``,
            ``/tmp/autosklearn_tmp_$pid_$random_number`` is used
            automatically.
        delete_tmp_folder_after_terminate : bool, optional (True)
            Remove ``tmp_folder`` when finished. If ``tmp_folder`` is
            ``None``, the temporary directory will always be deleted.
n_jobs : int, optional, experimental
The number of jobs to run in parallel for ``fit()``. ``-1`` means
using all processors.
**Important notes**:
* By default, Auto-sklearn uses one core.
* Ensemble building is not affected by ``n_jobs`` but can be controlled by the number
of models in the ensemble.
* ``predict()`` is not affected by ``n_jobs`` (in contrast to most scikit-learn models)
* If ``dask_client`` is ``None``, a new dask client is created.
dask_client : dask.distributed.Client, optional
User-created dask client, can be used to start a dask cluster and then
attach auto-sklearn to it.
        disable_evaluator_output : bool or list, optional (False)
If True, disable model and prediction output. Cannot be used
together with ensemble building. ``predict()`` cannot be used when
setting this True. Can also be used as a list to pass more
fine-grained information on what to save. Allowed elements in the
list are:
* ``'y_optimization'`` : do not save the predictions for the
optimization/validation set, which would later on be used to build
an ensemble.
            * ``'model'`` : do not save any model files.
smac_scenario_args : dict, optional (None)
Additional arguments inserted into the scenario of SMAC. See the
`SMAC documentation <https://automl.github.io/SMAC3/main/api/smac.scenario.html#smac.scenario.Scenario>`_
for a list of available arguments.
logging_config : dict, optional (None)
dictionary object specifying the logger configuration. If None,
the default logging.yaml file is used, which can be found in
the directory ``util/logging.yaml`` relative to the installation.
metric : Scorer, optional (None)
An instance of :class:`autosklearn.metrics.Scorer` as created by
:meth:`autosklearn.metrics.make_scorer`. These are the `Built-in
Metrics`_.
If None is provided, a default metric is selected depending on the task.
        scoring_functions : List[Scorer], optional (None)
            List of scorers which will be calculated for each pipeline and
            whose results will be available via ``cv_results_``.
        load_models : bool, optional (True)
            Whether to load the models after fitting Auto-sklearn.
        dataset_compression : bool or Mapping, optional (True)
            Whether (and how) to compress the training dataset so that it
            fits into memory before fitting; see the Auto-sklearn
            documentation for the supported mapping keys.
        allow_string_features : bool, optional (True)
            Whether Auto-sklearn should process string features as text;
            if ``False``, string features are treated as categorical.
        disable_progress_bar : bool, optional (False)
            Whether to disable the progress bar that is displayed in the
            console while fitting to the training data.
Attributes
----------
cv_results_ : dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
imported into a pandas ``DataFrame``.
Not all keys returned by scikit-learn are supported yet.
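
        Examples
        --------
        A minimal usage sketch (``X_train``, ``y_train`` and ``X_test`` are
        assumed to be an ordinary classification dataset; this example is
        illustrative rather than part of the original documentation):

        >>> from autosklearn.experimental.askl2 import AutoSklearn2Classifier
        >>> clf = AutoSklearn2Classifier(time_left_for_this_task=120)  # doctest: +SKIP
        >>> clf.fit(X_train, y_train)  # doctest: +SKIP
        >>> predictions = clf.predict(X_test)  # doctest: +SKIP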
""" # noqa (links are too long)
self.required_training = False # Boolean to indicate if selectors were trained.
include_estimators = [
"extra_trees",
"passive_aggressive",
"random_forest",
"sgd",
"gradient_boosting",
"mlp",
]
include_preprocessors = ["no_preprocessing"]
include = {
"classifier": include_estimators,
"feature_preprocessor": include_preprocessors,
}
self.train_selectors(selected_metric=metric)
super().__init__(
time_left_for_this_task=time_left_for_this_task,
per_run_time_limit=per_run_time_limit,
initial_configurations_via_metalearning=0,
ensemble_size=ensemble_size,
ensemble_class=ensemble_class,
ensemble_kwargs=ensemble_kwargs,
ensemble_nbest=ensemble_nbest,
max_models_on_disc=max_models_on_disc,
seed=seed,
memory_limit=memory_limit,
include=include,
exclude=None,
resampling_strategy=None,
resampling_strategy_arguments=None,
tmp_folder=tmp_folder,
delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
n_jobs=n_jobs,
dask_client=dask_client,
disable_evaluator_output=disable_evaluator_output,
get_smac_object_callback=None,
smac_scenario_args=smac_scenario_args,
logging_config=logging_config,
metadata_directory=None,
metric=metric,
scoring_functions=scoring_functions,
load_models=load_models,
allow_string_features=allow_string_features,
disable_progress_bar=disable_progress_bar,
)
def train_selectors(self, selected_metric=None):
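        """Train the per-metric policy selectors if they are not yet cached.

        For every metric with training data shipped next to this module, an
        OVORF selector (wrapped in a FallbackWrapper) is fitted and pickled
        into ``$XDG_CACHE_HOME/auto-sklearn`` (or a directory under the home
        directory when ``XDG_CACHE_HOME`` is unset). Existing cache files are
        left untouched and later loaded by ``fit``.
        """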
self.selector_metrics = (balanced_accuracy, roc_auc, log_loss)
self.selector_files = {}
self.this_directory = pathlib.Path(__file__).resolve().parent
if selected_metric is not None:
metric_list = [selected_metric]
else:
metric_list = self.selector_metrics
for metric in metric_list:
training_data_file = (
self.this_directory / metric.name / "askl2_training_data.json"
)
with open(training_data_file) as fh:
training_data = json.load(fh)
fh.seek(0)
m = hashlib.md5()
m.update(fh.read().encode("utf8"))
training_data_hash = m.hexdigest()[:10]
selector_filename = "askl2_selector_%s_%s_%s_%s.pkl" % (
autosklearn.__version__,
sklearn.__version__,
metric.name,
training_data_hash,
)
selector_directory = os.environ.get("XDG_CACHE_HOME")
if selector_directory is None:
selector_directory = pathlib.Path.home()
selector_directory = (
pathlib.Path(selector_directory).joinpath("auto-sklearn").expanduser()
)
self.selector_files[metric.name] = selector_directory / selector_filename
metafeatures = pd.DataFrame(training_data["metafeatures"])
self.strategies = training_data["strategies"]
y_values = pd.DataFrame(
training_data["y_values"],
columns=self.strategies,
index=metafeatures.index,
)
minima_for_methods = training_data["minima_for_methods"]
maxima_for_methods = training_data["maxima_for_methods"]
default_strategies = training_data["tie_break_order"]
if not self.selector_files[metric.name].exists():
self.required_training = True
selector = autosklearn.experimental.selector.OVORF(
configuration=training_data["configuration"],
random_state=np.random.RandomState(1),
n_estimators=500,
tie_break_order=default_strategies,
)
selector = autosklearn.experimental.selector.FallbackWrapper(
selector, default_strategies
)
selector.fit(
X=metafeatures,
y=y_values,
minima=minima_for_methods,
maxima=maxima_for_methods,
)
self.selector_files[metric.name].parent.mkdir(
exist_ok=True, parents=True
)
try:
with open(self.selector_files[metric.name], "wb") as fh:
pickle.dump(selector, fh)
                except Exception as e:
                    print(
                        "AutoSklearn2Classifier needs to create a selector "
                        "file under the user's home directory or "
                        "XDG_CACHE_HOME, but the path {} is not "
                        "writable.".format(self.selector_files[metric.name])
                    )
                    raise e
    def fit(
self,
X,
y,
X_test=None,
y_test=None,
metric=None,
feat_type=None,
dataset_name=None,
):
        # TODO: regularly check
        # https://github.com/scikit-learn/scikit-learn/issues/15336 to see
        # whether histogram-based gradient boosting in scikit-learn finally
        # supports sparse data; until then, it is excluded from the search
        # space for sparse inputs below.
is_sparse = scipy.sparse.issparse(X)
if is_sparse:
include_estimators = [
"extra_trees",
"passive_aggressive",
"random_forest",
"sgd",
"mlp",
]
else:
include_estimators = [
"extra_trees",
"passive_aggressive",
"random_forest",
"sgd",
"gradient_boosting",
"mlp",
]
self.include["classifier"] = include_estimators
if self.metric is None:
if len(y.shape) == 1 or y.shape[1] == 1:
self.metric = accuracy
else:
self.metric = log_loss
if self.metric in self.selector_metrics:
metric_name = self.metric.name
selector_file = self.selector_files[metric_name]
else:
metric_name = "balanced_accuracy"
selector_file = self.selector_files[metric_name]
with open(selector_file, "rb") as fh:
selector = pickle.load(fh)
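        # The policy selector sees only two meta-features per dataset:
        # the number of features and the number of samples.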
metafeatures = pd.DataFrame(
{dataset_name: [X.shape[1], X.shape[0]]}
).transpose()
selection = np.argmax(selector.predict(metafeatures))
automl_policy = self.strategies[selection]
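        # Policy names encode the resampling strategy and fidelity schedule:
        # e.g. "RF_SH-eta4-i_3CV_iterative_es_if" means successive halving
        # (eta=4, budget measured in iterations) with 3-fold cross-validation,
        # while "RF_None_holdout_iterative_es_if" means plain holdout with no
        # fidelity schedule.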
setting = {
"RF_None_holdout_iterative_es_if": {
"resampling_strategy": "holdout-iterative-fit",
"fidelity": None,
},
"RF_None_3CV_iterative_es_if": {
"resampling_strategy": "cv-iterative-fit",
"folds": 3,
"fidelity": None,
},
"RF_None_5CV_iterative_es_if": {
"resampling_strategy": "cv-iterative-fit",
"folds": 5,
"fidelity": None,
},
"RF_None_10CV_iterative_es_if": {
"resampling_strategy": "cv-iterative-fit",
"folds": 10,
"fidelity": None,
},
"RF_SH-eta4-i_holdout_iterative_es_if": {
"resampling_strategy": "holdout-iterative-fit",
"fidelity": "SH",
},
"RF_SH-eta4-i_3CV_iterative_es_if": {
"resampling_strategy": "cv-iterative-fit",
"folds": 3,
"fidelity": "SH",
},
"RF_SH-eta4-i_5CV_iterative_es_if": {
"resampling_strategy": "cv-iterative-fit",
"folds": 5,
"fidelity": "SH",
},
"RF_SH-eta4-i_10CV_iterative_es_if": {
"resampling_strategy": "cv-iterative-fit",
"folds": 10,
"fidelity": "SH",
},
}[automl_policy]
resampling_strategy = setting["resampling_strategy"]
if resampling_strategy == "cv-iterative-fit":
resampling_strategy_kwargs = {"folds": setting["folds"]}
else:
resampling_strategy_kwargs = None
portfolio_file = (
self.this_directory
/ metric_name
/ "askl2_portfolios"
/ ("%s.json" % automl_policy)
)
with open(portfolio_file) as fh:
portfolio_json = json.load(fh)
portfolio = portfolio_json["portfolio"]
if setting["fidelity"] == "SH":
smac_callback = SHObjectCallback("iterations", 4, 5.0, portfolio)
else:
smac_callback = SmacObjectCallback(portfolio)
self.resampling_strategy = resampling_strategy
self.resampling_strategy_arguments = resampling_strategy_kwargs
self.get_smac_object_callback = smac_callback
return super().fit(
X=X,
y=y,
X_test=X_test,
y_test=y_test,
feat_type=feat_type,
dataset_name=dataset_name,
)