# Source code for autosklearn.ensembles.singlebest_ensemble

from __future__ import annotations

from typing import Sequence

import os

import numpy as np
from smac.runhistory.runhistory import RunHistory

from autosklearn.automl_common.common.utils.backend import Backend
from autosklearn.data.validation import SUPPORTED_FEAT_TYPES
from autosklearn.ensemble_building.run import Run
from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble
from autosklearn.metrics import Scorer, calculate_losses
from autosklearn.pipeline.base import BasePipeline


class AbstractSingleModelEnsemble(AbstractEnsemble):
    """Base class for ensembles that are made up of exactly one model.

    This class never assigns ``indices_``, ``identifiers_`` or
    ``best_model_score_`` itself; the accessors below only read them, so they
    have to be set (e.g. by a subclass) before those accessors are called.

    Parameters
    ----------
    task_type: int
        An identifier indicating which task is being performed.

    metrics: Sequence[Scorer] | Scorer
        The metrics used to evaluate the models. A single ``Scorer`` is
        wrapped into a one-element list.

    backend : Backend
        Gives access to the backend of Auto-sklearn. Only stored on the
        instance; not used by this class itself.

    random_state: int | RandomState | None = None
        Only stored on the instance; not used by this class itself.

    Raises
    ------
    TypeError
        If ``metrics`` is neither a ``Sequence`` nor a ``Scorer``.
    """

    def __init__(
        self,
        task_type: int,
        metrics: Sequence[Scorer] | Scorer,
        backend: Backend,
        random_state: int | np.random.RandomState | None = None,
    ):
        # The only member of the ensemble always carries the full weight.
        self.weights_ = [1.0]
        self.task_type = task_type

        # Normalise ``metrics`` so that ``self.metrics`` can always be indexed.
        if not isinstance(metrics, Sequence):
            if not isinstance(metrics, Scorer):
                raise TypeError(type(metrics))
            metrics = [metrics]
        self.metrics = metrics

        self.random_state = random_state
        self.backend = backend

    def fit(
        self,
        base_models_predictions: np.ndarray | list[np.ndarray],
        true_targets: np.ndarray,
        model_identifiers: list[tuple[int, int, float]],
        runs: Sequence[Run],
        X_data: SUPPORTED_FEAT_TYPES | None = None,
    ) -> AbstractSingleModelEnsemble:
        """No-op fit; none of the arguments are looked at here.

        Parameters
        ----------
        base_models_predictions: np.ndarray
            shape = (n_base_models, n_data_points, n_targets)
            n_targets is the number of classes in case of classification,
            n_targets is 0 or 1 in case of regression

            Can be a list of 2d numpy arrays as well to prevent copying all
            predictions into a single, large numpy array.

        true_targets : array of shape [n_targets]

        model_identifiers : identifier for each base model.
            Can be used for practical text output of the ensemble.

        runs: Sequence[Run]
            Additional information for each run executed by SMAC that was
            considered by the ensemble builder.

        X_data : list-like | sparse matrix | None = None

        Returns
        -------
        self
        """
        return self

    def predict(self, predictions: np.ndarray | list[np.ndarray]) -> np.ndarray:
        """Return the first entry of ``predictions``.

        Parameters
        ----------
        predictions : np.ndarray | list[np.ndarray]
            Predictions of the base models, indexed by model along the first
            axis (or as a list with one array per model).

        Returns
        -------
        np.ndarray
            ``predictions[0]``.
        """
        selected = predictions[0]
        return selected

    def __str__(self) -> str:
        # ``!s`` mirrors the ``%s`` conversion: every value goes through str().
        return (
            f"{self.__class__.__name__!s}:"
            f"\n\tMembers: {self.indices_!s}"  # type: ignore [attr-defined]
            f"\n\tWeights: {self.weights_!s}"
            f"\n\tIdentifiers: [{self.identifiers_[0]!s}]"  # type: ignore [attr-defined]  # noqa: E501
        )

    def get_models_with_weights(
        self, models: dict[tuple[int, int, float], BasePipeline]
    ) -> list[tuple[float, BasePipeline]]:
        """Pair the single selected model with its weight.

        Parameters
        ----------
        models : dict {identifier : model object}
            Mapping that is looked up with the first entry of
            ``identifiers_``.

        Returns
        -------
        list[tuple[float, BasePipeline]]
            A one-element list holding ``(weight, model)``.
        """
        weight = self.weights_[0]
        model = models[self.identifiers_[0]]  # type: ignore [attr-defined]
        return [(weight, model)]

    def get_identifiers_with_weights(
        self,
    ) -> list[tuple[tuple[int, int, float], float]]:
        """Return the (identifier, weight)-pairs of this ensemble.

        Returns
        -------
        list[tuple[tuple[int, int, float], float]
            ``identifiers_`` zipped with ``weights_``.
        """
        pairs = zip(self.identifiers_, self.weights_)  # type: ignore [attr-defined]
        return list(pairs)

    def get_selected_model_identifiers(self) -> list[tuple[int, int, float]]:
        """Return the stored ``identifiers_`` list unchanged.

        Returns
        -------
        list
        """
        return self.identifiers_  # type: ignore [attr-defined]

    def get_validation_performance(self) -> float:
        """Return the stored ``best_model_score_``.

        In case of multi-objective problem, only the first metric will be returned.

        Return
        ------
        float
        """
        return self.best_model_score_  # type: ignore [attr-defined]

class SingleModelEnsemble(AbstractSingleModelEnsemble):
    """Ensemble consisting of a single, pre-selected model.

    This class is used by the :class:`MultiObjectiveDummyEnsemble` to represent
    ensembles consisting of a single model, and this class should not be used
    on its own.

    Do not use by yourself!

    Parameters
    ----------
    task_type: int
        An identifier indicating which task is being performed.

    metrics: Sequence[Scorer] | Scorer
        The metrics used to evaluate the models.

    backend : Backend
        Gives access to the backend of Auto-sklearn. Not used.

    model_index : int
        Index of the model that constitutes the ensemble. ``fit`` uses it to
        pick the matching entry of ``model_identifiers`` and of
        ``base_models_predictions``.

    random_state: int | RandomState | None = None
        Not used.
    """

    def __init__(
        self,
        task_type: int,
        metrics: Sequence[Scorer] | Scorer,
        backend: Backend,
        model_index: int,
        random_state: int | np.random.RandomState | None = None,
    ):
        super().__init__(
            task_type=task_type,
            metrics=metrics,
            backend=backend,
            random_state=random_state,
        )
        # The selection is fixed at construction time; ``fit`` only reads it.
        self.indices_ = [model_index]

    def fit(
        self,
        base_models_predictions: np.ndarray | list[np.ndarray],
        true_targets: np.ndarray,
        model_identifiers: list[tuple[int, int, float]],
        runs: Sequence[Run],
        X_data: SUPPORTED_FEAT_TYPES | None = None,
    ) -> SingleModelEnsemble:
        """Dummy implementation of the ``fit`` method.

        The actual work of choosing the model index is done in the
        constructor. This method only stores the identifier of the selected
        model and computes its validation loss.

        Parameters
        ----------
        base_models_predictions: np.ndarray
            shape = (n_base_models, n_data_points, n_targets)
            n_targets is the number of classes in case of classification,
            n_targets is 0 or 1 in case of regression

            Can be a list of 2d numpy arrays as well to prevent copying all
            predictions into a single, large numpy array.

        true_targets : array of shape [n_targets]

        model_identifiers : identifier for each base model.
            Can be used for practical text output of the ensemble.

        runs: Sequence[Run]
            Additional information for each run executed by SMAC that was
            considered by the ensemble builder. Not used.

        X_data : list-like | spmatrix | None = None
            X data to feed to a metric if it requires it

        Returns
        -------
        self
        """
        selected_index = self.indices_[0]
        self.identifiers_ = [model_identifiers[selected_index]]

        # ``calculate_losses`` result is indexed by metric name; only the
        # first metric is kept as the validation score.
        losses_by_metric = calculate_losses(
            solution=true_targets,
            prediction=base_models_predictions[self.indices_[0]],
            task_type=self.task_type,
            metrics=self.metrics,
            X_data=X_data,
        )
        first_metric_name = self.metrics[0].name
        self.best_model_score_ = losses_by_metric[first_metric_name]
        return self
class SingleBest(AbstractSingleModelEnsemble):
    """Ensemble consisting of the single best model.

    Parameters
    ----------
    task_type: int
        An identifier indicating which task is being performed.

    metrics: Sequence[Scorer] | Scorer
        The metrics used to evaluate the models. Only the first metric is
        used to rank the base models.

    random_state: int | RandomState | None = None
        Not used.

    backend : Backend
        Gives access to the backend of Auto-sklearn. Not used.
    """

    def __init__(
        self,
        task_type: int,
        metrics: Sequence[Scorer] | Scorer,
        backend: Backend,
        random_state: int | np.random.RandomState | None = None,
    ):
        super().__init__(
            task_type=task_type,
            metrics=metrics,
            random_state=random_state,
            backend=backend,
        )

    def fit(
        self,
        base_models_predictions: np.ndarray | list[np.ndarray],
        true_targets: np.ndarray,
        model_identifiers: list[tuple[int, int, float]],
        runs: Sequence[Run],
        X_data: SUPPORTED_FEAT_TYPES | None = None,
    ) -> SingleBest:
        """Select the single best model.

        The loss of every base model is computed with ``calculate_losses`` and
        the model with the smallest loss on the first metric is selected.

        Parameters
        ----------
        base_models_predictions: np.ndarray
            shape = (n_base_models, n_data_points, n_targets)
            n_targets is the number of classes in case of classification,
            n_targets is 0 or 1 in case of regression

            Can be a list of 2d numpy arrays as well to prevent copying all
            predictions into a single, large numpy array.

        true_targets : array of shape [n_targets]

        model_identifiers : identifier for each base model.
            Can be used for practical text output of the ensemble.

        runs: Sequence[Run]
            Additional information for each run executed by SMAC that was
            considered by the ensemble builder. Not used.

        X_data : array-like | sparse matrix | None = None

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``base_models_predictions`` is empty, i.e. there is no model
            to choose from.
        """
        # ``np.argmin`` raises an opaque ValueError on an empty sequence;
        # fail early with a message that names the actual problem.
        if len(base_models_predictions) == 0:
            raise ValueError(
                "Cannot select a single best model: no base model predictions"
                " were given."
            )

        first_metric_name = self.metrics[0].name
        losses = [
            calculate_losses(
                solution=true_targets,
                prediction=base_model_prediction,
                task_type=self.task_type,
                metrics=self.metrics,
                X_data=X_data,
            )[first_metric_name]
            for base_model_prediction in base_models_predictions
        ]

        # Convert to a plain ``int`` so that ``indices_`` holds the same type
        # as in the sibling classes and prints without a NumPy scalar repr.
        best_index = int(np.argmin(losses))
        self.indices_ = [best_index]
        self.identifiers_ = [model_identifiers[best_index]]
        self.best_model_score_ = losses[best_index]
        return self
class SingleBestFromRunhistory(AbstractSingleModelEnsemble):
    """
    In the case of a crash, this class searches for the best individual model.

    Such model is returned as an ensemble of a single object, to comply with
    the expected interface of an AbstractEnsemble.

    Do not use by yourself!

    Parameters
    ----------
    task_type: int
        An identifier indicating which task is being performed.

    metrics: Sequence[Scorer] | Scorer
        The metrics used to evaluate the models. Only the first metric is
        used to rank the runs.

    backend : Backend
        Used to locate the model file of a candidate run on disk.

    run_history : RunHistory
        The run history that is searched for the best run.

    seed : int
        Passed to the backend, together with the run number and budget, to
        build the model directory and file name.

    random_state: int | RandomState | None = None
        Not used.

    Raises
    ------
    ValueError
        If no run in ``run_history`` yields a usable model (raised by
        ``get_identifiers_from_run_history`` during construction).
    """

    def __init__(
        self,
        task_type: int,
        metrics: Sequence[Scorer] | Scorer,
        backend: Backend,
        run_history: RunHistory,
        seed: int,
        random_state: int | np.random.RandomState | None = None,
    ):
        super().__init__(
            task_type=task_type,
            metrics=metrics,
            random_state=random_state,
            backend=backend,
        )

        # The seed here is separate from RandomState and is used to indicate a
        # directory for the backend to search in
        self.seed = seed
        self.indices_ = [0]
        self.weights_ = [1.0]
        self.run_history = run_history
        self.identifiers_ = self.get_identifiers_from_run_history()

    def get_identifiers_from_run_history(self) -> list[tuple[int, int, float]]:
        """Parse the run history to identify the best performing model.

        Every run's cost is converted to a score for the first metric; a run
        replaces the current best only if its score is strictly better and its
        model file exists on disk. As a side effect ``best_model_score_`` is
        set to the score of the selected run.

        Returns
        -------
        list[tuple[int, int, float]]
            A one-element list with the ``(seed, num_run, budget)`` identifier
            of the best run, which the backend uses to access the model.

        Raises
        ------
        ValueError
            If no run with an existing model file beats the metric's worst
            possible result.
        """
        metric = self.metrics[0]
        single_objective = len(self.metrics) == 1

        best_model_identifier: list[tuple[int, int, float]] = []
        best_model_score = metric._worst_possible_result

        for run_key, run_value in self.run_history.data.items():
            # With several metrics the cost is indexable; only the entry
            # belonging to the first metric is considered.
            cost = run_value.cost if single_objective else run_value.cost[0]
            score = metric._optimum - (metric._sign * cost)

            # The direction of "better" depends on the sign of the metric.
            is_better = (score > best_model_score and metric._sign > 0) or (
                score < best_model_score and metric._sign < 0
            )
            if not is_better:
                continue

            # Make sure that the individual best model actually exists
            num_run = run_value.additional_info["num_run"]
            model_dir = self.backend.get_numrun_directory(
                self.seed,
                num_run,
                run_key.budget,
            )
            model_file_name = self.backend.get_model_filename(
                self.seed,
                num_run,
                run_key.budget,
            )
            file_path = os.path.join(model_dir, model_file_name)
            if not os.path.exists(file_path):
                continue

            best_model_identifier = [(self.seed, num_run, run_key.budget)]
            best_model_score = score

        if not best_model_identifier:
            raise ValueError(
                "No valid model found in run history. This means smac was not able to"
                " fit a valid model. Please check the log file for errors."
            )

        self.best_model_score_ = best_model_score

        return best_model_identifier