# Source code for autosklearn.ensembles.singlebest_ensemble

from __future__ import annotations

from typing import Sequence

import os

import numpy as np
from smac.runhistory.runhistory import RunHistory

from autosklearn.automl_common.common.utils.backend import Backend
from autosklearn.data.validation import SUPPORTED_FEAT_TYPES
from autosklearn.ensemble_building.run import Run
from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble
from autosklearn.metrics import Scorer, calculate_losses
from autosklearn.pipeline.base import BasePipeline


class AbstractSingleModelEnsemble(AbstractEnsemble):
    """Base class for ensembles that are made up of exactly one model.

    The single member always carries a weight of ``1.0``. This class never
    assigns ``indices_``, ``identifiers_`` or ``best_model_score_`` itself; the
    accessor methods below read them, so subclasses are responsible for setting
    them.

    Parameters
    ----------
    task_type: int
        An identifier indicating which task is being performed.

    metrics: Sequence[Scorer] | Scorer
        The metrics used to evaluate the models. A single ``Scorer`` is wrapped
        into a one-element list.

    backend : Backend
        Gives access to the backend of Auto-sklearn. Only stored by this class.

    random_state: int | RandomState | None = None
        Only stored by this class.

    Raises
    ------
    TypeError
        If ``metrics`` is neither a ``Sequence`` nor a ``Scorer``.
    """

    def __init__(
        self,
        task_type: int,
        metrics: Sequence[Scorer] | Scorer,
        backend: Backend,
        random_state: int | np.random.RandomState | None = None,
    ):
        # Reject unsupported metric containers up front.
        if not isinstance(metrics, (Sequence, Scorer)):
            raise TypeError(type(metrics))

        # A sequence is stored as given, a lone scorer is wrapped in a list.
        self.metrics = metrics if isinstance(metrics, Sequence) else [metrics]
        self.task_type = task_type
        self.backend = backend
        self.random_state = random_state
        # The one and only member of the ensemble gets the full weight.
        self.weights_ = [1.0]

    def fit(
        self,
        base_models_predictions: np.ndarray | list[np.ndarray],
        true_targets: np.ndarray,
        model_identifiers: list[tuple[int, int, float]],
        runs: Sequence[Run],
        X_data: SUPPORTED_FEAT_TYPES | None = None,
    ) -> AbstractSingleModelEnsemble:
        """Fit the ensemble.

        This base implementation does nothing with its arguments and returns
        ``self`` unchanged.

        Parameters
        ----------
        base_models_predictions: np.ndarray
            shape = (n_base_models, n_data_points, n_targets)
            n_targets is the number of classes in case of classification,
            n_targets is 0 or 1 in case of regression

            Can be a list of 2d numpy arrays as well to prevent copying all
            predictions into a single, large numpy array.

        true_targets : array of shape [n_targets]

        model_identifiers : identifier for each base model.
            Can be used for practical text output of the ensemble.

        runs: Sequence[Run]
            Additional information for each run executed by SMAC that was
            considered by the ensemble builder.

        X_data : list-like | sparse matrix | None = None

        Returns
        -------
        self
        """
        return self

    def predict(self, predictions: np.ndarray | list[np.ndarray]) -> np.ndarray:
        """Return the predictions of the selected model.

        Parameters
        ----------
        predictions : np.ndarray | list[np.ndarray]
            Predictions indexed by model along the first axis. Only the first
            entry is used.

        Returns
        -------
        np.ndarray
            ``predictions[0]``
        """
        first_entry = predictions[0]
        return first_entry

    def __str__(self) -> str:
        class_name = self.__class__.__name__
        members = self.indices_  # type: ignore [attr-defined]
        weights = self.weights_
        identifier = self.identifiers_[0]  # type: ignore [attr-defined]
        return (
            f"{class_name}:\n\tMembers: {members}"
            f"\n\tWeights: {weights}\n\tIdentifiers: [{identifier}]"
        )

    def get_models_with_weights(
        self, models: dict[tuple[int, int, float], BasePipeline]
    ) -> list[tuple[float, BasePipeline]]:
        """List of (weight, model) pairs for the model selected by this ensemble.

        Parameters
        ----------
        models : dict {identifier : model object}
            The identifiers are the same as the one presented to the fit()
            method. The model stored under the selected identifier is returned.

        Returns
        -------
        list[tuple[float, BasePipeline]]
            A one-element list.
        """
        weight = self.weights_[0]
        identifier = self.identifiers_[0]  # type: ignore [attr-defined]
        return [(weight, models[identifier])]

    def get_identifiers_with_weights(
        self,
    ) -> list[tuple[tuple[int, int, float], float]]:
        """Return (identifier, weight)-pairs for the model selected by this ensemble.

        Returns
        -------
        list[tuple[tuple[int, int, float], float]]
        """
        identifiers = self.identifiers_  # type: ignore [attr-defined]
        return [
            (identifier, weight)
            for identifier, weight in zip(identifiers, self.weights_)
        ]

    def get_selected_model_identifiers(self) -> list[tuple[int, int, float]]:
        """Return identifier of models in the ensemble.

        This includes models which have a weight of zero!

        Returns
        -------
        list
        """
        selected = self.identifiers_  # type: ignore [attr-defined]
        return selected

    def get_validation_performance(self) -> float:
        """Return validation performance of ensemble.

        In case of multi-objective problem, only the first metric will be returned.

        Return
        ------
        float
        """
        performance = self.best_model_score_  # type: ignore [attr-defined]
        return performance


class SingleModelEnsemble(AbstractSingleModelEnsemble):
    """Ensemble consisting of a single model.

    This class is used by the :class:`MultiObjectiveDummyEnsemble` to represent
    ensembles consisting of a single model, and this class should not be used
    on its own.

    Do not use by yourself!

    Parameters
    ----------
    task_type: int
        An identifier indicating which task is being performed.

    metrics: Sequence[Scorer] | Scorer
        The metrics used to evaluate the models.

    backend : Backend
        Gives access to the backend of Auto-sklearn. Not used.

    model_index : int
        Index of the model that constitutes the ensemble. ``fit`` uses it to
        pick the matching entry from the predictions and the model identifiers
        it is given.

    random_state: int | RandomState | None = None
        Not used.
    """

    def __init__(
        self,
        task_type: int,
        metrics: Sequence[Scorer] | Scorer,
        backend: Backend,
        model_index: int,
        random_state: int | np.random.RandomState | None = None,
    ):
        super().__init__(
            task_type=task_type,
            metrics=metrics,
            random_state=random_state,
            backend=backend,
        )
        # The member is fixed at construction time; ``fit`` only looks it up.
        self.indices_ = [model_index]

    def fit(
        self,
        base_models_predictions: np.ndarray | list[np.ndarray],
        true_targets: np.ndarray,
        model_identifiers: list[tuple[int, int, float]],
        runs: Sequence[Run],
        X_data: SUPPORTED_FEAT_TYPES | None = None,
    ) -> SingleModelEnsemble:
        """Dummy implementation of the ``fit`` method.

        The actual work of passing the model index is done in the constructor.
        This method only stores the identifier of the selected model and
        computes its validation loss.

        Parameters
        ----------
        base_models_predictions: np.ndarray
            shape = (n_base_models, n_data_points, n_targets)
            n_targets is the number of classes in case of classification,
            n_targets is 0 or 1 in case of regression

            Can be a list of 2d numpy arrays as well to prevent copying all
            predictions into a single, large numpy array.

        true_targets : array of shape [n_targets]

        model_identifiers : identifier for each base model.
            Can be used for practical text output of the ensemble.

        runs: Sequence[Run]
            Additional information for each run executed by SMAC that was
            considered by the ensemble builder. Not used.

        X_data : list-like | spmatrix | None = None
           X data to feed to a metric if it requires it

        Returns
        -------
        self
        """
        selected = self.indices_[0]
        self.identifiers_ = [model_identifiers[selected]]

        losses = calculate_losses(
            solution=true_targets,
            prediction=base_models_predictions[selected],
            task_type=self.task_type,
            metrics=self.metrics,
            X_data=X_data,
        )
        # Only the first metric is reported as the ensemble's score.
        self.best_model_score_ = losses[self.metrics[0].name]
        return self


class SingleBest(AbstractSingleModelEnsemble):
    """Ensemble consisting of the single best model.

    The best model is the one with the lowest loss on the first metric, as
    computed in :meth:`fit`.

    Parameters
    ----------
    task_type: int
        An identifier indicating which task is being performed.

    metrics: Sequence[Scorer] | Scorer
        The metrics used to evaluate the models.

    random_state: int | RandomState | None = None
        Not used.

    backend : Backend
        Gives access to the backend of Auto-sklearn. Not used.
    """

    def __init__(
        self,
        task_type: int,
        metrics: Sequence[Scorer] | Scorer,
        backend: Backend,
        random_state: int | np.random.RandomState | None = None,
    ):
        super().__init__(
            task_type=task_type,
            metrics=metrics,
            random_state=random_state,
            backend=backend,
        )

    def fit(
        self,
        base_models_predictions: np.ndarray | list[np.ndarray],
        true_targets: np.ndarray,
        model_identifiers: list[tuple[int, int, float]],
        runs: Sequence[Run],
        X_data: SUPPORTED_FEAT_TYPES | None = None,
    ) -> SingleBest:
        """Select the single best model.

        Computes the loss of every base model and keeps the index, identifier
        and loss of the model with the smallest loss on the first metric.

        Parameters
        ----------
        base_models_predictions: np.ndarray
            shape = (n_base_models, n_data_points, n_targets)
            n_targets is the number of classes in case of classification,
            n_targets is 0 or 1 in case of regression

            Can be a list of 2d numpy arrays as well to prevent copying all
            predictions into a single, large numpy array.

        true_targets : array of shape [n_targets]

        model_identifiers : identifier for each base model.
            Can be used for practical text output of the ensemble.

        runs: Sequence[Run]
            Additional information for each run executed by SMAC that was
            considered by the ensemble builder. Not used.

        X_data : array-like | sparse matrix | None = None
            Forwarded to ``calculate_losses``.

        Returns
        -------
        self
        """
        first_metric_name = self.metrics[0].name
        losses = [
            calculate_losses(
                solution=true_targets,
                prediction=base_model_prediction,
                task_type=self.task_type,
                metrics=self.metrics,
                X_data=X_data,
            )[first_metric_name]
            for base_model_prediction in base_models_predictions
        ]

        # np.argmin returns a NumPy integer scalar; store a plain ``int`` so
        # ``indices_`` prints as ``[i]`` regardless of the NumPy version and
        # matches the plain index kept by ``SingleModelEnsemble``.
        best_index = int(np.argmin(losses))
        self.indices_ = [best_index]
        self.identifiers_ = [model_identifiers[best_index]]
        self.best_model_score_ = losses[best_index]
        return self


class SingleBestFromRunhistory(AbstractSingleModelEnsemble):
    """Single-model ensemble built from the best entry of a run history.

    In the case of a crash, this class searches for the best individual model.

    Such model is returned as an ensemble of a single object, to comply with
    the expected interface of an AbstractEnsemble.

    Do not use by yourself!

    Parameters
    ----------
    task_type: int
        An identifier indicating which task is being performed.

    metrics: Sequence[Scorer] | Scorer
        The metrics used to evaluate the models. Only the first one is used to
        rank the runs.

    backend : Backend
        Used to resolve the directory and file name of each candidate model so
        that its existence on disk can be checked.

    run_history : RunHistory
        The run history that is searched for the best model.

    seed : int
        Passed to the backend to locate model files and used as the first
        component of the selected model's identifier.

    random_state: int | RandomState | None = None
        Not used.

    Raises
    ------
    ValueError
        If no run in the run history has both a score better than the worst
        possible result and a model file on disk.
    """

    def __init__(
        self,
        task_type: int,
        metrics: Sequence[Scorer] | Scorer,
        backend: Backend,
        run_history: RunHistory,
        seed: int,
        random_state: int | np.random.RandomState | None = None,
    ):
        super().__init__(
            task_type=task_type,
            metrics=metrics,
            random_state=random_state,
            backend=backend,
        )

        # The seed here is separate from RandomState and is used to indicate a
        # directory for the backend to search in
        self.seed = seed
        self.indices_ = [0]
        self.weights_ = [1.0]
        self.run_history = run_history
        self.identifiers_ = self.get_identifiers_from_run_history()

    def get_identifiers_from_run_history(self) -> list[tuple[int, int, float]]:
        """Parse the run history to identify the best performing model.

        A run is a candidate only if its score on the first metric strictly
        improves on the best score seen so far (starting from the metric's
        worst possible result) and its model file exists on disk.

        As a side effect, ``best_model_score_`` is set to the winning score.

        Returns
        -------
        list[tuple[int, int, float]]
            A one-element list with the ``(seed, num_run, budget)`` identifier
            of the best model, which the backend uses to access the model.

        Raises
        ------
        ValueError
            If no run qualifies.
        """
        metric = self.metrics[0]
        # With several metrics the cost is indexable and only the entry of
        # the first metric is used.
        single_objective = len(self.metrics) == 1

        best_model_identifier: list[tuple[int, int, float]] = []
        best_model_score = metric._worst_possible_result

        for run_key, run_value in self.run_history.data.items():
            cost = run_value.cost if single_objective else run_value.cost[0]
            score = metric._optimum - (metric._sign * cost)

            # The direction of "better" depends on the sign of the metric.
            improves = (score > best_model_score and metric._sign > 0) or (
                score < best_model_score and metric._sign < 0
            )
            if not improves:
                continue

            num_run = run_value.additional_info["num_run"]
            budget = run_key.budget

            # Make sure that the individual best model actually exists
            model_dir = self.backend.get_numrun_directory(self.seed, num_run, budget)
            model_file_name = self.backend.get_model_filename(
                self.seed, num_run, budget
            )
            if not os.path.exists(os.path.join(model_dir, model_file_name)):
                continue

            best_model_identifier = [(self.seed, num_run, budget)]
            best_model_score = score

        if not best_model_identifier:
            raise ValueError(
                "No valid model found in run history. This means smac was not able to"
                " fit a valid model. Please check the log file for errors."
            )

        self.best_model_score_ = best_model_score

        return best_model_identifier