from __future__ import annotations
from typing import Sequence
import warnings
import numpy as np
from sklearn.exceptions import NotFittedError
from autosklearn.automl_common.common.utils.backend import Backend
from autosklearn.constants import TASK_TYPES
from autosklearn.data.validation import SUPPORTED_FEAT_TYPES
from autosklearn.ensemble_building.run import Run
from autosklearn.ensembles.abstract_ensemble import (
AbstractEnsemble,
AbstractMultiObjectiveEnsemble,
)
from autosklearn.ensembles.singlebest_ensemble import SingleModelEnsemble
from autosklearn.metrics import Scorer, calculate_losses
from autosklearn.pipeline.base import BasePipeline
from autosklearn.util.multiobjective import pareto_front
class MultiObjectiveDummyEnsemble(AbstractMultiObjectiveEnsemble):
def __init__(
self,
task_type: int,
metrics: Sequence[Scorer] | Scorer,
backend: Backend,
random_state: int | np.random.RandomState | None = None,
) -> None:
"""A dummy implementation of a multi-objective ensemble.
Builds ensembles that are individual models on the Pareto front each.
Parameters
----------
task_type: int
An identifier indicating which task is being performed.
metrics: Sequence[Scorer] | Scorer
The metrics used to evaluate the models.
backend : Backend
Gives access to the backend of Auto-sklearn. Not used.
random_state: int | RandomState | None = None
Not used.
"""
self.task_type = task_type
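        # Store the metrics as a sequence: a lone Scorer is wrapped in a list,
        # and a sequence of length one triggers a warning because a
        # single-objective ensemble would be more appropriate in that case.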
if isinstance(metrics, Sequence):
if len(metrics) == 1:
warnings.warn(
"Passed only a single metric to a multi-objective ensemble. "
"Please use a single-objective ensemble in such cases."
)
self.metrics = metrics
else:
            self.metrics = [metrics]
self.random_state = random_state
self.backend = backend
@property
def pareto_set(self) -> Sequence[AbstractEnsemble]:
if not hasattr(self, "pareto_set_"):
raise NotFittedError("`pareto_set` not created, please call `fit()` first")
return self.pareto_set_
    def fit(
self,
base_models_predictions: list[np.ndarray],
true_targets: np.ndarray,
model_identifiers: list[tuple[int, int, float]],
runs: Sequence[Run],
X_data: SUPPORTED_FEAT_TYPES | None = None,
) -> MultiObjectiveDummyEnsemble:
"""Select dummy ensembles given predictions of base models and targets.
Parameters
----------
base_models_predictions: np.ndarray
shape = (n_base_models, n_data_points, n_targets)
n_targets is the number of classes in case of classification,
n_targets is 0 or 1 in case of regression
Can be a list of 2d numpy arrays as well to prevent copying all
predictions into a single, large numpy array.
true_targets : array of shape [n_targets]
model_identifiers : identifier for each base model.
Can be used for practical text output of the ensemble.
runs: Sequence[Run]
Additional information for each run executed by SMAC that was
considered by the ensemble builder. Not used.
X_data : list-like | sparse matrix | None = None
X data to give to the metric if required
Returns
-------
self
"""
if self.task_type not in TASK_TYPES:
raise ValueError("Unknown task type %s." % self.task_type)
all_costs = np.empty((len(base_models_predictions), len(self.metrics)))
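        # Evaluate every base model on every metric; row i of `all_costs`
        # holds the losses of base model i, one column per metric.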
for i, base_model_prediction in enumerate(base_models_predictions):
losses = calculate_losses(
solution=true_targets,
prediction=base_model_prediction,
task_type=self.task_type,
metrics=self.metrics,
X_data=X_data,
)
all_costs[i] = [losses[metric.name] for metric in self.metrics]
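        # Keep only the non-dominated (Pareto-optimal) models and iterate over
        # them ordered by their loss on the first metric, so that
        # `pareto_set_[0]` is the best ensemble w.r.t. the 1st metric.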
sort_by_first_metric = np.argsort(all_costs[:, 0])
efficient_points = pareto_front(all_costs, is_loss=True)
pareto_set = []
for argsort_idx in sort_by_first_metric:
if not efficient_points[argsort_idx]:
continue
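            # Wrap each Pareto-optimal model in its own SingleModelEnsemble so
            # that every entry of the Pareto set is a fitted ensemble.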
ensemble = SingleModelEnsemble(
task_type=self.task_type,
metrics=self.metrics,
random_state=self.random_state,
backend=self.backend,
model_index=argsort_idx,
)
ensemble.fit(
base_models_predictions=base_models_predictions,
true_targets=true_targets,
model_identifiers=model_identifiers,
runs=runs,
X_data=X_data,
)
pareto_set.append(ensemble)
self.pareto_set_ = pareto_set
return self
    def predict(
self, base_models_predictions: np.ndarray | list[np.ndarray]
) -> np.ndarray:
"""Predict using the ensemble which is best for the 1st metric.
Parameters
----------
base_models_predictions : np.ndarray
shape = (n_base_models, n_data_points, n_targets)
Same as in the fit method.
Returns
-------
np.ndarray
"""
return self.pareto_set[0].predict(base_models_predictions)
def __str__(self) -> str:
return "MultiObjectiveDummyEnsemble: %d models" % len(self.pareto_set)
    def get_models_with_weights(
self, models: dict[tuple[int, int, float], BasePipeline]
) -> list[tuple[float, BasePipeline]]:
"""Return a list of (weight, model) pairs for the ensemble that is
best for the 1st metric.
Parameters
----------
models : dict {identifier : model object}
The identifiers are the same as the one presented to the fit()
method. Models can be used for nice printing.
Returns
-------
list[tuple[float, BasePipeline]]
"""
return self.pareto_set[0].get_models_with_weights(models)
    def get_identifiers_with_weights(
self,
) -> list[tuple[tuple[int, int, float], float]]:
"""Return a (identifier, weight)-pairs for all models that were passed to the
ensemble builder based on the ensemble that is best for the 1st metric.
Parameters
----------
models : dict {identifier : model object}
The identifiers are the same as the one presented to the fit()
method. Models can be used for nice printing.
Returns
-------
list[tuple[tuple[int, int, float], float]
"""
return self.pareto_set[0].get_identifiers_with_weights()
    def get_selected_model_identifiers(self) -> list[tuple[int, int, float]]:
"""Return identifiers of models in the ensemble that is best for the 1st metric.
This includes models which have a weight of zero!
Returns
-------
list
"""
return self.pareto_set[0].get_selected_model_identifiers()
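

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of auto-sklearn): a minimal, self-contained
# demonstration of the selection logic used in `MultiObjectiveDummyEnsemble.fit`,
# using made-up loss values for three hypothetical base models evaluated on
# two loss-type metrics.
if __name__ == "__main__":
    example_costs = np.array(
        [
            [0.20, 0.50],  # model 0: a balanced trade-off
            [0.10, 0.90],  # model 1: best on the first metric
            [0.30, 0.60],  # model 2: dominated by model 0 on both metrics
        ]
    )

    # Boolean mask of non-dominated models, exactly as computed in `fit`.
    efficient = pareto_front(example_costs, is_loss=True)

    # Order the Pareto-optimal models by their loss on the first metric,
    # mirroring the ordering of `pareto_set_`.
    order = np.argsort(example_costs[:, 0])
    selected = [int(idx) for idx in order if efficient[idx]]
    print(selected)  # expected output: [1, 0]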