# Source code for deepcave.evaluators.fanova

# Copyright 2021-2024 The DeepCAVE Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#  noqa: D400
"""
# fANOVA

This module provides a tool for assessing the importance of an algorithm's hyperparameters.

Its utilities retrieve the run data with respect to the budget and train the forest on the
encoded data.

## Classes
    - fANOVA: Calculate and provide midpoints and sizes.
"""

from typing import Any, Dict, List, Optional, Tuple, Union

import itertools as it

import numpy as np

from deepcave.constants import COMBINED_COST_NAME
from deepcave.evaluators.epm.fanova_forest import FanovaForest
from deepcave.runs import AbstractRun
from deepcave.runs.objective import Objective
from deepcave.utils.logs import get_logger



# [docs]
class fANOVA:
    """
    Train a fANOVA forest on a run and report hyperparameter importances.

    `calculate` fits a `FanovaForest` on the encoded run data; `get_importances` then
    turns the per-tree marginal variances returned by `FanovaForest.compute_marginals`
    into fractions of each tree's total variance.

    Properties
    ----------
    run : AbstractRun
        The Abstract Run used for the calculation.
    cs : ConfigurationSpace
        The configuration space of the run.
    hps : List[Hyperparameters]
        The Hyperparameters of the configuration space.
    hp_names : List[str]
        The corresponding names of the Hyperparameters.
    n_trees : int
        The number of trees. Only set once `calculate` has been called.
    """

    def __init__(self, run: AbstractRun):
        """
        Bind the evaluator to a run and cache its configuration space.

        Parameters
        ----------
        run : AbstractRun
            The run to analyse. Its `configspace` must not be None.

        Raises
        ------
        RuntimeError
            If the run has no configuration space.
        """
        configspace = run.configspace
        if configspace is None:
            raise RuntimeError("The run needs to be initialized.")

        self.run = run
        self.cs = configspace
        self.hps = list(configspace.values())
        self.hp_names = list(configspace.keys())
        self.logger = get_logger(self.__class__.__name__)

    def calculate(
        self,
        objectives: Optional[Union[Objective, List[Objective]]] = None,
        budget: Optional[Union[int, float]] = None,
        n_trees: int = 16,
        seed: int = 0,
    ) -> None:
        """
        Get the data with respect to budget and train the forest on the encoded data.

        Note
        ----
        Right now, only `n_trees` is used. It can be further specified if needed.

        Parameters
        ----------
        objectives : Optional[Union[Objective, List[Objective]]], optional
            Considered objectives. By default None. If None, all objectives are considered.
        budget : Optional[Union[int, float]], optional
            Considered budget. By default None. If None, the highest budget is chosen.
        n_trees : int, optional
            How many trees should be used. By default 16.
        seed : int
            Random seed. By default 0.
        """
        if objectives is None:
            objectives = self.run.get_objectives()

        if budget is None:
            budget = self.run.get_highest_budget()

        # Remembered so that `get_importances` knows how many trees to inspect.
        self.n_trees = n_trees

        # Encoded data including the combined cost column of all selected objectives.
        df = self.run.get_encoded_data(
            objectives, budget, specific=True, include_combined_cost=True
        )
        X = df[self.hp_names].to_numpy()
        Y = df[COMBINED_COST_NAME].to_numpy()

        # Build the forest and fit it on the encoded data.
        self._model = FanovaForest(self.cs, n_trees=n_trees, seed=seed)
        self._model.train(X, Y)

    def get_importances(
        self, hp_names: Optional[List[str]] = None, depth: int = 1, sort: bool = True
    ) -> Dict[Union[str, Tuple[str, ...]], Tuple[float, float, float, float]]:
        """
        Return the importance scores from the passed Hyperparameter names.

        Warning
        -------
        Using a depth higher than 1 might take much longer.

        Parameters
        ----------
        hp_names : Optional[List[str]]
            Selected Hyperparameter names to get the importance scores from. If None, all
            Hyperparameters of the configuration space are used.
        depth : int, optional
            Maximum size of the Hyperparameter subsets that are evaluated. By default 1.
        sort : bool, optional
            Whether the entries should be sorted by their mean total importance
            (ascending). By default True.

        Returns
        -------
        Dict[Union[str, Tuple[str, ...]], Tuple[float, float, float, float]]
            Dictionary with Hyperparameter names and the corresponding importance scores.
            Single Hyperparameters are keyed by their name, subsets by a tuple of names.
            The values are tuples of the form (mean individual, var individual, mean total,
            var total). Mean and variance are taken over all trees with non-zero total
            variance. If every tree has zero total variance, a warning is logged and
            every value is a tuple of NaNs.
            NOTE(review): individual and total are presumably identical for depth 1;
            confirm against `FanovaForest.compute_marginals`.

        Raises
        ------
        RuntimeError
            If `calculate` has not been called yet, i.e. no forest has been trained.
        """
        model = getattr(self, "_model", None)
        if model is None:
            raise RuntimeError("The forest needs to be trained first. Call `calculate` before.")

        if hp_names is None:
            hp_names = list(self.hp_names)

        hp_ids = [self.cs.index_of[hp_name] for hp_name in hp_names]

        # Calculate the marginals
        vu_individual, vu_total = model.compute_marginals(hp_ids, depth)

        # Trees with zero total variance cannot be normalised; they are identical for
        # every subset, so they are filtered once up front.
        total_variances = [model.trees_total_variance[t] for t in range(self.n_trees)]
        valid_trees = np.nonzero(total_variances)[0]
        has_valid_trees = len(valid_trees) > 0
        if not has_valid_trees:
            self.logger.warning("Encountered zero total variance in all trees.")

        importances: Dict[Tuple[Any, ...], Tuple[float, float, float, float]] = {}
        for k in range(1, min(depth, len(hp_ids)) + 1):
            for sub_hp_ids in it.combinations(hp_ids, k):
                if not has_valid_trees:
                    importances[sub_hp_ids] = (np.nan, np.nan, np.nan, np.nan)
                    continue

                fractions_individual = np.array(
                    [vu_individual[sub_hp_ids][t] / total_variances[t] for t in valid_trees]
                )
                fractions_total = np.array(
                    [vu_total[sub_hp_ids][t] / total_variances[t] for t in valid_trees]
                )

                importances[sub_hp_ids] = (
                    np.mean(fractions_individual),
                    np.var(fractions_individual),
                    np.mean(fractions_total),
                    np.var(fractions_total),
                )

        # Sort by total mean fraction (stable, ascending)
        ordered = list(importances.items())
        if sort:
            ordered.sort(key=lambda item: item[1][2])

        # The ids get replaced with hyperparameter names again
        all_hp_names = list(self.cs.keys())
        importances_: Dict[Union[str, Tuple[str, ...]], Tuple[float, float, float, float]] = {}
        for sub_hp_ids, values in ordered:
            names = [all_hp_names[hp_id] for hp_id in sub_hp_ids]
            hp_names_key: Union[Tuple[str, ...], str]
            if len(names) == 1:
                hp_names_key = names[0]
            else:
                hp_names_key = tuple(names)
            importances_[hp_names_key] = values

        return importances_
# DeepCAVE Documentation

# Source code for deepcave.evaluators.fanova