Source code for deepcave.evaluators.mo_fanova

# Copyright 2021-2024 The DeepCAVE Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#  noqa: D400
"""
# Multi-Objective fANOVA

This module provides a tool for assessing the importance of an algorithm's hyperparameters.

Utilities provide calculation of the data with respect to the budget and training of the
forest on the encoded data.

## Classes
    - MOfANOVA: Multi-objective fANOVA. Calculate and provide midpoints and sizes.
"""

from typing import List, Optional, Union

import pandas as pd

from deepcave.evaluators.epm.fanova_forest import FanovaForest
from deepcave.evaluators.fanova import fANOVA
from deepcave.runs import AbstractRun
from deepcave.runs.objective import Objective
from deepcave.utils.multi_objective_importance import get_weightings


class MOfANOVA(fANOVA):
    """
    Multi-Objective fANOVA.

    Calculate and provide midpoints and sizes from the forest's split values in order to get
    the marginals.
    Overridden to train the random forest with an arbitrary weighting of the objectives
    (multi-objective case).
    """

    def __init__(self, run: AbstractRun):
        if run.configspace is None:
            raise RuntimeError("The run needs to be initialized.")
        super().__init__(run)
        self.importances_ = None
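    # Note: `importances_` is populated by `calculate` below as a pandas DataFrame
    # with the columns "importance", "variance", "hp_name", and "weight".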
    def calculate(
        self,
        objectives: Optional[Union[Objective, List[Objective]]] = None,
        budget: Optional[Union[int, float]] = None,
        n_trees: int = 100,
        seed: int = 0,
    ) -> None:
        """
        Get the data with respect to budget and train the forest on the encoded data.

        Calculates weighted fANOVA for multiple objectives.

        Note
        ----
        Right now, only `n_trees` is used. It can be further specified if needed.

        Parameters
        ----------
        objectives : Optional[Union[Objective, List[Objective]]], optional
            Considered objectives. By default None. If None, all objectives are considered.
        budget : Optional[Union[int, float]], optional
            Considered budget. By default None. If None, the highest budget is chosen.
        n_trees : int, optional
            How many trees should be used. By default 100.
        seed : int
            Random seed. By default 0.
        """
        if objectives is None:
            objectives = self.run.get_objectives()

        if budget is None:
            budget = self.run.get_highest_budget()

        self.n_trees = n_trees

        # Get data
        df = self.run.get_encoded_data(
            objectives, budget, specific=True, include_combined_cost=True
        )

        # Normalize objectives
        assert isinstance(objectives, list)
        objectives_normed = list()
        for obj in objectives:
            normed = obj.name + "_normed"
            df[normed] = (df[obj.name] - df[obj.name].min()) / (
                df[obj.name].max() - df[obj.name].min()
            )
            if obj.optimize == "upper":
                df[normed] = 1 - df[normed]
            objectives_normed.append(normed)
        df = df.dropna(subset=objectives_normed)
        X = df[self.hp_names].to_numpy()

        weightings = get_weightings(objectives_normed, df)
        df_all = pd.DataFrame([])

        # Calculate importance for each weighting generated from the Pareto-efficient points
        for w in weightings:
            Y = sum(
                df[obj] * weighting for obj, weighting in zip(objectives_normed, w)
            ).to_numpy()

            self._model = FanovaForest(self.cs, n_trees=n_trees, seed=seed)
            self._model.train(X, Y)
            df_res = (
                pd.DataFrame(super(MOfANOVA, self).get_importances(hp_names=None))
                .loc[0:1]
                .T.reset_index()
            )
            df_res["weight"] = w[0]
            df_all = pd.concat([df_all, df_res])
        self.importances_ = df_all.rename(
            columns={0: "importance", 1: "variance", "index": "hp_name"}
        ).reset_index(drop=True)
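    # Worked example for the normalization and weighting above: raw objective
    # values [0.2, 0.5, 1.0] are min-max normalized to [0.0, 0.375, 1.0]; for an
    # objective with optimize == "upper" the values are flipped via 1 - x, so
    # lower is always better. For two objectives, each weighting w sums to 1 and
    # the scalarized target is Y = w[0] * obj1_normed + w[1] * obj2_normed.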
    def get_importances_(self, hp_names: Optional[List[str]] = None) -> str:
        """
        Return the importance scores for the passed hyperparameter names.

        Parameters
        ----------
        hp_names : Optional[List[str]]
            Selected hyperparameter names to get the importance scores from. If None, all
            hyperparameters of the configuration space are used.

        Returns
        -------
        str
            JSON string with hyperparameter names and the corresponding importance scores
            and variances.

        Raises
        ------
        RuntimeError
            If the importance scores have not been calculated.
        """
        if self.importances_ is None:
            raise RuntimeError("Importance scores must be calculated first.")
        if hp_names:
            return self.importances_.loc[self.importances_["hp_name"].isin(hp_names)].to_json()
        else:
            return self.importances_.to_json()
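A minimal usage sketch follows; it is illustrative and not part of the module. It assumes
an existing DeepCAVE run on disk, loaded here with the `DeepCAVERun` converter; the run
path is hypothetical, and the small `n_trees` value is chosen only to keep the example fast.

if __name__ == "__main__":
    from pathlib import Path

    from deepcave.runs.converters.deepcave import DeepCAVERun

    # Hypothetical run directory; point this at a real DeepCAVE run.
    run = DeepCAVERun.from_path(Path("logs/DeepCAVE/example/run_1"))

    evaluator = MOfANOVA(run)
    # Trains one fANOVA forest per weighting of the normalized objectives.
    evaluator.calculate(objectives=run.get_objectives(), n_trees=16, seed=0)

    # JSON string with importance, variance, hp_name, and the weight of the
    # first objective for each (hyperparameter, weighting) pair.
    print(evaluator.get_importances_())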