Source code for deepcave.evaluators.epm.fanova_forest

# Copyright 2021-2024 The DeepCAVE Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# noqa: D400
"""
# FanovaForest

The module provides utilities for creating a fANOVA forest.

It includes a FanovaForest wrapper for pyrfr.
fANOVA can be used for analyzing the importance of Hyperparameters.

## Classes
    - FanovaForest: A fANOVA forest wrapper for pyrfr.
"""

from typing import Any, Dict, List, Optional, Tuple, Union

import itertools as it

import numpy as np
import pyrfr
import pyrfr.regression as regression
import pyrfr.util
from ConfigSpace import ConfigurationSpace

from deepcave.evaluators.epm.random_forest import RandomForest
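
# fANOVA (Hutter et al., "An Efficient Approach for Assessing Hyperparameter
# Importance", ICML 2014) decomposes the variance V of the predicted
# performance into additive contributions of Hyperparameter subsets U:
#
#     V = sum_U V_U,    where    V_U = Var[f_U] - sum_{W proper subset of U} V_W
#
# `FanovaForest.compute_marginals` below estimates Var[f_U] per tree
# (`V_U_total`) and subtracts the lower-order terms to obtain `V_U_individual`.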


class FanovaForest(RandomForest):
    """
    A fANOVA forest wrapper for pyrfr.

    Properties
    ----------
    cutoffs : Tuple[float, float]
        The cutoffs of the model.
    percentiles : NDArray[floating]
        The percentiles of the data points Y.
    all_midpoints : List
        All midpoints tree wise for the whole forest.
    all_sizes : List
        All interval sizes tree wise for the whole forest.
    bounds : List[Tuple[float, float]]
        Stores feature bounds.
    trees_total_variances : List
        The total variances of the trees.
    trees_total_variance : Any
        The total variance of a tree.
    trees_variance_fractions : Dict
        The variance fractions of the trees.
    V_U_total : Dict[Tuple[int, ...], List[Any]]
        Stores variance-related information across all trees.
    V_U_individual : Dict[Tuple[int, ...], List[Any]]
        Stores variance-related information for individual subsets.
    n_params : int
        The number of Hyperparameters to sample.
    """

    def __init__(
        self,
        configspace: ConfigurationSpace,
        n_trees: int = 10,
        ratio_features: float = 1.0,
        min_samples_split: int = 0,
        min_samples_leaf: int = 0,
        max_depth: int = 64,
        max_nodes: int = 2**20,
        eps_purity: float = 1e-8,
        bootstrapping: bool = True,
        instance_features: Optional[np.ndarray] = None,
        pca_components: Optional[int] = 2,
        cutoffs: Tuple[float, float] = (-np.inf, np.inf),
        seed: int = 0,
    ):
        super().__init__(
            configspace=configspace,
            n_trees=n_trees,
            ratio_features=ratio_features,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_depth=max_depth,
            max_nodes=max_nodes,
            eps_purity=eps_purity,
            bootstrapping=bootstrapping,
            instance_features=instance_features,
            pca_components=pca_components,
            log_y=False,
            seed=seed,
        )

        self.cutoffs = cutoffs

    def _get_model(self) -> regression.base_tree:
        """
        Return the internal model.

        Returns
        -------
        model : regression.base_tree
            Model which is used internally.
        """
        return regression.fanova_forest()

    def _train(self, X: np.ndarray, Y: np.ndarray) -> None:
        """
        Train the Random Forest on X and Y.

        Parameters
        ----------
        X : np.ndarray
            Input data points.
        Y : np.ndarray
            Target values.
        """
        super()._train(X, Y)
        self.percentiles = np.percentile(Y, range(0, 100))

        # All midpoints and interval sizes tree wise for the whole forest
        self.all_midpoints = []
        self.all_sizes = []

        # Getting split values
        forest_split_values = self._model.all_split_values()

        # Compute midpoints and interval sizes for variables in each tree
        for tree_split_values in forest_split_values:
            sizes: List = []
            midpoints: List = []
            for i, split_vals in enumerate(tree_split_values):
                if np.isnan(self.bounds[i][1]):  # categorical parameter
                    # Check if the tree actually splits on this parameter
                    if len(split_vals) > 0:
                        midpoints.append(split_vals)
                        sizes.append(np.ones(len(split_vals)))
                    # If not, simply append 0 as the value with the number of categories
                    # as the size; that way this parameter will get 0 importance from
                    # this tree.
                    else:
                        midpoints.append((0,))
                        sizes.append((self.bounds[i][0],))
                else:
                    # Add bounds to split values
                    sv = np.array([self.bounds[i][0]] + list(split_vals) + [self.bounds[i][1]])
                    # Compute midpoints and sizes
                    midpoints.append((1 / 2) * (sv[1:] + sv[:-1]))
                    sizes.append(sv[1:] - sv[:-1])

            self.all_midpoints.append(midpoints)
            self.all_sizes.append(sizes)

        # Capital V in the paper
        self.trees_total_variances: list = []

        # Dict of lists where the keys are tuples of the dimensions
        # and the value list contains \hat{f}_U for the individual trees.
        # Reset all the variance fractions computed.
        self.trees_variance_fractions: dict = {}
        self.V_U_total: Dict[Tuple[int, ...], List[Any]] = {}
        self.V_U_individual: Dict[Tuple[int, ...], List[Any]] = {}

        # Set cut-off
        self._model.set_cutoffs(self.cutoffs[0], self.cutoffs[1])

        # Recompute the trees' total variance
        self.trees_total_variance = self._model.get_trees_total_variances()
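
    # Illustrative example of the midpoint/size computation in `_train` above
    # (values chosen for demonstration): a continuous Hyperparameter with
    # bounds (0.0, 1.0) and tree split values [0.3, 0.7] yields
    # sv = [0.0, 0.3, 0.7, 1.0], midpoints [0.15, 0.5, 0.85], and interval
    # sizes [0.3, 0.4, 0.3]. Each midpoint prediction is later weighted by its
    # interval size in `compute_marginals`.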
    def compute_marginals(
        self, hp_ids: Union[List[int], Tuple[int, ...]], depth: int = 1
    ) -> Tuple[Dict[Tuple[int, ...], List[Any]], Dict[Tuple[int, ...], List[Any]]]:
        """
        Return the marginal of selected Hyperparameters.

        Parameters
        ----------
        hp_ids : Union[List[int], Tuple[int, ...]]
            Contains the indices of the configspace for the selected Hyperparameters
            (starts with 0).
        depth : int
            The depth of the marginalization. Default value is 1.

        Returns
        -------
        Tuple[Dict[Tuple[int, ...], List[Any]], Dict[Tuple[int, ...], List[Any]]]
            The marginal of selected Hyperparameters.
        """
        if not isinstance(hp_ids, tuple):
            hp_ids = tuple(hp_ids)

        # Check if the values have been previously computed
        if hp_ids in self.V_U_individual:
            return self.V_U_individual, self.V_U_total

        # Otherwise make sure all lower-order marginals have been computed;
        # if not, compute them
        for k in range(1, len(hp_ids)):
            if k > depth:
                break

            for sub_hp_ids in it.combinations(hp_ids, k):
                if sub_hp_ids not in self.V_U_total:
                    self.compute_marginals(sub_hp_ids)

        # Now all lower-order terms have been computed
        self.V_U_individual[hp_ids] = []
        self.V_U_total[hp_ids] = []

        if len(hp_ids) > depth + 1:
            return self.V_U_individual, self.V_U_total

        for tree_idx in range(len(self.all_midpoints)):
            # Collect all the midpoints and corresponding sizes for that tree
            midpoints = [self.all_midpoints[tree_idx][hp_id] for hp_id in hp_ids]
            sizes = [self.all_sizes[tree_idx][hp_id] for hp_id in hp_ids]
            stat = pyrfr.util.weighted_running_stats()

            prod_midpoints = it.product(*midpoints)
            prod_sizes = it.product(*sizes)

            sample: np.ndarray = np.full(self.n_params, np.nan, dtype=float)

            # Make a prediction for all midpoints and weigh them by the corresponding size
            for i, (m, s) in enumerate(zip(prod_midpoints, prod_sizes)):
                sample[list(hp_ids)] = list(m)
                ls = self._model.marginal_prediction_stat_of_tree(tree_idx, sample.tolist())
                # self.logger.debug("%s, %s", (sample, ls.mean()))

                if not np.isnan(ls.mean()):
                    stat.push(ls.mean(), np.prod(np.array(s)) * ls.sum_of_weights())

            # Line 10 in algorithm 2.
            # Note that V_U^2 can be computed by var(\hat a)^2 - \sum_{subU} var(f_subU)^2,
            # which is why \hat{f} is never computed in the code but
            # appears in the pseudo code.
            V_U_total = np.nan
            V_U_individual = np.nan

            if stat.sum_of_weights() > 0:
                V_U_total = stat.variance_population()
                V_U_individual = stat.variance_population()

                # Subtract the contributions of all lower-order subsets
                for k in range(1, len(hp_ids)):
                    if k > depth:
                        break

                    for sub_dims in it.combinations(hp_ids, k):
                        V_U_individual -= self.V_U_individual[sub_dims][tree_idx]

                V_U_individual = np.clip(V_U_individual, 0, np.inf)

            self.V_U_individual[hp_ids].append(V_U_individual)
            self.V_U_total[hp_ids].append(V_U_total)

        return self.V_U_individual, self.V_U_total
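
# Minimal usage sketch (illustrative, not part of the library). It assumes the
# public `train` method inherited from `RandomForest` and uses a hypothetical
# two-Hyperparameter space with synthetic data; the importance fraction at the
# end follows the per-tree variance decomposition computed above.
if __name__ == "__main__":
    cs = ConfigurationSpace({"x": (0.0, 1.0), "y": (0.0, 1.0)})

    rng = np.random.default_rng(0)
    X = rng.random((200, 2))  # Encoded configurations
    Y = X[:, 0] ** 2 + 0.1 * X[:, 1]  # Synthetic performance values

    forest = FanovaForest(cs, n_trees=8, seed=0)
    forest.train(X, Y)

    # Per-tree individual and total variance contributions of Hyperparameter 0
    V_U_individual, V_U_total = forest.compute_marginals((0,))
    importance = np.mean(
        np.array(V_U_individual[(0,)]) / np.array(forest.trees_total_variance)
    )
    print(f"Importance of 'x': {importance:.3f}")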