# Copyright 2021-2024 The DeepCAVE Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# noqa: D400
"""
# FanovaForest
This module provides utilities for creating a fANOVA forest.
It includes a FanovaForest wrapper for pyrfr.
fANOVA can be used to analyze the importance of hyperparameters.
## Classes
- FanovaForest: A fANOVA forest wrapper for pyrfr.
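
## Example
A minimal usage sketch; `configspace`, `X` and `Y` are placeholders, and
`train` is the method inherited from `RandomForest`:

    forest = FanovaForest(configspace, n_trees=16, seed=0)
    forest.train(X, Y)
    V_U_individual, V_U_total = forest.compute_marginals(hp_ids=(0,))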
"""
from typing import Any, Dict, List, Optional, Tuple, Union
import itertools as it
import numpy as np
import pyrfr
import pyrfr.regression as regression
import pyrfr.util
from ConfigSpace import ConfigurationSpace
from deepcave.evaluators.epm.random_forest import RandomForest
class FanovaForest(RandomForest):
"""
A fANOVA forest wrapper for pyrfr.
Properties
----------
cutoffs : Tuple[float, float]
The cutoffs of the model.
percentiles : NDArray[floating]
The percentiles of the data points Y.
all_midpoints : List
All midpoints tree wise for the whole forest.
all_sizes : List
All interval sizes tree wise for the whole forest.
    bounds : List[Tuple[float, float]]
        Stores the feature bounds.
    trees_total_variances : List
        The total variances of the trees (capital V in the paper).
    trees_total_variance : Any
        The per-tree total variances as returned by pyrfr.
    trees_variance_fractions : Dict
        The variance fractions of the trees.
    V_U_total : Dict[Tuple[int, ...], List[Any]]
        Per-tree variances for each hyperparameter subset, including
        lower-order contributions.
    V_U_individual : Dict[Tuple[int, ...], List[Any]]
        Per-tree variances for each hyperparameter subset with the
        lower-order contributions subtracted.
    n_params : int
        The number of hyperparameters in the configuration space.
"""
def __init__(
self,
configspace: ConfigurationSpace,
n_trees: int = 10,
ratio_features: float = 1.0,
min_samples_split: int = 0,
min_samples_leaf: int = 0,
max_depth: int = 64,
max_nodes: int = 2**20,
eps_purity: float = 1e-8,
bootstrapping: bool = True,
instance_features: Optional[np.ndarray] = None,
pca_components: Optional[int] = 2,
cutoffs: Tuple[float, float] = (-np.inf, np.inf),
seed: int = 0,
):
super().__init__(
configspace=configspace,
n_trees=n_trees,
ratio_features=ratio_features,
min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf,
max_depth=max_depth,
max_nodes=max_nodes,
eps_purity=eps_purity,
bootstrapping=bootstrapping,
instance_features=instance_features,
pca_components=pca_components,
log_y=False,
seed=seed,
)
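        # The cutoffs bound the target values that are considered when the
        # variances are computed; they are passed to pyrfr via `set_cutoffs`
        # in `_train`.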
self.cutoffs = cutoffs
def _get_model(self) -> regression.base_tree:
"""
Return the internal model.
Returns
-------
model : regression.base_tree
Model which is used internally.
"""
return regression.fanova_forest()
def _train(self, X: np.ndarray, Y: np.ndarray) -> None:
"""
Train the Random Forest on X and Y.
Parameters
----------
X : np.ndarray
Input data points.
Y : np.ndarray
Target values.
"""
super()._train(X, Y)
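        # Keep the percentiles of Y so that quantile-based cutoffs can be
        # mapped to absolute target values (as in the original fANOVA
        # implementation).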
self.percentiles = np.percentile(Y, range(0, 100))
# all midpoints and interval sizes tree wise for the whole forest
self.all_midpoints = []
self.all_sizes = []
# getting split values
forest_split_values = self._model.all_split_values()
# compute midpoints and interval sizes for variables in each tree
for tree_split_values in forest_split_values:
sizes: List = []
midpoints: List = []
for i, split_vals in enumerate(tree_split_values):
if np.isnan(self.bounds[i][1]): # categorical parameter
# check if the tree actually splits on this parameter
if len(split_vals) > 0:
midpoints.append(split_vals)
sizes.append(np.ones(len(split_vals)))
                    # if not, simply append 0 as the value, with the number of
                    # categories as the size; that way this parameter will get
                    # 0 importance from this tree.
else:
midpoints.append((0,))
sizes.append((self.bounds[i][0],))
else:
# add bounds to split values
sv = np.array([self.bounds[i][0]] + list(split_vals) + [self.bounds[i][1]])
# compute midpoints and sizes
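                    # e.g. bounds (0, 1) with split values [0.3, 0.6] give
                    # sv = [0, 0.3, 0.6, 1], midpoints = [0.15, 0.45, 0.8]
                    # and sizes = [0.3, 0.3, 0.4]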
midpoints.append((1 / 2) * (sv[1:] + sv[:-1]))
sizes.append(sv[1:] - sv[:-1])
self.all_midpoints.append(midpoints)
self.all_sizes.append(sizes)
        # Capital V in the paper.
        self.trees_total_variances: list = []
        # Reset all previously computed variance fractions.
        self.trees_variance_fractions: dict = {}
        # Dicts of lists keyed by tuples of hyperparameter indices; the value
        # lists contain \hat{f}_U for the individual trees.
        self.V_U_total: Dict[Tuple[int, ...], List[Any]] = {}
        self.V_U_individual: Dict[Tuple[int, ...], List[Any]] = {}
# Set cut-off
self._model.set_cutoffs(self.cutoffs[0], self.cutoffs[1])
# recompute the trees' total variance
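        # (they serve as denominators when the V_U terms from
        # `compute_marginals` are turned into importance fractions)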
self.trees_total_variance = self._model.get_trees_total_variances()
def compute_marginals(
self, hp_ids: Union[List[int], Tuple[int, ...]], depth: int = 1
) -> Tuple[Dict[Tuple[int, ...], List[Any]], Dict[Tuple[int, ...], List[Any]],]:
"""
        Return the marginals of the selected hyperparameters.
Parameters
----------
        hp_ids : Union[List[int], Tuple[int, ...]]
            The indices of the selected hyperparameters in the configspace
            (starting at 0).
        depth : int
            The depth of the marginalization.
            Default is 1.
        Returns
        -------
        Tuple[Dict[Tuple[int, ...], List[Any]], Dict[Tuple[int, ...], List[Any]]]
            The marginals of the selected hyperparameters.
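
        Examples
        --------
        A sketch; the forest is assumed to be trained already and the
        hyperparameter indices are placeholders:

        >>> V_U_individual, V_U_total = forest.compute_marginals((0, 1), depth=2)
        >>> V_U_individual[(0, 1)]  # one variance contribution per tree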
"""
if not isinstance(hp_ids, tuple):
hp_ids = tuple(hp_ids)
        # check if the values have been computed previously
if hp_ids in self.V_U_individual:
return self.V_U_individual, self.V_U_total
        # otherwise make sure all lower-order marginals have been
        # computed; if not, compute them
for k in range(1, len(hp_ids)):
if k > depth:
break
for sub_hp_ids in it.combinations(hp_ids, k):
if sub_hp_ids not in self.V_U_total:
self.compute_marginals(sub_hp_ids)
# now all lower order terms have been computed
self.V_U_individual[hp_ids] = []
self.V_U_total[hp_ids] = []
if len(hp_ids) > depth + 1:
return self.V_U_individual, self.V_U_total
for tree_idx in range(len(self.all_midpoints)):
# collect all the midpoints and corresponding sizes for that tree
midpoints = [self.all_midpoints[tree_idx][hp_id] for hp_id in hp_ids]
sizes = [self.all_sizes[tree_idx][hp_id] for hp_id in hp_ids]
stat = pyrfr.util.weighted_running_stats()
prod_midpoints = it.product(*midpoints)
prod_sizes = it.product(*sizes)
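            # Dimensions left as NaN are marginalized over by pyrfr; only the
            # dimensions in `hp_ids` receive concrete midpoint values below.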
sample: np.ndarray = np.full(self.n_params, np.nan, dtype=float)
# make prediction for all midpoints and weigh them by the corresponding size
            for m, s in zip(prod_midpoints, prod_sizes):
sample[list(hp_ids)] = list(m)
ls = self._model.marginal_prediction_stat_of_tree(tree_idx, sample.tolist())
                # self.logger.debug("%s, %s", sample, ls.mean())
if not np.isnan(ls.mean()):
stat.push(ls.mean(), np.prod(np.array(s)) * ls.sum_of_weights())
            # Line 10 in Algorithm 2 of the fANOVA paper.
            # Note that V_U^2 can be computed as var(\hat{a})^2 - \sum_{subU} var(f_subU)^2,
            # which is why \hat{f} is never computed in the code even though it
            # appears in the pseudocode.
V_U_total = np.nan
V_U_individual = np.nan
if stat.sum_of_weights() > 0:
V_U_total = stat.variance_population()
V_U_individual = stat.variance_population()
for k in range(1, len(hp_ids)):
if k > depth:
break
for sub_dims in it.combinations(hp_ids, k):
V_U_individual -= self.V_U_individual[sub_dims][tree_idx]
V_U_individual = np.clip(V_U_individual, 0, np.inf)
self.V_U_individual[hp_ids].append(V_U_individual)
self.V_U_total[hp_ids].append(V_U_total)
return self.V_U_individual, self.V_U_total