from __future__ import annotations
from typing import Any
import numpy as np
from ConfigSpace import ConfigurationSpace
from pyrfr import regression
from pyrfr.regression import binary_rss_forest as BinaryForest
from pyrfr.regression import default_data_container as DataContainer
from smac.constants import N_TREES, VERY_SMALL_NUMBER
from smac.model.random_forest import AbstractRandomForest
__copyright__ = "Copyright 2022, automl.org"
__license__ = "3-clause BSD"
class RandomForest(AbstractRandomForest):
    """Random forest that takes instance features into account.

    Parameters
    ----------
    configspace : ConfigurationSpace
        The configuration space; forwarded unchanged to the parent class.
    n_trees : int, defaults to `N_TREES`
        The number of trees in the random forest.
    n_points_per_tree : int, defaults to -1
        Number of points per tree. If the value is smaller than or equal to 0, the number of samples will be
        used.
    ratio_features : float, defaults to 5.0 / 6.0
        The ratio of features that are considered for splitting.
    min_samples_split : int, defaults to 3
        The minimum number of data points to perform a split.
    min_samples_leaf : int, defaults to 3
        The minimum number of data points in a leaf.
    max_depth : int, defaults to 2**20
        The maximum depth of a single tree.
    eps_purity : float, defaults to 1e-8
        The minimum difference between two target values to be considered.
    max_nodes : int, defaults to 2**20
        The maximum total number of nodes in a tree.
    bootstrapping : bool, defaults to True
        Enables bootstrapping.
    log_y : bool, defaults to False
        The y values (passed to this random forest) are expected to be log(y) transformed.
        This will be considered during predicting.
    instance_features : dict[str, list[int | float]] | None, defaults to None
        Features (list of int or floats) of the instances (str). The features are incorporated into the X data,
        on which the model is trained on.
    pca_components : int | None, defaults to 7
        Number of components to keep when using PCA to reduce dimensionality of instance features.
    seed : int, defaults to 0
        Seed forwarded to the parent class and used to initialise the pyrfr random engine.
    """

    def __init__(
        self,
        configspace: ConfigurationSpace,
        n_trees: int = N_TREES,
        n_points_per_tree: int = -1,
        ratio_features: float = 5.0 / 6.0,
        min_samples_split: int = 3,
        min_samples_leaf: int = 3,
        max_depth: int = 2**20,
        eps_purity: float = 1e-8,
        max_nodes: int = 2**20,
        bootstrapping: bool = True,
        log_y: bool = False,
        instance_features: dict[str, list[int | float]] | None = None,
        pca_components: int | None = 7,
        seed: int = 0,
    ) -> None:
        super().__init__(
            configspace=configspace,
            instance_features=instance_features,
            pca_components=pca_components,
            seed=seed,
        )

        # A ratio above 1.0 is translated to max_features = 0; otherwise at least one feature is kept.
        # NOTE(review): presumably pyrfr interprets 0 as "use all features" - confirm against pyrfr.
        max_features = 0 if ratio_features > 1.0 else max(1, int(len(self._types) * ratio_features))

        self._rf_opts = regression.forest_opts()
        self._rf_opts.num_trees = n_trees
        self._rf_opts.do_bootstrapping = bootstrapping
        self._rf_opts.tree_opts.max_features = max_features
        self._rf_opts.tree_opts.min_samples_to_split = min_samples_split
        self._rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
        self._rf_opts.tree_opts.max_depth = max_depth
        self._rf_opts.tree_opts.epsilon_purity = eps_purity
        self._rf_opts.tree_opts.max_num_nodes = max_nodes
        self._rf_opts.compute_law_of_total_variance = False

        # The forest itself is only created in `_train`.
        self._rf: BinaryForest | None = None
        self._log_y = log_y
        self._rng = regression.default_random_engine(seed)

        # Keep the raw hyperparameters so that `meta` can report them.
        self._n_trees = n_trees
        self._n_points_per_tree = n_points_per_tree
        self._ratio_features = ratio_features
        self._min_samples_split = min_samples_split
        self._min_samples_leaf = min_samples_leaf
        self._max_depth = max_depth
        self._eps_purity = eps_purity
        self._max_nodes = max_nodes
        self._bootstrapping = bootstrapping

    @property
    def meta(self) -> dict[str, Any]:
        """Return the parent's meta data extended by the hyperparameters of this forest."""
        meta = super().meta
        meta.update(
            {
                "n_trees": self._n_trees,
                "n_points_per_tree": self._n_points_per_tree,
                "ratio_features": self._ratio_features,
                "min_samples_split": self._min_samples_split,
                "min_samples_leaf": self._min_samples_leaf,
                "max_depth": self._max_depth,
                "eps_purity": self._eps_purity,
                "max_nodes": self._max_nodes,
                "bootstrapping": self._bootstrapping,
                "pca_components": self._pca_components,
            }
        )

        return meta

    def _train(self, X: np.ndarray, y: np.ndarray) -> RandomForest:
        """Fit a fresh pyrfr forest on `X` and the flattened `y` and return `self`."""
        X = self._impute_inactive(X)
        y = y.flatten()

        # Non-positive values mean "use every sample for each tree".
        if self._n_points_per_tree <= 0:
            self._rf_opts.num_data_points_per_tree = X.shape[0]
        else:
            self._rf_opts.num_data_points_per_tree = self._n_points_per_tree

        self._rf = regression.binary_rss_forest()
        self._rf.options = self._rf_opts

        data = self._init_data_container(X, y)
        self._rf.fit(data, rng=self._rng)

        return self

    def _init_data_container(self, X: np.ndarray, y: np.ndarray) -> DataContainer:
        """Fills a pyrfr default data container s.t. the forest knows categoricals and bounds for continuous data.

        Parameters
        ----------
        X : np.ndarray [#samples, #hyperparameter + #features]
            Input data points.
        y : np.ndarray [#samples, #objectives]
            The corresponding target values.

        Returns
        -------
        data : DataContainer
            The filled data container that pyrfr can interpret.
        """
        data = regression.default_data_container(X.shape[1])

        # A NaN upper bound marks the entry as a feature type; otherwise (mn, mx) are numeric bounds.
        for i, (mn, mx) in enumerate(self._bounds):
            if np.isnan(mx):
                data.set_type_of_feature(i, mn)
            else:
                data.set_bounds_of_feature(i, mn, mx)

        for row_X, row_y in zip(X, y):
            data.add_data_point(row_X, row_y)

        return data

    def _predict(
        self,
        X: np.ndarray,
        covariance_type: str | None = "diagonal",
    ) -> tuple[np.ndarray, np.ndarray | None]:
        """Predict mean and variance for every row of `X`.

        Parameters
        ----------
        X : np.ndarray [#samples, len(self._types)]
            Input data points.
        covariance_type : str | None, defaults to "diagonal"
            Only "diagonal" is accepted by this model.

        Returns
        -------
        means : np.ndarray [#samples, 1]
            The predictive mean.
        vars : np.ndarray [#samples, 1]
            The predictive variance.

        Raises
        ------
        ValueError
            If `X` is not two-dimensional, has the wrong number of columns, or `covariance_type` is not
            "diagonal".
        """
        if len(X.shape) != 2:
            raise ValueError("Expected 2d array, got %dd array!" % len(X.shape))

        if X.shape[1] != len(self._types):
            raise ValueError("Rows in X should have %d entries but have %d!" % (len(self._types), X.shape[1]))

        if covariance_type != "diagonal":
            raise ValueError("`covariance_type` can only take `diagonal` for this model.")

        assert self._rf is not None
        X = self._impute_inactive(X)

        if self._log_y:
            all_preds = []
            third_dimension = 0

            # Gather data in a list of 2d arrays and get statistics about the required size of the 3d array
            for row_X in X:
                preds_per_tree = self._rf.all_leaf_values(row_X)
                all_preds.append(preds_per_tree)
                max_num_leaf_data = max(map(len, preds_per_tree))
                third_dimension = max(max_num_leaf_data, third_dimension)

            # Transform list of 2d arrays into a 3d array. The array is padded with NaN so that
            # `np.nanmean` ignores the unused slots. `np.nan` is used because the `np.NaN` alias
            # no longer exists in NumPy 2.0.
            preds_as_array = np.full((X.shape[0], self._rf_opts.num_trees, third_dimension), np.nan)
            for i, preds_per_tree in enumerate(all_preds):
                for j, pred in enumerate(preds_per_tree):
                    preds_as_array[i, j, : len(pred)] = pred

            # Average per tree in the original (exponentiated) space, then go back to log space
            preds_as_array = np.log(np.nanmean(np.exp(preds_as_array), axis=2) + VERY_SMALL_NUMBER)

            # Compute the mean and the variance across the different trees
            means = preds_as_array.mean(axis=1)
            vars_ = preds_as_array.var(axis=1)
        else:
            means, vars_ = [], []
            for row_X in X:
                mean_, var = self._rf.predict_mean_var(row_X)
                means.append(mean_)
                vars_.append(var)

            means = np.array(means)
            vars_ = np.array(vars_)

        return means.reshape((-1, 1)), vars_.reshape((-1, 1))

    def predict_marginalized(self, X: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """Predicts mean and variance marginalized over all instances.

        Note
        ----
        The method is random forest specific and follows the SMAC2 implementation. It requires
        no distribution assumption to marginalize the uncertainty estimates.

        Parameters
        ----------
        X : np.ndarray [#samples, #hyperparameter + #features]
            Input data points.

        Returns
        -------
        means : np.ndarray [#samples, 1]
            The predictive mean.
        vars : np.ndarray [#samples, 1]
            The predictive variance.

        Raises
        ------
        ValueError
            If instance features are in use and `X` is not two-dimensional or its number of columns differs
            from `len(self._bounds)`.
        """
        # Without instance features there is nothing to marginalize over: fall back to `predict`
        # and only clamp the variance from below.
        if self._n_features == 0:
            mean_, var = self.predict(X)
            assert var is not None

            var[var < self._var_threshold] = self._var_threshold
            var[np.isnan(var)] = self._var_threshold

            return mean_, var

        assert self._instance_features is not None

        if len(X.shape) != 2:
            raise ValueError("Expected 2d array, got %dd array!" % len(X.shape))

        # NOTE(review): every row is extended with instance features below, yet its width is validated
        # against len(self._bounds) - confirm whether `_bounds` is meant to include instance features.
        if X.shape[1] != len(self._bounds):
            raise ValueError("Rows in X should have %d entries but have %d!" % (len(self._bounds), X.shape[1]))

        assert self._rf is not None
        X = self._impute_inactive(X)

        n_trees = self._rf_opts.num_trees
        dat_ = np.zeros((X.shape[0], n_trees))  # Marginalized predictions for each tree

        for i, x in enumerate(X):
            # Marginalize over instances
            # 1. get all leaf values for each tree
            preds_trees: list[list[float]] = [[] for _ in range(n_trees)]

            for feat in self._instance_features.values():
                # Append the features of this instance to the row before querying the forest
                x_ = np.concatenate([x, feat])
                preds_per_tree = self._rf.all_leaf_values(x_)
                for tree_id, preds in enumerate(preds_per_tree):
                    preds_trees[tree_id].extend(preds)

            # 2. average in each tree
            if self._log_y:
                for tree_id in range(n_trees):
                    dat_[i, tree_id] = np.log(np.exp(np.array(preds_trees[tree_id])).mean())
            else:
                for tree_id in range(n_trees):
                    dat_[i, tree_id] = np.array(preds_trees[tree_id]).mean()

        # 3. compute statistics across trees
        mean_ = dat_.mean(axis=1)
        var = dat_.var(axis=1)

        var[var < self._var_threshold] = self._var_threshold

        # Both statistics are 1d after reducing over the tree axis; callers expect column vectors.
        return mean_.reshape((-1, 1)), var.reshape((-1, 1))