Skip to content

Random forest pyrfr

smac.model.random_forest.pyrfr.random_forest_pyrfr #

PyrfrRandomForest #

PyrfrRandomForest(
    configspace: ConfigurationSpace,
    n_trees: int = N_TREES,
    n_points_per_tree: int = -1,
    ratio_features: float = 5.0 / 6.0,
    min_samples_split: int = 3,
    min_samples_leaf: int = 3,
    max_depth: int = 2**20,
    eps_purity: float = 1e-08,
    max_nodes: int = 2**20,
    bootstrapping: bool = True,
    log_y: bool = False,
    instance_features: (
        dict[str, list[int | float]] | None
    ) = None,
    pca_components: int | None = 7,
    seed: int = 0,
)

Bases: AbstractRandomForest

Random forest that takes instance features into account.

PARAMETER DESCRIPTION
n_trees

The number of trees in the random forest.

TYPE: int, defaults to `N_TREES` DEFAULT: N_TREES

n_points_per_tree

Number of points per tree. If the value is smaller than 0, the number of samples will be used.

TYPE: int, defaults to -1 DEFAULT: -1

ratio_features

The ratio of features that are considered for splitting.

TYPE: float, defaults to 5.0 / 6.0 DEFAULT: 5.0 / 6.0

min_samples_split

The minimum number of data points to perform a split.

TYPE: int, defaults to 3 DEFAULT: 3

min_samples_leaf

The minimum number of data points in a leaf.

TYPE: int, defaults to 3 DEFAULT: 3

max_depth

The maximum depth of a single tree.

TYPE: int, defaults to 2**20 DEFAULT: 2 ** 20

eps_purity

The minimum difference between two target values to be considered.

TYPE: float, defaults to 1e-8 DEFAULT: 1e-08

max_nodes

The maximum total number of nodes in a tree.

TYPE: int, defaults to 2**20 DEFAULT: 2 ** 20

bootstrapping

Enables bootstrapping.

TYPE: bool, defaults to True DEFAULT: True

log_y

The y values (passed to this random forest) are expected to be log(y) transformed. This will be considered during predicting.

TYPE: bool DEFAULT: False

instance_features

Features (list of int or floats) of the instances (str). The features are incorporated into the X data, on which the model is trained on.

TYPE: dict[str, list[int | float]] | None, defaults to None DEFAULT: None

pca_components

Number of components to keep when using PCA to reduce dimensionality of instance features.

TYPE: int | None, defaults to 7 DEFAULT: 7

seed

The seed used to initialize pyrfr's random engine.

TYPE: int DEFAULT: 0

Source code in smac/model/random_forest/pyrfr/random_forest_pyrfr.py
def __init__(
    self,
    configspace: ConfigurationSpace,
    n_trees: int = N_TREES,
    n_points_per_tree: int = -1,
    ratio_features: float = 5.0 / 6.0,
    min_samples_split: int = 3,
    min_samples_leaf: int = 3,
    max_depth: int = 2**20,
    eps_purity: float = 1e-8,
    max_nodes: int = 2**20,
    bootstrapping: bool = True,
    log_y: bool = False,
    instance_features: dict[str, list[int | float]] | None = None,
    pca_components: int | None = 7,
    seed: int = 0,
) -> None:
    """Random forest (pyrfr backend) that takes instance features into account.

    Parameters
    ----------
    configspace : ConfigurationSpace
        The configuration space the model is built over (forwarded to the base class).
    n_trees : int, defaults to N_TREES
        The number of trees in the random forest.
    n_points_per_tree : int, defaults to -1
        Number of points per tree. If the value is smaller than 0, the number of samples
        will be used.
    ratio_features : float, defaults to 5.0 / 6.0
        The ratio of features that are considered for splitting.
    min_samples_split : int, defaults to 3
        The minimum number of data points to perform a split.
    min_samples_leaf : int, defaults to 3
        The minimum number of data points in a leaf.
    max_depth : int, defaults to 2**20
        The maximum depth of a single tree.
    eps_purity : float, defaults to 1e-8
        The minimum difference between two target values to be considered.
    max_nodes : int, defaults to 2**20
        The maximum total number of nodes in a tree.
    bootstrapping : bool, defaults to True
        Enables bootstrapping.
    log_y : bool, defaults to False
        The y values (passed to this random forest) are expected to be log(y) transformed.
        This will be considered during predicting.
    instance_features : dict[str, list[int | float]] | None, defaults to None
        Features (list of int or floats) of the instances (str). The features are
        incorporated into the X data, on which the model is trained on.
    pca_components : int | None, defaults to 7
        Number of components to keep when using PCA to reduce dimensionality of
        instance features.
    seed : int, defaults to 0
        The seed used to initialize pyrfr's random engine.
    """
    super().__init__(
        configspace=configspace,
        instance_features=instance_features,
        pca_components=pca_components,
        seed=seed,
    )

    # A ratio above 1.0 maps to 0; presumably 0 tells pyrfr to consider all
    # features when splitting — TODO confirm against the pyrfr documentation.
    max_features = 0 if ratio_features > 1.0 else max(1, int(len(self._types) * ratio_features))

    # Configure the underlying pyrfr forest options.
    self._rf_opts = regression.forest_opts()
    self._rf_opts.num_trees = n_trees
    self._rf_opts.do_bootstrapping = bootstrapping
    self._rf_opts.tree_opts.max_features = max_features
    self._rf_opts.tree_opts.min_samples_to_split = min_samples_split
    self._rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
    self._rf_opts.tree_opts.max_depth = max_depth
    self._rf_opts.tree_opts.epsilon_purity = eps_purity
    self._rf_opts.tree_opts.max_num_nodes = max_nodes
    self._rf_opts.compute_law_of_total_variance = False
    # The actual forest is built lazily (not trained yet at construction time).
    self._rf: BinaryForest | None = None
    self._log_y = log_y

    # Cast to `int` in case we get an `np.integer` type
    self._rng = regression.default_random_engine(int(seed))

    # Keep the constructor arguments around as private attributes.
    self._n_trees = n_trees
    self._n_points_per_tree = n_points_per_tree
    self._ratio_features = ratio_features
    self._min_samples_split = min_samples_split
    self._min_samples_leaf = min_samples_leaf
    self._max_depth = max_depth
    self._eps_purity = eps_purity
    self._max_nodes = max_nodes
    self._bootstrapping = bootstrapping

predict #

predict(
    X: ndarray, covariance_type: str | None = "diagonal"
) -> tuple[ndarray, ndarray | None]

Predicts mean and variance for a given X. Internally, calls the method _predict.

PARAMETER DESCRIPTION
X

Input data points.

TYPE: np.ndarray [#samples, #hyperparameters + #features]

covariance_type

Specifies what to return along with the mean. Applied only to Gaussian Processes. Takes four valid inputs: * None: Only the mean is returned. * "std": Standard deviation at test points is returned. * "diagonal": Diagonal of the covariance matrix is returned. * "full": Whole covariance matrix between the test points is returned.

TYPE: str | None DEFAULT: 'diagonal'

RETURNS DESCRIPTION
means

The predictive mean.

TYPE: np.ndarray [#samples, #objectives]

vars

Predictive variance or standard deviation.

TYPE: np.ndarray [#samples, #objectives] or [#samples, #samples] | None

Source code in smac/model/abstract_model.py
def predict(
    self,
    X: np.ndarray,
    covariance_type: str | None = "diagonal",
) -> tuple[np.ndarray, np.ndarray | None]:
    """Predicts mean and variance for a given X. Internally, calls the method `_predict`.

    Parameters
    ----------
    X : np.ndarray [#samples, #hyperparameters + #features]
        Input data points.
    covariance_type: str | None, defaults to "diagonal"
        Specifies what to return along with the mean. Applied only to Gaussian Processes.
        Takes four valid inputs:
        * None: Only the mean is returned.
        * "std": Standard deviation at test points is returned.
        * "diagonal": Diagonal of the covariance matrix is returned.
        * "full": Whole covariance matrix between the test points is returned.

    Returns
    -------
    means : np.ndarray [#samples, #objectives]
        The predictive mean.
    vars : np.ndarray [#samples, #objectives] or [#samples, #samples] | None
        Predictive variance or standard deviation.
    """
    if len(X.shape) != 2:
        raise ValueError("Expected 2d array, got %dd array!" % len(X.shape))

    if X.shape[1] != self._n_hps + self._n_features:
        raise ValueError(
            f"Feature mismatch: X should have {self._n_hps} hyperparameters + {self._n_features} features, "
            f"but has {X.shape[1]} in total."
        )

    if self._apply_pca:
        try:
            X_feats = X[:, -self._n_features :]
            X_feats = self._scaler.transform(X_feats)
            X_feats = self._pca.transform(X_feats)
            X = np.hstack((X[:, : self._n_hps], X_feats))
        except NotFittedError:
            # PCA not fitted if only one training sample
            pass

    if X.shape[1] != len(self._types):
        raise ValueError("Rows in X should have %d entries but have %d!" % (len(self._types), X.shape[1]))

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", "Predicted variances smaller than 0. Setting those variances to 0.")
        mean, var = self._predict(X, covariance_type)

    if len(mean.shape) == 1:
        mean = mean.reshape((-1, 1))

    if var is not None and len(var.shape) == 1:
        var = var.reshape((-1, 1))

    return mean, var

predict_marginalized #

predict_marginalized(X: ndarray) -> tuple[ndarray, ndarray]

Predicts mean and variance marginalized over all instances.

Note

The method is random forest specific and follows the SMAC2 implementation. It requires no distribution assumption to marginalize the uncertainty estimates.

PARAMETER DESCRIPTION
X

Input data points.

TYPE: np.ndarray [#samples, #hyperparameter + #features]

RETURNS DESCRIPTION
means

The predictive mean.

TYPE: np.ndarray [#samples, 1]

vars

The predictive variance.

TYPE: np.ndarray [#samples, 1]

Source code in smac/model/random_forest/pyrfr/random_forest_pyrfr.py
def predict_marginalized(self, X: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Predicts mean and variance marginalized over all instances.

    Note
    ----
    The method is random forest specific and follows the SMAC2 implementation. It requires
    no distribution assumption to marginalize the uncertainty estimates.

    Parameters
    ----------
    X : np.ndarray [#samples, #hyperparameter + #features]
        Input data points.

    Returns
    -------
    means : np.ndarray [#samples, 1]
        The predictive mean.
    vars : np.ndarray [#samples, 1]
        The predictive variance.
    """
    if self._n_features == 0:
        # Without instance features there is nothing to marginalize over:
        # fall back to a plain prediction and only clip degenerate variances.
        mean_, var = self.predict(X)
        assert var is not None

        var[var < self._var_threshold] = self._var_threshold
        var[np.isnan(var)] = self._var_threshold

        return mean_, var

    assert self._instance_features is not None

    if len(X.shape) != 2:
        raise ValueError("Expected 2d array, got %dd array!" % len(X.shape))

    if X.shape[1] != len(self._bounds):
        raise ValueError("Rows in X should have %d entries but have %d!" % (len(self._bounds), X.shape[1]))

    assert self._rf is not None
    X = self._impute_inactive(X)

    # Let pyrfr marginalize each configuration over all instance features;
    # the result has one prediction per tree for every row of X.
    X_feat = list(self._instance_features.values())
    dat_ = np.array(self._rf.predict_marginalized_over_instances_batch(X, X_feat, self._log_y))

    # Aggregate across trees. `ndarray.var` never returns None, so the former
    # `if var is None: raise RuntimeError(...)` branch was dead code and is removed.
    mean_ = dat_.mean(axis=1)
    var = dat_.var(axis=1)

    var[var < self._var_threshold] = self._var_threshold

    # Normalize 1-d outputs to column vectors.
    if len(mean_.shape) == 1:
        mean_ = mean_.reshape((-1, 1))
    if len(var.shape) == 1:
        var = var.reshape((-1, 1))

    return mean_, var

train #

train(X: ndarray, Y: ndarray) -> Self

Trains the random forest on X and Y. Internally, calls the method _train.

PARAMETER DESCRIPTION
X

Input data points.

TYPE: np.ndarray [#samples, #hyperparameters + #features]

Y

The corresponding target values.

TYPE: np.ndarray [#samples, #objectives]

RETURNS DESCRIPTION
self

TYPE: AbstractModel

Source code in smac/model/abstract_model.py
def train(self: Self, X: np.ndarray, Y: np.ndarray) -> Self:
    """Trains the random forest on X and Y. Internally, calls the method `_train`.

    Parameters
    ----------
    X : np.ndarray [#samples, #hyperparameters + #features]
        Input data points.
    Y : np.ndarray [#samples, #objectives]
        The corresponding target values.

    Returns
    -------
    self : AbstractModel
    """
    if len(X.shape) != 2:
        raise ValueError("Expected 2d array, got %dd array!" % len(X.shape))

    if X.shape[1] != self._n_hps + self._n_features:
        raise ValueError(
            f"Feature mismatch: X should have {self._n_hps} hyperparameters + {self._n_features} features, "
            f"but has {X.shape[1]} in total."
        )

    if X.shape[0] != Y.shape[0]:
        raise ValueError("X.shape[0] ({}) != y.shape[0] ({})".format(X.shape[0], Y.shape[0]))

    # Reduce dimensionality of features if larger than PCA_DIM
    if (
        self._pca_components is not None
        and X.shape[0] > self._pca.n_components
        and self._n_features >= self._pca_components
    ):
        X_feats = X[:, -self._n_features :]

        # Scale features
        X_feats = self._scaler.fit_transform(X_feats)
        X_feats = np.nan_to_num(X_feats)  # if features with max == min

        # PCA
        X_feats = self._pca.fit_transform(X_feats)
        X = np.hstack((X[:, : self._n_hps], X_feats))

        if hasattr(self, "_types"):
            # For RF, adapt types list
            # if X_feats.shape[0] < self._pca, X_feats.shape[1] == X_feats.shape[0]
            self._types = np.array(
                np.hstack((self._types[: self._n_hps], np.zeros(X_feats.shape[1]))),
                dtype=np.uint,
            )  # type: ignore

        self._apply_pca = True
    else:
        self._apply_pca = False

        if hasattr(self, "_types"):
            self._types = copy.deepcopy(self._initial_types)

    return self._train(X, Y)