Skip to content

Random forest pyrfr

smac.model.random_forest.pyrfr.random_forest_pyrfr #

PyrfrRandomForest #

PyrfrRandomForest(
    configspace: ConfigurationSpace,
    n_trees: int = N_TREES,
    n_points_per_tree: int = -1,
    ratio_features: float = 5.0 / 6.0,
    min_samples_split: int = 3,
    min_samples_leaf: int = 3,
    max_depth: int = 2**20,
    eps_purity: float = 1e-08,
    max_nodes: int = 2**20,
    bootstrapping: bool = True,
    log_y: bool = False,
    instance_features: (
        dict[str, list[int | float]] | None
    ) = None,
    pca_components: int | None = 7,
    seed: int = 0,
)

Bases: AbstractRandomForest

Random forest that takes instance features into account.

PARAMETER DESCRIPTION
n_trees

The number of trees in the random forest.

TYPE: int, defaults to `N_TREES` DEFAULT: N_TREES

n_points_per_tree

Number of points per tree. If the value is smaller than 0, the number of samples will be used.

TYPE: int, defaults to -1 DEFAULT: -1

ratio_features

The ratio of features that are considered for splitting.

TYPE: float, defaults to 5.0 / 6.0 DEFAULT: 5.0 / 6.0

min_samples_split

The minimum number of data points to perform a split.

TYPE: int, defaults to 3 DEFAULT: 3

min_samples_leaf

The minimum number of data points in a leaf.

TYPE: int, defaults to 3 DEFAULT: 3

max_depth

The maximum depth of a single tree.

TYPE: int, defaults to 2**20 DEFAULT: 2 ** 20

eps_purity

The minimum difference between two target values to be considered.

TYPE: float, defaults to 1e-8 DEFAULT: 1e-08

max_nodes

The maximum total number of nodes in a tree.

TYPE: int, defaults to 2**20 DEFAULT: 2 ** 20

bootstrapping

Enables bootstrapping.

TYPE: bool, defaults to True DEFAULT: True

log_y

The y values (passed to this random forest) are expected to be log(y) transformed. This will be considered during predicting.

TYPE: bool DEFAULT: False

instance_features

Features (list of int or floats) of the instances (str). The features are incorporated into the X data, on which the model is trained on.

TYPE: dict[str, list[int | float]] | None, defaults to None DEFAULT: None

pca_components

Number of components to keep when using PCA to reduce dimensionality of instance features.

TYPE: int | None, defaults to 7 DEFAULT: 7

seed

The seed used to initialize pyrfr's random engine.

TYPE: int DEFAULT: 0

Source code in smac/model/random_forest/pyrfr/random_forest_pyrfr.py
def __init__(
    self,
    configspace: ConfigurationSpace,
    n_trees: int = N_TREES,
    n_points_per_tree: int = -1,
    ratio_features: float = 5.0 / 6.0,
    min_samples_split: int = 3,
    min_samples_leaf: int = 3,
    max_depth: int = 2**20,
    eps_purity: float = 1e-8,
    max_nodes: int = 2**20,
    bootstrapping: bool = True,
    log_y: bool = False,
    instance_features: dict[str, list[int | float]] | None = None,
    pca_components: int | None = 7,
    seed: int = 0,
) -> None:
    """Random forest (pyrfr backend) that takes instance features into account.

    Parameters
    ----------
    configspace : ConfigurationSpace
        The configuration space the model is built over (forwarded to the base class).
    n_trees : int, defaults to N_TREES
        The number of trees in the random forest.
    n_points_per_tree : int, defaults to -1
        Number of points per tree. If the value is smaller than 0, the number of samples
        will be used.
    ratio_features : float, defaults to 5.0 / 6.0
        The ratio of features that are considered for splitting.
    min_samples_split : int, defaults to 3
        The minimum number of data points to perform a split.
    min_samples_leaf : int, defaults to 3
        The minimum number of data points in a leaf.
    max_depth : int, defaults to 2**20
        The maximum depth of a single tree.
    eps_purity : float, defaults to 1e-8
        The minimum difference between two target values to be considered.
    max_nodes : int, defaults to 2**20
        The maximum total number of nodes in a tree.
    bootstrapping : bool, defaults to True
        Enables bootstrapping.
    log_y : bool, defaults to False
        The y values (passed to this random forest) are expected to be log(y) transformed.
        This will be considered during predicting.
    instance_features : dict[str, list[int | float]] | None, defaults to None
        Features (list of int or floats) of the instances (str). The features are
        incorporated into the X data, on which the model is trained on.
    pca_components : int | None, defaults to 7
        Number of components to keep when using PCA to reduce dimensionality of
        instance features.
    seed : int, defaults to 0
        The seed used to initialize pyrfr's random engine.
    """
    super().__init__(
        configspace=configspace,
        instance_features=instance_features,
        pca_components=pca_components,
        seed=seed,
    )

    # A ratio above 1.0 maps to 0; presumably 0 tells pyrfr to consider all
    # features when splitting — TODO confirm against the pyrfr documentation.
    max_features = 0 if ratio_features > 1.0 else max(1, int(len(self._types) * ratio_features))

    # Configure the underlying pyrfr forest options.
    self._rf_opts = regression.forest_opts()
    self._rf_opts.num_trees = n_trees
    self._rf_opts.do_bootstrapping = bootstrapping
    self._rf_opts.tree_opts.max_features = max_features
    self._rf_opts.tree_opts.min_samples_to_split = min_samples_split
    self._rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
    self._rf_opts.tree_opts.max_depth = max_depth
    self._rf_opts.tree_opts.epsilon_purity = eps_purity
    self._rf_opts.tree_opts.max_num_nodes = max_nodes
    self._rf_opts.compute_law_of_total_variance = False
    # The actual forest is built lazily (not trained yet at construction time).
    self._rf: BinaryForest | None = None
    self._log_y = log_y

    # Cast to `int` in case we get an `np.integer` type
    self._rng = regression.default_random_engine(int(seed))

    # Keep the constructor arguments around as private attributes.
    self._n_trees = n_trees
    self._n_points_per_tree = n_points_per_tree
    self._ratio_features = ratio_features
    self._min_samples_split = min_samples_split
    self._min_samples_leaf = min_samples_leaf
    self._max_depth = max_depth
    self._eps_purity = eps_purity
    self._max_nodes = max_nodes
    self._bootstrapping = bootstrapping

predict #

predict(
    X: ndarray, covariance_type: str | None = "diagonal"
) -> tuple[ndarray, ndarray | None]

Predicts mean and variance for a given X. Internally, calls the method _predict.

PARAMETER DESCRIPTION
X

Input data points.

TYPE: np.ndarray [#samples, #hyperparameters + #features]

covariance_type

Specifies what to return along with the mean. Applied only to Gaussian Processes. Takes four valid inputs: * None: Only the mean is returned. * "std": Standard deviation at test points is returned. * "diagonal": Diagonal of the covariance matrix is returned. * "full": Whole covariance matrix between the test points is returned.

TYPE: str | None DEFAULT: 'diagonal'

RETURNS DESCRIPTION
means

The predictive mean.

TYPE: np.ndarray [#samples, #objectives]

vars

Predictive variance or standard deviation.

TYPE: np.ndarray [#samples, #objectives] or [#samples, #samples] | None

Source code in smac/model/abstract_model.py
def predict(
    self,
    X: np.ndarray,
    covariance_type: str | None = "diagonal",
) -> tuple[np.ndarray, np.ndarray | None]:
    """Predicts mean and variance for a given X. Internally, calls the method `_predict`.

    Parameters
    ----------
    X : np.ndarray [#samples, #hyperparameters + #features]
        Input data points.
    covariance_type: str | None, defaults to "diagonal"
        Specifies what to return along with the mean. Applied only to Gaussian Processes.
        Takes four valid inputs:
        * None: Only the mean is returned.
        * "std": Standard deviation at test points is returned.
        * "diagonal": Diagonal of the covariance matrix is returned.
        * "full": Whole covariance matrix between the test points is returned.

    Returns
    -------
    means : np.ndarray [#samples, #objectives]
        The predictive mean.
    vars : np.ndarray [#samples, #objectives] or [#samples, #samples] | None
        Predictive variance or standard deviation.
    """
    if len(X.shape) != 2:
        raise ValueError("Expected 2d array, got %dd array!" % len(X.shape))

    if X.shape[1] != self._n_hps + self._n_features:
        raise ValueError(
            f"Feature mismatch: X should have {self._n_hps} hyperparameters + {self._n_features} features, "
            f"but has {X.shape[1]} in total."
        )

    if self._apply_pca:
        try:
            X_feats = X[:, -self._n_features :]
            X_feats = self._scaler.transform(X_feats)
            X_feats = self._pca.transform(X_feats)
            X = np.hstack((X[:, : self._n_hps], X_feats))
        except NotFittedError:
            # PCA not fitted if only one training sample
            pass

    if X.shape[1] != len(self._types):
        raise ValueError("Rows in X should have %d entries but have %d!" % (len(self._types), X.shape[1]))

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", "Predicted variances smaller than 0. Setting those variances to 0.")
        mean, var = self._predict(X, covariance_type)

    if len(mean.shape) == 1:
        mean = mean.reshape((-1, 1))

    if var is not None and len(var.shape) == 1:
        var = var.reshape((-1, 1))

    return mean, var

predict_marginalized #

predict_marginalized(X: ndarray) -> tuple[ndarray, ndarray]

Predicts mean and variance marginalized over all instances.

Note

The method is random forest specific and follows the SMAC2 implementation. It requires no distribution assumption to marginalize the uncertainty estimates.

PARAMETER DESCRIPTION
X

Input data points.

TYPE: np.ndarray [#samples, #hyperparameter + #features]

RETURNS DESCRIPTION
means

The predictive mean.

TYPE: np.ndarray [#samples, 1]

vars

The predictive variance.

TYPE: np.ndarray [#samples, 1]

Source code in smac/model/random_forest/pyrfr/random_forest_pyrfr.py
def predict_marginalized(self, X: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Predicts mean and variance marginalized over all instances.

    Note
    ----
    The method is random forest specific and follows the SMAC2 implementation. It requires
    no distribution assumption to marginalize the uncertainty estimates.

    Parameters
    ----------
    X : np.ndarray [#samples, #hyperparameter + #features]
        Input data points.

    Returns
    -------
    means : np.ndarray [#samples, 1]
        The predictive mean.
    vars : np.ndarray [#samples, 1]
        The predictive variance.
    """
    if self._n_features == 0:
        # Without instance features there is nothing to marginalize over:
        # fall back to a plain prediction and only clip degenerate variances.
        mean_, var = self.predict(X)
        assert var is not None

        var[var < self._var_threshold] = self._var_threshold
        var[np.isnan(var)] = self._var_threshold

        return mean_, var

    assert self._instance_features is not None

    if len(X.shape) != 2:
        raise ValueError("Expected 2d array, got %dd array!" % len(X.shape))

    if X.shape[1] != len(self._bounds):
        raise ValueError("Rows in X should have %d entries but have %d!" % (len(self._bounds), X.shape[1]))

    assert self._rf is not None
    X = self._impute_inactive(X)

    # Let pyrfr marginalize each configuration over all instance features;
    # the result has one prediction per tree for every row of X.
    X_feat = list(self._instance_features.values())
    dat_ = np.array(self._rf.predict_marginalized_over_instances_batch(X, X_feat, self._log_y))

    # Aggregate across trees. `ndarray.var` never returns None, so the former
    # `if var is None: raise RuntimeError(...)` branch was dead code and is removed.
    mean_ = dat_.mean(axis=1)
    var = dat_.var(axis=1)

    var[var < self._var_threshold] = self._var_threshold

    # Normalize 1-d outputs to column vectors.
    if len(mean_.shape) == 1:
        mean_ = mean_.reshape((-1, 1))
    if len(var.shape) == 1:
        var = var.reshape((-1, 1))

    return mean_, var

train #

train(X: ndarray, Y: ndarray) -> Self

Trains the random forest on X and Y. Internally, calls the method _train.

PARAMETER DESCRIPTION
X

Input data points.

TYPE: np.ndarray [#samples, #hyperparameters + #features]

Y

The corresponding target values.

TYPE: np.ndarray [#samples, #objectives]

RETURNS DESCRIPTION
self

TYPE: AbstractModel

Source code in smac/model/abstract_model.py
def train(self: Self, X: np.ndarray, Y: np.ndarray) -> Self:
    """Trains the random forest on X and Y. Internally, calls the method `_train`.

    Parameters
    ----------
    X : np.ndarray [#samples, #hyperparameters + #features]
        Input data points.
    Y : np.ndarray [#samples, #objectives]
        The corresponding target values.

    Returns
    -------
    self : AbstractModel
    """
    if len(X.shape) != 2:
        raise ValueError("Expected 2d array, got %dd array!" % len(X.shape))

    if X.shape[1] != self._n_hps + self._n_features:
        raise ValueError(
            f"Feature mismatch: X should have {self._n_hps} hyperparameters + {self._n_features} features, "
            f"but has {X.shape[1]} in total."
        )

    if X.shape[0] != Y.shape[0]:
        raise ValueError("X.shape[0] ({}) != y.shape[0] ({})".format(X.shape[0], Y.shape[0]))

    # Reduce dimensionality of features if larger than PCA_DIM
    if (
        self._pca_components is not None
        and X.shape[0] > self._pca.n_components
        and self._n_features >= self._pca_components
    ):
        X_feats = X[:, -self._n_features :]

        # Scale features
        X_feats = self._scaler.fit_transform(X_feats)
        X_feats = np.nan_to_num(X_feats)  # if features with max == min

        # PCA
        X_feats = self._pca.fit_transform(X_feats)
        X = np.hstack((X[:, : self._n_hps], X_feats))

        if hasattr(self, "_types"):
            # For RF, adapt types list
            # if X_feats.shape[0] < self._pca, X_feats.shape[1] == X_feats.shape[0]
            self._types = np.array(
                np.hstack((self._types[: self._n_hps], np.zeros(X_feats.shape[1]))),
                dtype=np.uint,
            )  # type: ignore

        self._apply_pca = True
    else:
        self._apply_pca = False

        if hasattr(self, "_types"):
            self._types = copy.deepcopy(self._initial_types)

    return self._train(X, Y)