Gp
neps.optimizers.bayesian_optimization.models.gp
ComprehensiveGP
ComprehensiveGP(
graph_kernels: Iterable,
hp_kernels: Iterable,
likelihood: float = 0.001,
weights=None,
vectorial_features: list = None,
combined_kernel: str = "sum",
logger=None,
surrogate_model_fit_args: dict = None,
)
Source code in neps/optimizers/bayesian_optimization/models/gp.py
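A construction sketch based only on the signature above. The kernel objects and their concrete types are placeholders, not part of the documented API beyond the parameter names:

```python
from neps.optimizers.bayesian_optimization.models.gp import ComprehensiveGP

# Placeholder kernel objects: substitute the actual graph/HP kernel
# implementations available in your NePS installation.
my_graph_kernel = ...  # e.g. a WL-subtree graph kernel instance (hypothetical)
my_hp_kernel = ...     # e.g. a stationary kernel over scalar HPs (hypothetical)

gp = ComprehensiveGP(
    graph_kernels=[my_graph_kernel],
    hp_kernels=[my_hp_kernel],
    likelihood=1e-3,        # observation-noise jitter on the Gram diagonal
    combined_kernel="sum",  # combine the graph and HP kernels additively
)
```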
dmu_dphi
#
Compute the derivative of the GP posterior mean at the specified input location with respect to the vector embedding of the graph (e.g., if using WL-subtree, this function computes the gradient wrt each subtree pattern)
The derivative is given by

$$\frac{\partial \mu^*}{\partial \phi^*} = \frac{\partial K(\phi, \phi^*)}{\partial \phi^*} K(\phi, \phi)^{-1} \mathbf{y},$$

which follows directly from the GP posterior mean formula: since $K(\phi, \phi)^{-1}$ and $\mathbf{y}$ are both independent of the test points ($X_s$, or $\phi^*$), the posterior gradient is simply the matrix product of the kernel gradient, the inverse Gram matrix, and the training label vector.
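A minimal PyTorch sketch of this matrix product, as an illustration of the formula rather than the library's internal code; the argument names and shapes are assumptions:

```python
import torch

# dk_dphi: (D, N) derivative of the cross-kernel K(phi, phi*) wrt phi*
# K_inv:   (N, N) inverse Gram matrix K(phi, phi)^{-1}
# y:       (N,)   training label vector
def posterior_mean_gradient(dk_dphi: torch.Tensor,
                            K_inv: torch.Tensor,
                            y: torch.Tensor) -> torch.Tensor:
    # d mu*/d phi* = (dK/dphi*) @ K^{-1} @ y, a (D,) vector
    return dk_dphi @ K_inv @ y
```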
Parameters#
X_s: The locations on which the GP posterior mean derivatives should be evaluated. If left blank, the derivatives will be evaluated at the training points.
compute_grad_var: bool. If true, also compute the gradient variance.
The derivative of a GP is also a GP, so the predictive distribution of the posterior gradient is Gaussian. The posterior mean is given above, and the posterior variance is

$$\mathbb{V}\left[\frac{\partial f^*}{\partial \phi^*}\right] = \frac{\partial^2 k(\phi^*, \phi^*)}{\partial {\phi^*}^2} - \frac{\partial k(\phi^*, \Phi)}{\partial \phi^*} K(X, X)^{-1} \frac{\partial k(\Phi, \phi^*)}{\partial \phi^*}.$$
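A matching sketch for the variance term, again with assumed names and shapes:

```python
import torch

# d2k:     (D, D) second derivative of k(phi*, phi*) wrt phi*
# dk_dphi: (D, N) derivative of the cross-kernel k(phi*, Phi) wrt phi*
# K_inv:   (N, N) inverse Gram matrix
def posterior_gradient_variance(d2k: torch.Tensor,
                                dk_dphi: torch.Tensor,
                                K_inv: torch.Tensor) -> torch.Tensor:
    # V[df*/dphi*] = d2k - dk @ K^{-1} @ dk^T, a (D, D) matrix
    return d2k - dk_dphi @ K_inv @ dk_dphi.T
```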
Returns#
A list of K torch.Tensor of shape N x D, where N is the length of the X_s list (each element of which is a networkx graph), K is the number of kernel_operators in the combined kernel, and D is the dimensionality of the feature vector (determined by the specific graph kernel).
OR
A list of K torch.Tensor of shape D, if the averaged_over_samples flag is enabled.
Source code in neps/optimizers/bayesian_optimization/models/gp.py
predict
predict(x_configs, preserve_comp_graph: bool = False)
Kriging (GP posterior) predictions for the given configurations.
Source code in neps/optimizers/bayesian_optimization/models/gp.py
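A hedged usage sketch: I am assuming here that `predict` returns the posterior mean and covariance for the supplied configurations; check the linked source for the exact return signature.

```python
# Hypothetical usage: assumes predict returns (mean, covariance).
mu, cov = gp.predict(x_configs)
std = cov.diag().clamp(min=0).sqrt()  # per-point predictive std from a full covariance
```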
compute_log_marginal_likelihood
compute_log_marginal_likelihood(
K_i: Tensor,
logDetK: Tensor,
y: Tensor,
normalize: bool = True,
log_prior_dist=None,
)
Compute the zero-mean Gaussian process log marginal likelihood given the inverse of the Gram matrix $K(X, X)$, its log determinant, and the training label vector y. Options:
normalize: normalize the log marginal likelihood by the length of the label vector, as per the gpytorch routine.
log_prior_dist: a PyTorch distribution object. If specified, the hyperparameter prior is taken into account and we use Type-II MAP instead of Type-II MLE (i.e., compute the log posterior instead of the log evidence).
Source code in neps/optimizers/bayesian_optimization/models/gp.py
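For reference, a self-contained sketch of the quantity this routine evaluates; the sign convention for the log-determinant argument is my assumption (I treat it as $\log |K|$), so check the linked source before relying on it:

```python
import math
import torch

# Sketch of the zero-mean GP log marginal likelihood (not the library's code):
# log p(y) = -0.5 * y^T K^{-1} y - 0.5 * log|K| - (n/2) * log(2*pi)
def log_marginal_likelihood(K_inv: torch.Tensor,
                            logdet_K: torch.Tensor,
                            y: torch.Tensor,
                            normalize: bool = True) -> torch.Tensor:
    n = y.shape[0]
    lml = -0.5 * (y @ K_inv @ y) - 0.5 * logdet_K - 0.5 * n * math.log(2 * math.pi)
    # As in gpytorch, optionally normalize by the number of training points.
    return lml / n if normalize else lml
```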
compute_pd_inverse
Compute the inverse of a positive-(semi)definite matrix K using Cholesky inversion.
Source code in neps/optimizers/bayesian_optimization/models/gp.py
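A sketch of the standard recipe, assuming torch tensors; the jitter value is an illustrative choice:

```python
import torch

def pd_inverse(K: torch.Tensor, jitter: float = 1e-6):
    n = K.shape[0]
    # A small diagonal jitter keeps the factorization stable for
    # near-singular (semi-definite) matrices.
    L = torch.linalg.cholesky(K + jitter * torch.eye(n, dtype=K.dtype))
    K_inv = torch.cholesky_inverse(L)
    # log|K| falls out of the factor for free: log|K| = 2 * sum(log(diag(L)))
    logdet_K = 2.0 * torch.log(torch.diag(L)).sum()
    return K_inv, logdet_K
```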
get_grad
Average the gradients across samples via a Monte Carlo sampling scheme; also estimate the empirical variance. :param average_occurrences: if True, do a weighted summation based on the frequency distribution of the occurrences to compute a gradient per feature. Otherwise, each distinct occurrence ($\phi_i = k$) gets a different gradient estimate.
Source code in neps/optimizers/bayesian_optimization/models/gp.py
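A sketch of the averaging step, assuming an `(S, D)` tensor of per-sample gradient estimates; the frequency-weighted branch stands in for the `average_occurrences=True` path described above:

```python
import torch

def average_gradients(grads: torch.Tensor, counts: torch.Tensor | None = None):
    # grads: (S, D) sampled gradient estimates; counts: optional (S,) frequencies.
    if counts is not None:
        # Weighted summation: weight each sample by its relative frequency.
        w = counts / counts.sum()
        mean = (w[:, None] * grads).sum(dim=0)
    else:
        mean = grads.mean(dim=0)           # plain Monte Carlo average
    var = grads.var(dim=0, unbiased=True)  # empirical variance across samples
    return mean, var
```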
standardize_x
Standardize the vectorial input into a d-dimensional hypercube $[0, 1]^d$, where d is the number of features. If x_min and x_max are supplied, x is standardized using these instead; this is used when standardizing the validation/test inputs.
Source code in neps/optimizers/bayesian_optimization/models/gp.py
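A sketch of the scaling, assuming torch tensors; reusing the training `x_min`/`x_max` for validation/test inputs is the pattern described above:

```python
import torch

def standardize_x(x: torch.Tensor, x_min=None, x_max=None):
    # Derive per-feature bounds from the data unless they are supplied
    # (e.g. training-set bounds reused for validation/test inputs).
    if x_min is None:
        x_min = x.min(dim=0).values
        x_max = x.max(dim=0).values
    x_std = (x - x_min) / (x_max - x_min)  # map each feature into [0, 1]
    return x_std, x_min, x_max
```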
unnormalize_y
Undo the pre-processing (standardization) step above, but applied to the output predictions, mapping them back to the original scale.
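A sketch of the inverse transform, assuming the labels were standardized with a stored training mean and standard deviation:

```python
import torch

def unnormalize_y(y_pred: torch.Tensor,
                  y_mean: torch.Tensor,
                  y_std: torch.Tensor) -> torch.Tensor:
    # Invert y_norm = (y - y_mean) / y_std to recover the original scale.
    return y_pred * y_std + y_mean
```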