Utils

neps.optimizers.multi_fidelity.utils #

MFObservedData #

MFObservedData(
    columns: list[str] | None = None,
    index_names: list[str] | None = None,
)

(Under development)

This class is used to unify data access across the different multi-fidelity optimizers. It stores the column names and index names of the observed data. Possible optimizations and extensions of the observed data should be handled by this class.

So far this is just a draft class containing the DataFrame and some properties.

Source code in neps/optimizers/multi_fidelity/utils.py
def __init__(
    self,
    columns: list[str] | None = None,
    index_names: list[str] | None = None,
):
    # Fall back to the class-level default column and index names
    if columns is None:
        columns = [self.default_config_col, self.default_perf_col]
    if index_names is None:
        index_names = [self.default_config_idx, self.default_budget_idx]

    self.config_col = columns[0]
    self.perf_col = columns[1]

    # An optional third column holds the learning curve
    if len(columns) > 2:
        self.lc_col_name = columns[2]
    else:
        self.lc_col_name = self.default_lc_col

    # If only a config index is given, add a budget level to the index
    if len(index_names) == 1:
        index_names += ["budget_id"]

    self.config_idx = index_names[0]
    self.budget_idx = index_names[1]

    # Start with an empty frame indexed by (config_id, budget_id)
    index = pd.MultiIndex.from_tuples([], names=index_names)

    self.df = pd.DataFrame([], columns=columns, index=index)
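
A minimal usage sketch (hedged: the default column and index names are class attributes not shown in this excerpt, so explicit names are passed here):

from neps.optimizers.multi_fidelity.utils import MFObservedData

observed = MFObservedData(
    columns=["config", "perf"],
    index_names=["config_id", "budget_id"],
)
print(observed.df.empty)        # True: starts as an empty frame
print(observed.df.index.names)  # ['config_id', 'budget_id']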

add_data #

add_data(
    data: list[Any] | list[list[Any]],
    index: (
        tuple[int, ...]
        | Sequence[tuple[int, ...]]
        | Sequence[int]
        | int
    ),
    error: bool = False,
)

Add data only if none of the given indices already exist in the DataFrame

Source code in neps/optimizers/multi_fidelity/utils.py
def add_data(
    self,
    data: list[Any] | list[list[Any]],
    index: tuple[int, ...] | Sequence[tuple[int, ...]] | Sequence[int] | int,
    error: bool = False,
):
    """
    Add data only if none of the indices are already existing in the DataFrame
    """
    # TODO: If index is only config_id extend it
    if not isinstance(index, list):
        index_list = [index]
        data_list = [data]
    else:
        index_list = index
        data_list = data

    if not self.df.index.isin(index_list).any():
        _df = pd.DataFrame(data_list, columns=self.df.columns, index=index_list)
        self.df = pd.concat((self.df, _df))
    elif error:
        raise ValueError(
            f"Data with at least one of the given indices already "
            f"exists: {self.df[self.df.index.isin(index_list)]}\n"
            f"Given indices: {index_list}"
        )
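
A hedged usage sketch; the strings stand in for the configuration objects an optimizer would actually store:

from neps.optimizers.multi_fidelity.utils import MFObservedData

observed = MFObservedData(columns=["config", "perf"])
# rows are keyed by (config_id, budget_id) tuples matching the MultiIndex
observed.add_data(data=[["cfg-0", 0.42], ["cfg-1", 0.37]], index=[(0, 0), (1, 0)])
# re-adding an existing index is silently skipped unless error=True,
# in which case a ValueError is raised
observed.add_data(data=["cfg-0", 0.40], index=(0, 0))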

get_best_learning_curve_id #

get_best_learning_curve_id(maximize: bool = False)

Returns a single configuration id of the best observed performance.

Note: if two configurations have the same performance, this will always return the lowest configuration id.
Source code in neps/optimizers/multi_fidelity/utils.py
def get_best_learning_curve_id(self, maximize: bool = False):
    """
    Returns a single configuration id of the best observed performance

    Note: this will always return the single best lowest ID
          if two configurations has the same performance
    """
    learning_curves = self.get_learning_curves()
    if maximize:
        return learning_curves.max(axis=1).idxmax()
    else:
        return learning_curves.min(axis=1).idxmin()
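
The tie-breaking can be illustrated with a plain pandas frame shaped like the assumed output of get_learning_curves() (one row per config id, one column per budget id):

import pandas as pd

# configs 0 and 1 both reach a best value of 0.5
lc = pd.DataFrame({0: [0.9, 0.8], 1: [0.5, 0.5]}, index=[0, 1])
# min(axis=1) takes each config's best value; idxmin then returns the
# first (lowest) config id among the tied rows
print(lc.min(axis=1).idxmin())  # 0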

get_best_performance_for_each_budget #

get_best_performance_for_each_budget(
    maximize: bool = False,
)

Returns a series object with the best observed performance for each budget id.
Source code in neps/optimizers/multi_fidelity/utils.py
def get_best_performance_for_each_budget(self, maximize: bool = False):
    """
    Returns a series object with the best partial configuration for each budget id

    Note: this will always map the best lowest ID if two configurations
          has the same performance at the same fidelity
    """
    learning_curves = self.get_learning_curves()
    if maximize:
        performance = learning_curves.max(axis=0)
    else:
        performance = learning_curves.min(axis=0)

    return performance
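
With the same assumed orientation (rows = config ids, columns = budget ids), the reduction runs over configs and leaves one value per budget:

import pandas as pd

lc = pd.DataFrame({0: [0.9, 0.7], 1: [0.5, 0.6]}, index=[0, 1])
print(lc.min(axis=0))  # budget 0 -> 0.7, budget 1 -> 0.5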

get_incumbents_for_budgets #

get_incumbents_for_budgets(maximize: bool = False)

Returns a series object with the best partial configuration for each budget id.

Note: if two configurations have the same performance at the same fidelity, this will always map the lowest configuration id.
Source code in neps/optimizers/multi_fidelity/utils.py
def get_incumbents_for_budgets(self, maximize: bool = False):
    """
    Returns a series object with the best partial configuration for each budget id

    Note: this will always map the best lowest ID if two configurations
          has the same performance at the same fidelity
    """
    learning_curves = self.get_learning_curves()
    if maximize:
        config_ids = learning_curves.idxmax(axis=0)
    else:
        config_ids = learning_curves.idxmin(axis=0)

    indices = list(zip(config_ids.values.tolist(), config_ids.index.to_list()))
    partial_configs = self.df.loc[indices, self.config_col].to_list()
    return pd.Series(partial_configs, index=config_ids.index, name=self.config_col)
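
The distinctive step is turning the per-budget winners into (config_id, budget_id) tuples that can be looked up in the MultiIndexed frame; a small illustration with the same assumed layout:

import pandas as pd

lc = pd.DataFrame({0: [0.9, 0.7], 1: [0.5, 0.6]}, index=[0, 1])
config_ids = lc.idxmin(axis=0)  # best config id per budget: 1, then 0
indices = list(zip(config_ids.values.tolist(), config_ids.index.to_list()))
print(indices)  # [(1, 0), (0, 1)]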

tokenize #

tokenize(df: DataFrame, as_tensor: bool = False)

Function to format data for PFN.

Source code in neps/optimizers/multi_fidelity/utils.py
def tokenize(self, df: pd.DataFrame, as_tensor: bool = False):
    """Function to format data for PFN."""
    configs = np.array([normalize_vectorize_config(c) for c in df])
    fidelity = np.array([c.fidelity.value for c in df]).reshape(-1, 1)
    idx = df.index.values.reshape(-1, 1)

    data = np.hstack([idx, fidelity, configs])

    if as_tensor:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        data = torch.Tensor(data).to(device)
    return data
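
The token layout can be illustrated with dummy stand-ins for the output of normalize_vectorize_config (here a hypothetical two-dimensional encoding):

import numpy as np

idx = np.array([[0], [1]])                    # config indices
fidelity = np.array([[1.0], [3.0]])           # fidelity values
configs = np.array([[0.2, 0.5], [0.7, 0.1]])  # normalized config vectors
tokens = np.hstack([idx, fidelity, configs])
print(tokens.shape)  # (2, 4): each row is [index, fidelity, config...]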

update_data #

update_data(
    data_dict: dict[str, list[Any]],
    index: (
        tuple[int, ...]
        | Sequence[tuple[int, ...]]
        | Sequence[int]
        | int
    ),
    error: bool = False,
)

Update data only if all the given indices already exist in the DataFrame

Source code in neps/optimizers/multi_fidelity/utils.py
def update_data(
    self,
    data_dict: dict[str, list[Any]],
    index: tuple[int, ...] | Sequence[tuple[int, ...]] | Sequence[int] | int,
    error: bool = False,
):
    """
    Update data if all the indices already exist in the DataFrame
    """
    if not isinstance(index, list):
        index_list = [index]
    else:
        index_list = index
    if self.df.index.isin(index_list).sum() == len(index_list):
        column_names, data = zip(*data_dict.items())
        data = list(zip(*data))
        self.df.loc[index_list, list(column_names)] = data

    elif error:
        raise ValueError(
            f"Data with at least one of the given indices doesn't "
            f"exist.\n Existing indices: {self.df.index}\n"
            f"Given indices: {index_list}"
        )
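
A hedged sketch of an in-place update; the keys of data_dict are column names, each mapped to a list with one entry per updated index:

from neps.optimizers.multi_fidelity.utils import MFObservedData

observed = MFObservedData(columns=["config", "perf"])
observed.add_data(data=[["cfg-0", 0.42]], index=[(0, 0)])
observed.update_data(data_dict={"perf": [0.31]}, index=(0, 0))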

continuous_to_tabular #

continuous_to_tabular(
    config: SearchSpace, categorical_space: SearchSpace
) -> SearchSpace

Convert the continuous parameters in the config into categorical ones, based on the categorical_space provided.

Source code in neps/optimizers/multi_fidelity/utils.py
def continuous_to_tabular(
    config: SearchSpace, categorical_space: SearchSpace
) -> SearchSpace:
    """
    Convert the continuous parameters in the config into categorical ones based on
    the categorical_space provided
    """
    result = config.copy()
    for hp_name, _ in config.items():
        if hp_name in categorical_space.keys():
            choices = np.array(categorical_space[hp_name].choices)
            diffs = choices - config[hp_name].value
            # NOTE: in case of a tie the first value in the choices array will be returned
            closest = choices[np.abs(diffs).argmin()]
            result[hp_name].value = closest

    return result
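
The snapping logic in isolation; np.argmin returns the first index on ties, which is what the NOTE above refers to:

import numpy as np

choices = np.array([0.1, 0.3, 0.5])
closest = choices[np.abs(choices - 0.27).argmin()]
print(closest)  # 0.3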