tabular

class TabularBenchmark(name, table, *, config_name, fidelity_name, result_keys, config_keys, remove_constants=False, space=None, seed=None, prior=None, perturb_prior=None) #

Bases: Benchmark[CTabular, R, F]

PARAMETER DESCRIPTION
name

The name of this benchmark.

TYPE: str

table

The table to use for the benchmark.

TYPE: DataFrame

config_name

The column in the table that contains the config id

TYPE: str

fidelity_name

The column in the table that contains the fidelity

TYPE: str

result_keys

The columns in the table that contain the results

TYPE: Sequence[str]

config_keys

The columns in the table that contain the config values

TYPE: Sequence[str]

remove_constants

Remove constant config columns from the data or not.

TYPE: bool DEFAULT: False

space

The configuration space to use for the benchmark. If None, will just be an empty space.

TYPE: ConfigurationSpace | None DEFAULT: None

prior

The prior to use for the benchmark. If None, no prior is used. If a string, will be treated as a prior specific for this benchmark if it can be found, otherwise assumes it to be a Path. If a Path, will load the prior from the path. If a dict or Configuration, will be used directly.

TYPE: str | Path | CTabular | Mapping[str, Any] | None DEFAULT: None

perturb_prior

If not None, will perturb the prior by this amount. For numericals, this is interpreted as the standard deviation of a normal distribution, while for categoricals, this is interpreted as the probability of swapping the value for a random one.

TYPE: float | None DEFAULT: None

seed

The seed to use for the benchmark.

TYPE: int | None DEFAULT: None

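As a quick illustration of the table layout the constructor expects, the following sketch builds a small DataFrame that passes its checks: every config id appears at every fidelity, the fidelities are equidistant, and no column is named "id". All column names and values here are hypothetical; in practice such a table is usually passed to the concrete GenericTabularBenchmark documented further down rather than to TabularBenchmark directly, since the constructor relies on a class-level Config type (see cls.Config.from_dict in the source below).

import pandas as pd

# Hypothetical table: three configs ("a", "b", "c"), each evaluated at the
# equidistant fidelities 1, 2 and 3. "config" is the config id column,
# "epoch" the fidelity column, "lr" a config column and "error" a result.
table = pd.DataFrame(
    {
        "config": ["a"] * 3 + ["b"] * 3 + ["c"] * 3,
        "epoch": [1, 2, 3] * 3,
        "lr": [0.1] * 3 + [0.01] * 3 + [0.001] * 3,
        "error": [0.9, 0.7, 0.5, 0.8, 0.6, 0.4, 0.95, 0.9, 0.85],
    },
)
# The constructor indexes this as (config, epoch), verifies that every
# fidelity value occurs equally often, and rejects any column literally
# named "id", which is reserved for identifying configs.
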
Source code in src/mfpbench/tabular.py
def __init__(  # noqa: PLR0913, C901
    self,
    name: str,
    table: pd.DataFrame,
    *,
    config_name: str,
    fidelity_name: str,
    result_keys: Sequence[str],
    config_keys: Sequence[str],
    remove_constants: bool = False,
    space: ConfigurationSpace | None = None,
    seed: int | None = None,
    prior: str | Path | CTabular | Mapping[str, Any] | None = None,
    perturb_prior: float | None = None,
):
    """Initialize the benchmark.

    Args:
        name: The name of this benchmark.
        table: The table to use for the benchmark.
        config_name: The column in the table that contains the config id
        fidelity_name: The column in the table that contains the fidelity
        result_keys: The columns in the table that contain the results
        config_keys: The columns in the table that contain the config values
        remove_constants: Remove constant config columns from the data or not.
        space: The configuration space to use for the benchmark. If None, will
            just be an empty space.
        prior: The prior to use for the benchmark. If None, no prior is used.
            If a string, will be treated as a prior specific for this benchmark
            if it can be found, otherwise assumes it to be a Path.
            If a Path, will load the prior from the path.
            If a dict or Configuration, will be used directly.
        perturb_prior: If not None, will perturb the prior by this amount.
            For numericals, this is interpreted as the standard deviation of a
            normal distribution, while for categoricals, this is interpreted as
            the probability of swapping the value for a random one.
        seed: The seed to use for the benchmark.
    """
    cls = self.__class__
    if remove_constants:

        def is_constant(_s: pd.Series) -> bool:
            _arr = _s.to_numpy()
            return bool((_arr == _arr[0]).all())

        constant_cols = [
            col for col in table.columns if is_constant(table[col])  # type: ignore
        ]
        table = table.drop(columns=constant_cols)  # type: ignore
        config_keys = [k for k in config_keys if k not in constant_cols]

    # If the table isn't indexed, index it
    index_cols = [config_name, fidelity_name]
    if table.index.names != index_cols:
        # Only drop the index if it's not relevant.
        relevant_cols: list[str] = [  # type: ignore
            *list(index_cols),  # type: ignore
            *list(result_keys),
            *list(config_keys),
        ]
        relevant = any(name in relevant_cols for name in table.index.names)
        table = table.reset_index(drop=not relevant)

        if config_name not in table.columns:
            raise ValueError(f"{config_name=} not in columns {table.columns}")
        if fidelity_name not in table.columns:
            raise ValueError(f"{fidelity_name=} not in columns {table.columns}")

        table = table.set_index(index_cols)
        table = table.sort_index()

    # Make sure all keys are in the table
    for key in chain(result_keys, config_keys):
        if key not in table.columns:
            raise ValueError(f"{key=} not in columns {table.columns}")

    # Make sure the keyword "id" is not in the columns as we use it to
    # identify configs
    if "id" in table.columns:
        raise ValueError(f"{table.columns=} contains 'id'. Please rename it")

    # Make sure we have equidistance fidelities for all configs
    fidelity_values = table.index.get_level_values(fidelity_name)
    fidelity_counts = fidelity_values.value_counts()
    if not (fidelity_counts == fidelity_counts.iloc[0]).all():
        raise ValueError(f"{fidelity_name=} not  uniform. \n{fidelity_counts}")

    # We now have the following table
    #
    # config_id fidelity | **metric, **config_values
    #     0         0    |
    #               1    |
    #               2    |
    #     1         0    |
    #               1    |
    #               2    |
    #   ...

    # Here we get all the unique configs
    # config_id fidelity | **metric, **config_values
    #     0         0    |
    #     1         0    |
    #   ...
    config_id_table = table.groupby(level=config_name).agg("first")
    configs = {
        str(config_id): cls.Config.from_dict(
            {
                **row[config_keys].to_dict(),  # type: ignore
                "id": str(config_id),
            },
        )
        for config_id, row in config_id_table.iterrows()
    }

    fidelity_values = table.index.get_level_values(fidelity_name).unique()

    # We just assume equidistant fidelities
    sorted_fids = sorted(fidelity_values)
    start = sorted_fids[0]
    end = sorted_fids[-1]
    step = sorted_fids[1] - sorted_fids[0]

    # Create the configuration space
    if space is None:
        space = ConfigurationSpace(name, seed=seed)

    self.table = table
    self.configs = configs
    self.fidelity_name = fidelity_name
    self.config_name = config_name
    self.config_keys = sorted(config_keys)
    self.result_keys = sorted(result_keys)
    self.fidelity_range = (start, end, step)  # type: ignore

    super().__init__(
        name=name,
        seed=seed,
        space=space,
        prior=prior,
        perturb_prior=perturb_prior,
    )

table: pd.DataFrame attr #

The table of results used for this benchmark.

fidelity_name: str attr #

The name of the fidelity used in this benchmark.

config_name: str attr #

The column in the table that contains the config id. Will be set to the index.

config_keys: Sequence[str] attr #

The keys in the table that contain the config values.

result_keys: Sequence[str] attr #

The keys in the table that contain the results.

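On a hypothetical benchmark instance bench built from the table sketched earlier (the construction itself is shown under GenericTabularBenchmark further down), these attributes can be inspected directly; configs and fidelity_range are assigned in the constructor shown above:

bench.fidelity_name   # "epoch"
bench.config_name     # "config"
bench.config_keys     # sorted config columns, e.g. ["lr"]
bench.result_keys     # sorted result columns, e.g. ["error"]
bench.fidelity_range  # (start, end, step) inferred from the table, e.g. (1, 3, 1)
len(bench.configs)    # number of unique config ids, e.g. 3
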
def query(config, at=None, *, argmax=None, argmin=None) #

Submit a query and get a result.

Passing a raw config

If a mapping is passed (and not a Config object), we will attempt to look for id in the mapping to know which config to look up.

If this fails, we will try to match the config to one of the configs in the benchmark.

Prefer to pass the Config object directly if possible.

Override

This function overrides the default query() to allow for this config matching.

PARAMETER DESCRIPTION
config

The query to use

TYPE: CTabular | Mapping[str, Any] | str

at

The fidelity at which to query, defaults to None which means maximum

TYPE: F | None DEFAULT: None

argmax

Whether to return the argmax up to the fidelity given by at. Will be slower, as it has to get the entire trajectory. Uses the key from the Results.

TYPE: str | None DEFAULT: None

argmin

Whether to return the argmin up to the fidelity given by at. Will be slower, as it has to get the entire trajectory. Uses the key from the Results.

TYPE: str | None DEFAULT: None

RETURNS DESCRIPTION
R

The result of the query

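A minimal sketch of querying, assuming the hypothetical bench instance described above ("a" is a config id and "error" a result key from the table sketch):

result = bench.query("a")                      # by config id, at the maximum fidelity
result_mid = bench.query("a", at=2)            # at a specific fidelity
best = bench.query("a", at=3, argmin="error")  # lowest "error" seen up to fidelity 3

# A raw mapping also works; including an "id" entry avoids value matching.
result_raw = bench.query({"id": "a", "lr": 0.1})
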
Source code in src/mfpbench/tabular.py
def query(
    self,
    config: CTabular | Mapping[str, Any] | str,
    at: F | None = None,
    *,
    argmax: str | None = None,
    argmin: str | None = None,
) -> R:
    """Submit a query and get a result.

    !!! warning "Passing a raw config"

        If a mapping is passed (and **not** a [`Config`][mfpbench.Config] object),
        we will attempt to look for `id` in the mapping, to know which config to
        lookup.

        If this fails, we will try to match the config to one of the configs in
        the benchmark.

        Prefer to pass the [`Config`][mfpbench.Config] object directly if possible.

    ??? note "Override"

        This function overrides the default
        [`query()`][mfpbench.Benchmark.query] to allow for this
        config matching

    Args:
        config: The query to use
        at: The fidelity at which to query, defaults to None which means *maximum*
        argmax: Whether to return the argmax up to the point `at`. Will be slower as
            it has to get the entire trajectory. Uses the key from the Results.
        argmin: Whether to return the argmin up to the point `at`. Will be slower as
            it has to get the entire trajectory. Uses the key from the Results.

    Returns:
        The result of the query
    """
    _config = self._find_config(config)
    return super().query(
        _config,
        at=at,  # type: ignore
        argmax=argmax,
        argmin=argmin,
    )

def trajectory(config, *, frm=None, to=None, step=None) #

Submit a query and get the trajectory of results over a range of fidelities.

Passing a raw config

If a mapping is passed (and not a Config object), we will attempt to look for id in the mapping to know which config to look up.

If this fails, we will try to match the config to one of the configs in the benchmark.

Prefer to pass the Config object directly if possible.

Override

This function overrides the default trajectory() to allow for this config matching.

PARAMETER DESCRIPTION
config

The query to use

TYPE: CTabular | Mapping[str, Any] | str

frm

Start of the curve, should default to the start

TYPE: F | None DEFAULT: None

to

End of the curve, should default to the total

TYPE: F | None DEFAULT: None

step

Step size, defaults to cls.default_step

TYPE: F | None DEFAULT: None

RETURNS DESCRIPTION
list[R]

The results of the query, one per fidelity step.

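A brief sketch, again using the hypothetical bench and config id from above:

curve = bench.trajectory("a")                 # one result per fidelity step
partial = bench.trajectory("a", frm=1, to=2)  # restrict to a fidelity window
print(len(curve), len(partial))               # 3 and 2 for fidelities 1..3
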
Source code in src/mfpbench/tabular.py
@override
def trajectory(
    self,
    config: CTabular | Mapping[str, Any] | str,
    *,
    frm: F | None = None,
    to: F | None = None,
    step: F | None = None,
) -> list[R]:
    """Submit a query and get a result.

    !!! warning "Passing a raw config"

        If a mapping is passed (and **not** a [`Config`][mfpbench.Config] object),
        we will attempt to look for `id` in the mapping, to know which config to
        lookup.

        If this fails, we will try to match the config to one of the configs in
        the benchmark.

        Prefer to pass the [`Config`][mfpbench.Config] object directly if possible.

    ??? note "Override"

        This function overrides the default
        [`trajectory()`][mfpbench.Benchmark.trajectory] to allow for this
        config matching

    Args:
        config: The query to use
        frm: Start of the curve, should default to the start
        to: End of the curve, should default to the total
        step: Step size, defaults to ``cls.default_step``

    Returns:
        The result of the query
    """
    _config = self._find_config(config)
    return super().trajectory(_config, frm=frm, to=to, step=step)  # type: ignore

def sample(n=None, *, seed=None) #

Sample a random possible config.

PARAMETER DESCRIPTION
n

How many samples to take. None means just a single one, not in a list.

TYPE: int | None DEFAULT: None

seed

The seed to use for the sampling.

Seeding

This is different from any seed passed to the construction of the benchmark.

TYPE: int | RandomState | None DEFAULT: None

RETURNS DESCRIPTION
CTabular | list[CTabular]

A possible Config to use, or a list of them if n is given.

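For example, with the hypothetical bench from above:

config = bench.sample(seed=1)        # a single config, not wrapped in a list
configs = bench.sample(n=2, seed=1)  # a list of 2 distinct configs
# Asking for more configs than the table contains raises a ValueError.
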
Source code in src/mfpbench/tabular.py
@override
def sample(
    self,
    n: int | None = None,
    *,
    seed: int | np.random.RandomState | None = None,
) -> CTabular | list[CTabular]:
    """Sample a random possible config.

    Args:
        n: How many samples to take, None means just a single one, not in a list
        seed: The seed to use for the sampling.

            !!! note "Seeding"

                This is different than any seed passed to the construction
                of the benchmark.

    Returns:
        Get back a possible Config to use
    """
    _seed: int | None
    if isinstance(seed, np.random.RandomState):
        _seed = seed.random_integers(0, 2**32 - 1)
    else:
        _seed = seed

    rng = np.random.default_rng(seed=_seed)

    config_items: list[CTabular] = list(self.configs.values())
    n_configs = len(config_items)
    sample_amount = n if n is not None else 1

    if sample_amount > n_configs:
        raise ValueError(
            f"Can't sample {sample_amount} configs from {n_configs} configs",
        )

    indices = rng.choice(n_configs, size=sample_amount, replace=False)
    if n is None:
        first_index: int = indices[0]
        return config_items[first_index]

    return [config_items[i] for i in indices]

class GenericTabularBenchmark(table, *, name=None, fidelity_name, config_name, result_keys, config_keys, result_mapping=None, remove_constants=False, space=None, seed=None, prior=None, perturb_prior=None) #

Bases: TabularBenchmark[GenericTabularConfig, GenericTabularResult[GenericTabularConfig, F], F]

PARAMETER DESCRIPTION
table

The table to use for the benchmark

TYPE: DataFrame

name

The name of the benchmark. If None, will be set to unknown-{datetime.now().isoformat()}

TYPE: str | None DEFAULT: None

fidelity_name

The column in the table that contains the fidelity

TYPE: str

config_name

The column in the table that contains the config id

TYPE: str

result_keys

The columns in the table that contain the results

TYPE: Sequence[str]

config_keys

The columns in the table that contain the config values

TYPE: Sequence[str]

result_mapping

A mapping from the result keys to the table keys. If a string, will be used as the key in the table. If a callable, will be called with the table and the result will be used as the value.

TYPE: dict[str, str | Callable[[DataFrame], Any]] | None DEFAULT: None

remove_constants

Remove constant config columns from the data or not.

TYPE: bool DEFAULT: False

space

The configuration space to use for the benchmark. If None, will just be an empty space.

TYPE: ConfigurationSpace | None DEFAULT: None

seed

The seed to use.

TYPE: int | None DEFAULT: None

prior

The prior to use for the benchmark. If None, no prior is used. If a str, will check the local location first for a prior specific for this benchmark, otherwise assumes it to be a Path. If a Path, will load the prior from the path. If a Mapping, will be used directly.

TYPE: str | Path | GenericTabularConfig | Mapping[str, Any] | None DEFAULT: None

perturb_prior

If not None, will perturb the prior by this amount. For numericals, this is interpreted as the standard deviation of a normal distribution while for categoricals, this is interpreted as the probability of swapping the value for a random one.

TYPE: float | None DEFAULT: None

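Putting the pieces together, here is a hedged sketch that builds the bench instance referenced in the earlier examples from the hypothetical table sketched at the top of the page; the benchmark name and the derived "accuracy" column are made up for illustration:

from mfpbench.tabular import GenericTabularBenchmark

bench = GenericTabularBenchmark(
    table,                      # the DataFrame sketched earlier
    name="example-tabular",     # hypothetical; defaults to unknown-<timestamp>
    config_name="config",
    fidelity_name="epoch",
    config_keys=["lr"],
    result_keys=["error"],
    # Derive an extra result column: the callable receives the full table
    # and its return value becomes the new column, appended to result_keys.
    result_mapping={"accuracy": lambda df: 1 - df["error"]},
    seed=1,
)

result = bench.query("a", at=3)
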
Source code in src/mfpbench/tabular.py
def __init__(  # noqa: PLR0913
    self,
    table: pd.DataFrame,
    *,
    name: str | None = None,
    fidelity_name: str,
    config_name: str,
    result_keys: Sequence[str],
    config_keys: Sequence[str],
    result_mapping: (dict[str, str | Callable[[pd.DataFrame], Any]] | None) = None,
    remove_constants: bool = False,
    space: ConfigurationSpace | None = None,
    seed: int | None = None,
    prior: str | Path | GenericTabularConfig | Mapping[str, Any] | None = None,
    perturb_prior: float | None = None,
):
    """Initialize the benchmark.

    Args:
        table: The table to use for the benchmark
        name: The name of the benchmark. If None, will be set to
            `unknown-{datetime.now().isoformat()}`

        fidelity_name: The column in the table that contains the fidelity
        config_name: The column in the table that contains the config id
        result_keys: The columns in the table that contain the results
        config_keys: The columns in the table that contain the config values
        result_mapping: A mapping from the result keys to the table keys.
            If a string, will be used as the key in the table. If a callable,
            will be called with the table and the result will be used as the value.
        remove_constants: Remove constant config columns from the data or not.
        space: The configuration space to use for the benchmark. If None, will
            just be an empty space.
        seed: The seed to use.
        prior: The prior to use for the benchmark. If None, no prior is used.
            If a str, will check the local location first for a prior
            specific for this benchmark, otherwise assumes it to be a Path.
            If a Path, will load the prior from the path.
            If a Mapping, will be used directly.
        perturb_prior: If not None, will perturb the prior by this amount.
            For numericals, this is interpreted as the standard deviation of a
            normal distribution while for categoricals, this is interpreted
            as the probability of swapping the value for a random one.
    """
    if name is None:
        name = f"unknown-{datetime.now().isoformat()}"

    _result_mapping: dict = result_mapping if result_mapping is not None else {}

    # Remap the result keys so it works with the generic result types
    if _result_mapping is not None:
        for k, v in _result_mapping.items():
            if isinstance(v, str):
                if v not in table.columns:
                    raise ValueError(f"{v} not in columns\n{table.columns}")

                table[k] = table[v]
            elif callable(v):
                table[k] = v(table)
            else:
                raise ValueError(f"Unknown result mapping {v} for {k}")

    super().__init__(
        name=name,
        table=table,
        config_name=config_name,
        fidelity_name=fidelity_name,
        result_keys=[*result_keys, *_result_mapping.keys()],
        config_keys=config_keys,
        remove_constants=remove_constants,
        space=space,
        seed=seed,
        prior=prior,
        perturb_prior=perturb_prior,
    )