import configparser
import inspect
import logging
import os
import typing
import numpy as np
from ConfigSpace.configuration_space import Configuration
from smac.runhistory.runhistory import RunHistory, RunKey
from cave.utils.exceptions import NotApplicable


def get_timeout(rh, conf, cutoff):
    """Check for timeouts. If multiple runs for an instance/config-pair are
    available, the median is used (not the mean: no fractional timeouts).

    Parameters
    ----------
    rh: RunHistory
        runhistory to take runs from
    conf: Configuration
        configuration to use
    cutoff: int
        runtime-cutoff with which to determine timeouts

    Returns
    -------
    timeouts: Dict(str: bool)
        mapping instances to True/False, where True indicates that the run
        finished within the cutoff (i.e. did not time out)
    """
    # TODO Possibly inconsistent: the median over timeout-flags can indicate a
    # timeout even though the mean over costs does not reflect one. Possible?
    if not cutoff:
        return {}
    # Look up the configuration's id in the runhistory
    conf_id = rh.config_ids[conf]
    timeouts = {}
    runs = rh.get_runs_for_config(conf, only_max_observed_budget=True)
    for run in runs:
        # Aggregating over seeds, run = (inst, seed, budget)
        inst, seed, _budget = run
        # True if the run finished below the cutoff (i.e. did not time out)
        status = rh.data[RunKey(conf_id, inst, seed)].time < cutoff
        if inst in timeouts:
            timeouts[inst].append(status)
        else:
            timeouts[inst] = [status]
    # Use the median over seeds; flooring avoids fractional timeout-flags
    timeouts = {i: np.floor(np.median(timeouts[i])) for i in timeouts.keys()}
    return timeouts
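
# Usage sketch (illustrative, not executed): `rh` and `conf` are assumed to
# come from a finished SMAC run with a 300s runtime-cutoff; neither is defined
# in this module.
#
#     timeouts = get_timeout(rh, conf, cutoff=300)
#     finished = [i for i, ok in timeouts.items() if ok]
#     timed_out = [i for i, ok in timeouts.items() if not ok]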


def get_cost_dict_for_config(rh: RunHistory,
                             conf: Configuration,
                             par: int = 1,
                             cutoff: typing.Union[float, None] = None):
    """Aggregate the cost of a configuration on all evaluated instances over seeds.

    Parameters
    ----------
    rh: RunHistory
        runhistory with data
    conf: Configuration
        configuration to evaluate
    par: int
        par-factor with which to multiply timeouts
    cutoff: float
        cutoff of the scenario - used to penalize costs if par != 1

    Returns
    -------
    cost: dict(instance->cost)
        cost per instance, aggregated over seeds
    """
    instance_costs = rh.get_instance_costs_for_config(conf)
    if par != 1:
        if cutoff:
            # Costs at or above the cutoff count as timeouts and are penalized
            # with the par-factor
            instance_costs = {k: v if v < cutoff else v * par for k, v in instance_costs.items()}
        else:
            raise ValueError("To apply penalization of costs, a cutoff needs to be provided.")
    return instance_costs
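
# Usage sketch (illustrative): computing PAR10-style costs for a configuration
# evaluated with a 300s cutoff; `rh` and `conf` are assumed to exist.
#
#     par10 = get_cost_dict_for_config(rh, conf, par=10, cutoff=300.0)
#     mean_par10 = sum(par10.values()) / len(par10)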


def escape_parameter_name(p):
    """Escape a parameter-name. Necessary because:
    1. parameters called 'size' or 'origin' might exist in the configspace
    2. '-' is not allowed in bokeh's ColumnDataSource column-names"""
    return 'p_' + p.replace('-', '_')
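
# For example, a configspace-parameter named 'learning-rate' becomes a name
# that is safe to use as a bokeh column:
#
#     escape_parameter_name('learning-rate')  # -> 'p_learning_rate'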


def scenario_sanity_check(s, logger):
    """Check the scenario for the number of train- and test-instances, (duplicate) features and inconsistencies.
    Logs information and raises a ValueError if train-features are available, but test-features are not."""
    train, test, feat = [t for t in s.train_insts if t], [t for t in s.test_insts if t], list(s.feature_dict.keys())
    train_feat, test_feat = [t for t in feat if t in train], [t for t in feat if t in test]
    logger.debug("Instances: train=%d, test=%d, train-features=%d, test-features=%d",
                 len(train), len(test), len(train_feat), len(test_feat))
    if (train and train_feat) and (test and not test_feat):
        raise ValueError("Detected train- and test-instances, but only train-features. Either\n  (a) remove train-"
                         "features,\n  (b) add test-features or\n  (c) remove test-instances.")


def combine_runhistories(rhs, logger=None):
    """Combine a list of given runhistories, interleaving the runs to best approximate the execution order."""
    combi_rh = RunHistory()
    rh_to_runs = {rh: list(rh.data.items()) for rh in rhs}
    if logger:
        logger.debug("number of elements: " + str({k: len(v) for k, v in rh_to_runs.items()}))
    idx = 0
    # Round-robin over the runhistories, adding the idx-th run of each until
    # every runhistory is exhausted (and popped)
    while len(rh_to_runs) > 0:
        for rh in list(rh_to_runs.keys()):
            try:
                k, v = rh_to_runs[rh][idx]
                combi_rh.add(config=rh.ids_config[k.config_id],
                             cost=v.cost,
                             time=v.time,
                             status=v.status,
                             instance_id=k.instance_id,
                             # TODO budget option
                             seed=k.seed,
                             additional_info=v.additional_info)
            except IndexError:
                rh_to_runs.pop(rh)
        idx += 1
    if logger:
        logger.debug("number of elements in individual rhs: " + str({k: len(v) for k, v in rh_to_runs.items()}))
        logger.debug("number of elements in combined rh: " + str(len(combi_rh.data)))
    return combi_rh
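
# Usage sketch (illustrative): `rh1` and `rh2` would be RunHistory objects
# loaded from two parallel runs; they are not defined in this module.
#
#     combined = combine_runhistories([rh1, rh2], logger=logging.getLogger(__name__))
#     print(len(combined.data))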


def combine_trajectories(trajs, logger=None):
    """Combine trajectories. Trajectories are expected as an iterable of lists, each sorted by increasing time.
    Each trajectory entry is expected to be dict-like, providing at least the
    keys 'wallclock_time' and 'cost' (e.g. as read from the aclib trajectory format).

    Parameters
    ----------
    trajs: List[List[dict]]
        trajectories to be combined

    Returns
    -------
    combined_traj: List[dict]
        combined trajectory
    """
    # Flatten the list of trajectories and sort all entries by wallclock-time
    flattened_list = [a for b in trajs for a in b]
    flattened_list.sort(key=lambda traj_entry: traj_entry['wallclock_time'])
    if logger:
        logger.debug("{} trajectories combined to one with {} elements".format(len(trajs), len(flattened_list)))
    if not flattened_list:
        return []
    # Now add entries one by one in order of time, keeping only those that
    # improve on the best performance seen so far
    combined_traj = [flattened_list[0]]
    for entry in flattened_list:
        if entry['cost'] < combined_traj[-1]['cost']:
            combined_traj.append(entry)
    return combined_traj
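
# Usage sketch (illustrative): `traj1` and `traj2` would be trajectories of
# two runs (lists of dicts with at least 'wallclock_time' and 'cost' keys):
#
#     combined = combine_trajectories([traj1, traj2])
#     # costs are strictly decreasing along the combined trajectory
#     assert all(a['cost'] > b['cost'] for a, b in zip(combined, combined[1:]))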


class MissingInstancesError(Exception):
    """Exception indicating that instances are missing."""
    pass


def get_config_origin(c):
    """Return the appropriate configuration origin.

    Parameters
    ----------
    c: Configuration
        configuration to be examined

    Returns
    -------
    origin: str
        origin of configuration (e.g. "Local", "Random", etc.)
    """
    if not c.origin:
        origin = "Unknown"
    elif c.origin.startswith("Local") or c.origin == 'Model based pick' or "sorted" in c.origin:
        origin = "Acquisition Function"
    elif c.origin.startswith("Random"):
        origin = "Random"
    else:
        logging.getLogger("cave.utils.helpers").debug("Cannot interpret origin: %s", c.origin)
        origin = "Unknown"
    return origin
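
# For example, a configuration proposed by SMAC's local search typically
# carries an origin like "Local Search" and is mapped to "Acquisition
# Function", while "Random Search" maps to "Random".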


def check_for_features(scenario):
    """Check whether the scenario provides instance features, raise NotApplicable otherwise."""
    features = scenario.feature_dict
    # Filter the features down to the train- and test-instances
    train = scenario.train_insts
    test = scenario.test_insts
    train_feats = {k: v for k, v in features.items() if k in train}
    test_feats = {k: v for k, v in features.items() if k in test}
    if not (train_feats or test_feats):
        raise NotApplicable("Could not detect any instance features.")
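
# Usage sketch (illustrative): callers catch NotApplicable to skip analyses
# that require instance features:
#
#     try:
#         check_for_features(scen)
#     except NotApplicable:
#         pass  # skip feature-based analysis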


def load_default_options(options=None, file_format=None):
    """Load the default analysis options, optionally updated by user-provided options."""
    # Load the default configuration file shipped with this package
    own_folder = os.path.realpath(os.path.abspath(os.path.split(inspect.getfile(inspect.currentframe()))[0]))
    default_options = configparser.ConfigParser()
    default_options.read(os.path.join(own_folder, 'options/default_analysis_options.ini'))
    if options is not None:
        if isinstance(options, str):
            # Interpret strings as a path to an additional options-file
            default_options.read(options)
        else:
            default_options.read_dict(options)
    return default_options
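
# Usage sketch (illustrative): overriding defaults with a dict. The section
# and option names here are hypothetical; they depend on the contents of
# 'options/default_analysis_options.ini'.
#
#     opts = load_default_options({'ablation': {'run': 'False'}})
#     print({s: dict(opts[s]) for s in opts.sections()})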


def get_folder_basenames(folders):
    """Shorten folder-strings as much as possible (always keeping the basename).
    ["foo/bar/run_1", "foo/bar/run_2/"] will be ["run_1", "run_2"]
    ["foo/run_1/bar/", "foo/run_2/bar"] will be ["run_1/bar", "run_2/bar"]
    """
    throw, keep = folders[:], ['' for _ in folders]
    max_parts = max([len(f.split('/')) for f in folders])
    # Prepend path-components from the right until the kept parts are unique
    for _ in range(max_parts):
        for idx in range(len(folders)):
            throw[idx], new = os.path.split(throw[idx].rstrip('/'))
            keep[idx] = os.path.join(new, keep[idx]).rstrip('/')
        if len(set(keep)) == len(set(folders)):
            break
    return keep
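
# For example (taken from the docstring above):
#
#     get_folder_basenames(["foo/run_1/bar/", "foo/run_2/bar"])
#     # -> ["run_1/bar", "run_2/bar"]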