# Source code for cave.reader.conversion.hpbandster2smac

import itertools
import os
import tempfile
from collections import OrderedDict

import numpy as np
from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter
from ConfigSpace.read_and_write import json as pcs_json
from ConfigSpace.read_and_write import pcs_new
from smac.runhistory.runhistory import RunHistory
from smac.scenario.scenario import Scenario
from smac.stats.stats import Stats
from smac.tae import StatusType
from smac.utils.io.traj_logging import TrajLogger

from cave.reader.conversion.base_converter import BaseConverter
from cave.utils.helpers import get_folder_basenames


class HpBandSter2SMAC(BaseConverter):
    """
    Converts data from `hpbandster <https://github.com/automl/HpBandSter>`_ at least up to the 1.0 release into SMAC3 data.
    """

    def convert(self, folders, ta_exec_dirs=None, output_dir=None, converted_dest='converted_input_data'):
        """
        Convert every hpbandster result folder in ``folders`` into SMAC3-like output on disk.

        Parameters
        ----------
        folders: list[str]
            paths to the hpbandster result folders (the parallel runs). The configspace is loaded from the first
            folder only and reused for all of them.
        ta_exec_dirs: list[str], optional
            target-algorithm execution directories. Defaults to ``['.']``; if the length does not match ``folders``,
            the first entry is repeated for every folder. In this method the values are only used for logging.
        output_dir: str, optional
            base directory for the converted data. A fresh temporary directory is created if not given.
        converted_dest: str
            name of the sub-directory of ``output_dir`` that receives one folder per converted run

        Returns
        -------
        result: OrderedDict
            maps each original folder to the dictionary returned by :meth:`hpbandster2smac`

        Raises
        ------
        ImportError
            if hpbandster is not installed
        """
        try:
            from hpbandster.core.result import logged_results_to_HBS_result
        except ImportError as e:
            raise ImportError("To analyze BOHB-data, please install hpbandster (e.g. `pip install hpbandster`)") from e

        self.logger.debug("Converting BOHB-data to SMAC3-data. Called with: folders=%s, ta_exec_dirs=%s, output_dir=%s,"
                          " converted_dest=%s", str(folders), str(ta_exec_dirs), str(output_dir), str(converted_dest))

        # Using temporary files for the intermediate smac-result-like format if no output_dir specified
        if not output_dir:
            output_dir = tempfile.mkdtemp()
            self.logger.debug("Temporary directory for intermediate SMAC3-results: %s", output_dir)
        if ta_exec_dirs is None or len(ta_exec_dirs) == 0:
            ta_exec_dirs = ['.']
        if len(ta_exec_dirs) != len(folders):
            # Broadcast the first execution directory so that the zip below covers every folder
            ta_exec_dirs = [ta_exec_dirs[0] for _ in folders]

        # Get a list with alternative interpretations of the configspace-file
        # (if it's a .pcs-file, for .json-files it's a length-one-list)
        cs_interpretations = self.load_configspace(folders[0])
        self.logger.debug("Loading with %d configspace alternative options...", len(cs_interpretations))
        self.logger.info("Assuming BOHB treats target algorithms as deterministic (and does not re-evaluate)")

        #####################
        # Actual conversion #
        #####################
        folder_basenames = get_folder_basenames(folders)
        result = OrderedDict()
        for f, f_base, ta_exec_dir in zip(folders, folder_basenames, ta_exec_dirs):  # Those are the parallel runs
            converted_folder_path = os.path.join(output_dir, converted_dest, f_base)
            self.logger.debug("Processing folder=%s, f_base=%s, ta_exec_dir=%s. Saving to %s.",
                              f, f_base, ta_exec_dir, converted_folder_path)
            if not os.path.exists(converted_folder_path):
                self.logger.debug("%s doesn't exist. Creating...", converted_folder_path)
                # exist_ok guards against the directory appearing between the check and the creation
                os.makedirs(converted_folder_path, exist_ok=True)

            # Original hpbandster-formatted result-object
            hp_result = logged_results_to_HBS_result(f)
            result[f] = self.hpbandster2smac(f, hp_result, cs_interpretations, converted_folder_path)

        return result

    def load_configspace(self, folder):
        """
        Will try to load the configspace. cs_options will be a list containing all possible
        combinations of interpretation for Categoricals. If this issue will be fixed, we can drop this procedure.

        ``configspace.json`` takes precedence over ``configspace.pcs`` if both exist in ``folder``.

        Parameters
        ----------
        folder: str
            path to folder in which to look for configspace

        Returns
        -------
        cs_options: list[ConfigurationSpace]
            list with possible interpretations for config-space-file. Only contains multiple items if file-format is pcs.

        Raises
        ------
        ValueError
            if neither ``configspace.json`` nor ``configspace.pcs`` exists in ``folder``
        """
        cs_options = []
        cs_fn_json = os.path.join(folder, 'configspace.json')
        cs_fn_pcs = os.path.join(folder, 'configspace.pcs')

        if os.path.exists(cs_fn_json):
            with open(cs_fn_json, 'r') as fh:
                cs_options = [pcs_json.read(fh.read())]
            self.logger.debug("Detected and loaded \"%s\". No alternative interpretations necessary", cs_fn_json)
        elif os.path.exists(cs_fn_pcs):
            with open(cs_fn_pcs, 'r') as fh:
                cs = pcs_new.read(fh.readlines())
            # Create alternative interpretations
            categoricals = [hp for hp in cs.get_hyperparameters() if isinstance(hp, CategoricalHyperparameter)]
            non_categoricals = [hp for hp in cs.get_hyperparameters() if not isinstance(hp, CategoricalHyperparameter)]

            def _get_interpretations(choices):
                """ Generate different interpretations for critical categorical hyperparameters that are not seamlessly
                supported by pcs-format."""
                interpretations = []
                if set(choices) == {"True", "False"}:
                    interpretations.append([True, False])
                if all(c.isdigit() for c in choices):
                    interpretations.append([int(c) for c in choices])
                # The unmodified (string) choices are always the last fallback
                interpretations.append(choices)
                return interpretations

            choices_per_cat = [_get_interpretations(hp.choices) for hp in categoricals]
            self.logger.debug("Interpretations per categorical: %s", choices_per_cat)
            # One configspace per element of the cartesian product over the per-categorical interpretations
            for combi in itertools.product(*choices_per_cat):
                bcs = ConfigurationSpace()
                for hp in non_categoricals:
                    bcs.add_hyperparameter(hp)
                for name, choices in zip([hp.name for hp in categoricals], combi):
                    bcs.add_hyperparameter(CategoricalHyperparameter(name, choices))
                bcs.add_conditions(cs.get_conditions())
                cs_options.append(bcs)

            self.logger.debug("Sampled %d interpretations of \"%s\"", len(cs_options), cs_fn_pcs)
        else:
            raise ValueError("Missing pcs-file at '%s.[pcs|json]'!" % os.path.join(folder, 'configspace'))
        return cs_options

    def _get_config(self, config_id, id2config, cs):
        """
        Instantiate the configuration stored under ``config_id`` and annotate its origin if available.

        Parameters
        ----------
        config_id: hashable
            key into ``id2config``
        id2config: dict
            mapping as returned by ``result.get_id2config_mapping()``; the entry's ``'config'`` value is passed to
            ``Configuration`` and ``['config_info']['model_based_pick']`` (if present) determines the origin
        cs: ConfigurationSpace
            configuration space used to build the configuration

        Returns
        -------
        config: Configuration
            the configuration; ``config.origin`` is only set if the origin information exists
        """
        config = Configuration(cs, id2config[config_id]['config'])
        try:
            model_based_pick = id2config[config_id]['config_info']['model_based_pick']
            config.origin = 'Model based pick' if model_based_pick else 'Random'
        except KeyError:
            # Missing origin information is not fatal, the configuration is simply returned without it
            self.logger.debug("No origin for config (id %s)!", str(config_id), exc_info=True)
        return config

    def hpbandster2smac(self, folder, result, cs_options, output_dir: str):
        """Reading hpbandster-result-object and creating RunHistory and trajectory...

        Besides returning the converted objects, this writes the scenario, ``configspace.json`` and
        ``runhistory.json`` into ``output_dir``.

        Parameters
        ----------
        folder: str (path)
            original folder
        result: hpbandster.core.result.Result
            bohb's result-object
        cs_options: list[ConfigurationSpace]
            the configuration spaces. in the best case it's a single element, but for pcs-format we need to guess
            through a list of possible configspaces. The passed list itself is not modified.
        output_dir: str
            the output-dir to save the smac-runs to

        Returns
        -------
        converted: dict{
                'new_path' : path_to_converted_input,
                'hpbandster_result' : result_in_hpbandster_format,
                'config_space' : config_space,
                'runhistory' : runhistory,
                'validated_runhistory' : validated_runhistory (always None here),
                'scenario' : scenario,
                'trajectory' : trajectory,
                }

        Raises
        ------
        ValueError
            if none of the configspaces in ``cs_options`` can represent a configuration found in the result
        """
        self.logger.debug("Budgets for '%s': %s", folder, str(result.HB_config['budgets']))
        ##########################
        # 1. Create runhistory   #
        ##########################
        id2config_mapping = result.get_id2config_mapping()
        skipped = {'None' : 0, 'NaN' : 0}
        rh = RunHistory()
        for run in result.get_all_runs():
            # Load config, dropping configspace interpretations from the front until one of them works
            config = None
            while config is None:
                if len(cs_options) == 0:
                    self.logger.debug("None of the alternatives worked...")
                    raise ValueError("Your configspace seems to be corrupt. If you use floats (or mix up ints, bools "
                                     "and strings) as categoricals, please consider using the .json-format, as the "
                                     ".pcs-format cannot recover the type of categoricals. Otherwise please report "
                                     "this to https://github.com/automl/CAVE/issues (and attach the debug.log)")
                try:
                    config = self._get_config(run.config_id, id2config_mapping, cs_options[0])
                except ValueError:
                    # Pass the count as a lazy logging argument: `"..." % len(x) - 1` would evaluate as
                    # `("..." % len(x)) - 1` and raise a TypeError, aborting the fallback.
                    self.logger.debug("Loading config failed. Trying %d alternatives", len(cs_options) - 1,
                                      exc_info=True)
                    cs_options = cs_options[1:]  # remove the failing cs-version (rebinds, caller's list untouched)

            # Filter corrupted loss-values (ignore them)
            if run.loss is None:
                skipped['None'] += 1
                continue
            if np.isnan(run.loss):
                skipped['NaN'] += 1
                continue

            rh.add(config=config,
                   cost=run.loss,
                   time=run.time_stamps['finished'] - run.time_stamps['started'],
                   status=StatusType.SUCCESS,
                   budget=run.budget,
                   seed=0,
                   additional_info={'info' : run.info, 'timestamps': run.time_stamps})

        self.logger.debug("Skipped %d None- and %d NaN-loss-values in BOHB-result", skipped['None'], skipped['NaN'])

        ##########################
        # 2. Create all else     #
        ##########################
        # cs_options[0] is now the first interpretation that could load every configuration seen above
        scenario = Scenario({'run_obj': 'quality',
                             'cs': cs_options[0],
                             'output_dir': output_dir,
                             'deterministic': True,  # At the time of writing, BOHB is always treating ta's as deterministic
                            })
        scenario.output_dir_for_this_run = output_dir
        scenario.write()

        with open(os.path.join(output_dir, 'configspace.json'), 'w') as fh:
            fh.write(pcs_json.write(cs_options[0]))

        rh.save_json(fn=os.path.join(output_dir, 'runhistory.json'))

        trajectory = self.get_trajectory(result, output_dir, scenario, rh)

        return {'new_path': output_dir,
                'hpbandster_result': result,
                'config_space': cs_options[0],
                'runhistory': rh,
                'validated_runhistory': None,
                'scenario': scenario,
                'trajectory': trajectory,
                }

    def get_trajectory(self, result, output_path, scenario, rh):
        """
        Build a SMAC-like trajectory from hpbandster's incumbent trajectory (``result.get_incumbent_trajectory()``).

        Parameters
        ----------
        result: hpbandster.core.result.Result
            bohb's result-object
        output_path: str
            directory handed to the ``TrajLogger``; a temporary directory is created if empty or None
        scenario: Scenario
            scenario providing the configuration space (``scenario.cs``) and used to create the ``Stats`` object
        rh: RunHistory
            runhistory used to translate configurations into runhistory-ids; trajectory entries whose
            configuration is not part of it are skipped

        Returns
        -------
        trajectory: list[dict]
            the ``trajectory`` attribute of the ``TrajLogger``, one entry per incumbent, ordered by finishing time
        """
        cs = scenario.cs

        if not output_path:
            output_path = tempfile.mkdtemp()

        traj_logger = TrajLogger(output_path, Stats(scenario))
        total_traj_dict = []
        traj_dict = result.get_incumbent_trajectory()

        id2config_mapping = result.get_id2config_mapping()

        failed_entries = []
        for config_id, time, budget, loss in zip(traj_dict['config_ids'],
                                                 traj_dict['times_finished'],
                                                 traj_dict['budgets'],
                                                 traj_dict['losses']):
            incumbent = self._get_config(config_id, id2config_mapping, cs)
            try:
                incumbent_id = rh.config_ids[incumbent]
            except KeyError as err:
                self.logger.debug(err)
                failed_entries.append((config_id, incumbent))
                # %s instead of %d: the id is not guaranteed to be an int (hpbandster uses tuples as config ids),
                # and %d would turn this message into a logging format error
                self.logger.debug("Could not load configuration id %s (%s)", str(config_id), str(incumbent))
                continue
            total_traj_dict.append({'config_id' : incumbent_id,
                                    'time_finished' : time,
                                    'budget' : budget,
                                    'loss' : loss})
        if len(failed_entries) > 0:
            self.logger.warning("Failed to load %d (of %d total) entries from trajectory",
                                len(failed_entries), len(failed_entries) + len(total_traj_dict))

        # NOTE(review): last_loss is never updated, so the check below never skips an entry. Left as is, because
        # updating it would change which entries end up in the trajectory - confirm the intended behaviour.
        last_loss = np.inf
        for element in sorted(total_traj_dict, key=lambda x: x['time_finished']):
            incumbent_id = element["config_id"]
            incumbent = rh.ids_config[incumbent_id]
            time = element["time_finished"]
            loss = element["loss"]
            budget = element["budget"]

            if loss > last_loss:
                continue

            # -1 serves as placeholder, these values are not derived from the hpbandster result here
            ta_runs = -1
            ta_time_used = -1
            wallclock_time = time
            train_perf = loss
            # add to trajectory, imitate `add_entry` method of SMAC's traj_logger
            traj_logger.trajectory.append({
                'cpu_time': ta_time_used,
                'total_cpu_time': None,
                "wallclock_time": wallclock_time,
                "evaluations": ta_runs,
                "cost": train_perf,
                "incumbent": incumbent,
                "budget": budget
            })
            traj_logger._add_in_alljson_format(train_perf,
                                               incumbent_id,
                                               incumbent,
                                               budget,
                                               ta_time_used,
                                               wallclock_time,
                                               )
        return traj_logger.trajectory