import os
from collections import OrderedDict
import numpy as np
from ConfigSpace.hyperparameters import (NumericalHyperparameter, CategoricalHyperparameter, OrdinalHyperparameter,
                                         Constant)
from pandas import DataFrame
from cave.analyzer.base_analyzer import BaseAnalyzer
from cave.utils.helpers import get_config_origin
class OverviewTable(BaseAnalyzer):
"""
    Meta data, i.e. the number of instances and parameters as well as the configuration budget. If multiple
    configurator runs are compared, the statistics apply to the best run.
"""
def __init__(self, runscontainer):
super().__init__(runscontainer)
self.output_dir = runscontainer.output_dir
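        # Run the analysis right away; the resulting tables are collected in self.result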
self.run()
    def get_name(self):
return "Meta Data"
    def run(self):
        """ Generate the meta-data tables.

        Fills ``self.result`` with one entry per table, each mapping a section name to a dict with an HTML
        ``table`` and a ``tooltip`` string.
        """
scenario = self.runscontainer.scenario
# General infos
general_dict = self._general_dict(scenario)
html_table_general = DataFrame(data=OrderedDict([('General', general_dict)]))
html_table_general = html_table_general.reindex(list(general_dict.keys()))
html_table_general = html_table_general.to_html(escape=False, header=False, justify='left')
self.result["General"] = {"table": html_table_general,
"tooltip": "General information about the optimization scenario."}
# Run-specific / budget specific infos
for mode in ['parallel', 'budget']:
runspec_dict = self._runspec_dict(identify=mode)
if not runspec_dict:
continue
order_spec = list(list(runspec_dict.values())[0].keys()) # Get keys of any sub-dict for order
html_table_specific = DataFrame(runspec_dict)
html_table_specific = html_table_specific.reindex(order_spec)
html_table_specific = html_table_specific.to_html(escape=False, justify='left')
if mode == 'parallel':
self.result["Parallel Runs"] = {"table": html_table_specific,
"tooltip": "Information to individual parallel runs."}
if mode == 'budget':
self.result["Budgets"] = {"table": html_table_specific,
"tooltip": "Statistics related to the budgets used in this optimization."}
# ConfigSpace in tabular form
cs_dict = self._configspace(scenario.cs)
cs_table = DataFrame(data=cs_dict)
html_table_cs = cs_table.to_html(escape=False, justify='left', index=False)
self.result["Configuration Space"] = {"table": html_table_cs,
"tooltip": "The parameter configuration space. "
"(See github.com/automl/ConfigSpace)"}
return self.result
    def _general_dict(self, scenario):
""" Generate the meta-information that holds for all runs (scenario info etc)
Parameters
----------
scenario: smac.Scenario
scenario file to get information from
"""
        # 'general' stores information that holds for all runs; 'runspec' holds information on a per-run basis
general = OrderedDict()
if len(self.runscontainer.get_budgets()) > 1:
general['# budgets'] = len(self.runscontainer.get_budgets())
if len(self.runscontainer.get_folders()) > 1:
general['# parallel runs'] = len(self.runscontainer.get_folders())
# Scenario related
general['# parameters'] = len(scenario.cs.get_hyperparameters())
general['Deterministic target algorithm'] = scenario.deterministic
general['Optimized run objective'] = scenario.run_obj
if scenario.cutoff or scenario.run_obj == 'runtime':
general['Cutoff'] = scenario.cutoff
        if any(str(lim) != 'inf' for lim in [scenario.wallclock_limit, scenario.ta_run_limit,
                                             scenario.algo_runs_timelimit]):
general['Walltime budget'] = scenario.wallclock_limit
general['Runcount budget'] = scenario.ta_run_limit
general['CPU budget'] = scenario.algo_runs_timelimit
# Instances
num_train, num_test = [len([i for i in insts if i]) for insts in [scenario.train_insts, scenario.test_insts]]
if num_train > 0 or num_test > 0:
general['# instances (train/test)'] = "{} / {}".format(num_train, num_test)
# Features
num_feats = scenario.n_features if scenario.feature_dict else 0
num_dup_feats = 0
if scenario.feature_dict:
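            # Count instances whose feature vectors duplicate those of another instance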
dup_feats = DataFrame(scenario.feature_array)
num_dup_feats = len(dup_feats[dup_feats.duplicated()]) # only contains train instances
if num_feats > 0:
general['# features (duplicates)'] = "{} ({})".format(num_feats, num_dup_feats)
        general['----------'] = '----------'  # visual separator row in the rendered table
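        # Statistics of the run aggregated over all folders and budgets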
combined_run = self.runscontainer.get_aggregated(False, False)[0]
combined_stats = self._stats_for_run(combined_run.original_runhistory,
combined_run.scenario,
combined_run.incumbent)
for k, v in combined_stats.items():
general[k] = v
return general
    def _runspec_dict(self, identify='parallel'):
"""
identify-keyword specifies whether to use path or budget for name
"""
        if identify not in ['parallel', 'budget']:
            raise ValueError("identify must be either 'parallel' or 'budget', not '%s'." % identify)
        budgets = self.runscontainer.get_budgets()
        if identify == 'budget' and (not budgets or (len(budgets) == 1 and budgets[0] == 0.0)):
            # No (or only a trivial) budget available, so there is nothing budget-specific to report
            return False
        if identify == 'parallel' and len(self.runscontainer.get_folders()) <= 1:
            return False
runspec = OrderedDict()
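        # Aggregate runs, keeping only the folder-split ('parallel') or only the budget-split ('budget')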
runs = self.runscontainer.get_aggregated(keep_folders=identify=='parallel',
keep_budgets=identify=='budget')
for idx, run in enumerate(runs):
if identify == 'budget' and len(set(run.reduced_to_budgets)) != 1:
raise ValueError("Runs processed here should only have a single budget specified (%s)." %
run.reduced_to_budgets)
self.logger.debug("Path to folder for run no. {}: {}".format(idx, str(run.path_to_folder)))
name = os.path.basename(run.path_to_folder) if identify == 'parallel' else str(run.reduced_to_budgets[0])
runspec[name] = self._stats_for_run(run.original_runhistory,
run.scenario,
run.incumbent)
return runspec
    def _stats_for_run(self, rh, scenario, incumbent):
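        """ Compile statistics (runtimes, evaluation counts, changed parameters, configuration origins) for a single
        run from its runhistory.

        Parameters
        ----------
        rh: RunHistory
            runhistory containing the evaluated configurations of this run
        scenario: smac.Scenario
            scenario the run was performed on
        incumbent: Configuration
            final incumbent of this run

        Returns
        -------
        result: OrderedDict
            maps a human-readable statistic name to its (formatted) value
        """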
result = OrderedDict()
all_configs = rh.get_all_configs()
default = scenario.cs.get_default_configuration()
# Runtime statistics
all_ta_runtimes = [run_value.time for run_value in rh.data.values()]
result['Total time spent evaluating configurations'] = "{:.2f} sec".format(np.sum(all_ta_runtimes))
result['Average time per configuration (mean / std)'] = '{:5.2f} sec (± {:5.2f})'.format(np.mean(all_ta_runtimes),
np.std(all_ta_runtimes))
# Number of evaluations
ta_evals = [len(rh.get_runs_for_config(c, only_max_observed_budget=True)) for c in all_configs]
result['# evaluated configurations'] = len(all_configs)
if not scenario.deterministic:
result['# evaluations in total'] = np.sum(ta_evals)
result['# evaluations for default/incumbent'] = "{}/{}".format(len(rh.get_runs_for_config(default, only_max_observed_budget=True)),
len(rh.get_runs_for_config(incumbent, only_max_observed_budget=True)))
result['# runs per configuration (min, mean and max)'] = "{}/{:.2f}/{}".format(
np.min(ta_evals), np.mean(ta_evals), np.max(ta_evals))
# Info about configurations
num_changed_params = len([p for p in scenario.cs.get_hyperparameter_names() if default[p] != incumbent[p]])
result['# changed parameters (default to incumbent)'] = num_changed_params
# Origins
origins = [get_config_origin(c) for c in all_configs]
        origins = {o: origins.count(o) for o in set(origins)}
        if list(origins.keys()) != ["Unknown"]:
            result['Configuration origins'] = ", ".join(['{} : {}'.format(o, n) for o, n in origins.items()])
return result
    def _configspace(self, cs):
""" Return configspace in table-format """
d = OrderedDict([("Parameter", []),
("Type", []),
("Range/Choices", []),
("Default", [])]
)
for hp in cs.get_hyperparameters():
d["Parameter"].append(hp.name)
d["Type"].append(type(hp).__name__)
if isinstance(hp, NumericalHyperparameter):
d["Range/Choices"].append("[{}, {}]{}".format(hp.lower, hp.upper, ' (log)' if hp.log else ''))
elif isinstance(hp, CategoricalHyperparameter):
d["Range/Choices"].append("{}".format(hp.choices))
elif isinstance(hp, OrdinalHyperparameter):
d["Range/Choices"].append("{}".format(hp.sequence))
elif isinstance(hp, Constant):
d["Range/Choices"].append("{}".format(hp.default_value))
else:
d["Range/Choices"].append("?")
d["Default"].append(hp.default_value)
return d