# Source code for cave.analyzer.configurator.parallel_coordinates

from collections import OrderedDict
from typing import Union, Dict, List

import numpy as np
import pandas as pd
from ConfigSpace.configuration_space import ConfigurationSpace, Configuration
from ConfigSpace.hyperparameters import NumericalHyperparameter, CategoricalHyperparameter
from bokeh.embed import components
from bokeh.layouts import column
from bokeh.models import Div
from bokeh.palettes import Viridis256
from smac.runhistory.runhistory import RunHistory
from smac.scenario.scenario import Scenario
from smac.utils.validate import Validator

from cave.analyzer.base_analyzer import BaseAnalyzer
from cave.plot.parallel_plot.parallel_plot import parallel_plot
from cave.utils.hpbandster_helpers import format_budgets
from cave.utils.timing import timing

# Module metadata: authorship, copyright, license and maintainer contact.
__author__ = "Joshua Marben"
__copyright__ = "Copyright 2017, ML4AAD"
__license__ = "3-clause BSD"
__maintainer__ = "Joshua Marben"
__email__ = "joshua.marben@neptun.uni-freiburg.de"


class ParallelCoordinates(BaseAnalyzer):
    """
    Previously used by Golovin et al.  to study the frequency of chosen parameter settings in
    black-box-optimization.  Each line corresponds to one configuration in the runhistory and shows the parameter
    settings and the corresponding (estimated) average cost. To handle large configuration spaces with hundreds of
    parameters, the (at most) 10 most important parameters based on a fANOVA parameter importance analysis are
    plotted.  To emphasize better configurations, the performance is encoded in the color of each line, ranging from
    blue to red. These plots provide insights into whether the configurator focused on specific parameter values and
    how these correlate to their costs.

    NOTE: the given RunHistory should contain only optimization and no
    validation to analyze the explored parameter-space.
    """
    # NOTE: the class docstring above is also emitted at runtime as the tooltip text (see get_html),
    # which is why it is kept verbatim.

    def __init__(self,
                 runscontainer,
                 pc_sort_by: str = None,
                 params: Union[int, List[str]] = None,
                 n_configs: int = None,
                 max_runs_epm: int = None,
                 ):
        """This function prepares the data from a SMAC-related format (using runhistories and parameters) to a more
        general format (using a dataframe). The resulting dataframe is passed to the parallel_coordinates-routine

        Parameters
        ----------
        runscontainer:
            container of the analyzed runs; forwarded to the BaseAnalyzer constructor
        params: Union[int, List[str]]
            either directly the parameters to be displayed or the number of parameters (will try to determine the
            most important ones)
        n_configs: int
            number of configs to be plotted
        pc_sort_by: str
            defines the pimp-method by which to choose the plotted parameters
        max_runs_epm: int
            maximum number of runs to train the epm with. this should prevent MemoryErrors
        """
        super().__init__(runscontainer,
                         pc_sort_by=pc_sort_by,
                         params=params,
                         n_configs=n_configs,
                         max_runs_epm=max_runs_epm)

        # The effective values are read back from self.options, which is presumably populated by
        # BaseAnalyzer from the keyword arguments above - TODO confirm.
        # NOTE(review): the signature allows a list of names for `params`, but it is read with getint here.
        self.params = self.options.getint('params')
        self.n_configs = self.options.getint('n_configs')
        self.max_runs_epm = self.options.getint('max_runs_epm')
        self.pc_sort_by = self.options['pc_sort_by']

        self.data = None  # save data here so bokeh-plots can be recreated fast.

    def get_name(self):
        """Return the display name of this analyzer."""
        return "Parallel Coordinates"

    def plot_bokeh(self, return_components=False):
        """Create one parallel-coordinates plot per budget.

        Parameters
        ----------
        return_components: bool
            if True, every plot is converted with bokeh's `components` and stored as ``{'bokeh': (script, div)}``
            in self.result; otherwise the bokeh layout objects are collected in a fresh dict

        Returns
        -------
        result: dict or single entry
            mapping from formatted budget to plot (or component-dict); if the mapping has exactly one entry,
            that entry is returned directly
        """
        if not self.data:
            self._preprocess()
        result = self.result if return_components else {}
        for budget, dataframe in self.data.items():
            plot = self._plot_budget(dataframe)
            if return_components:
                result[budget] = {'bokeh': components(plot)}
            else:
                result[budget] = plot

        # If only one budget, we don't need an extra tab...
        if len(result) == 1:
            result = next(iter(result.values()))

        return result

    def _preprocess(self):
        """Build one dataframe per budget and cache them in self.data (keyed by the formatted budget).

        Raises
        ------
        ValueError
            if self.data already holds data
        """
        if self.data:
            raise ValueError("Data seems to be already initialized, undefined behaviour.")
        self.data = OrderedDict()

        budgets = self.runscontainer.get_budgets()
        formatted_budgets = format_budgets(budgets)
        aggregated_runs = self.runscontainer.get_aggregated(keep_budgets=True, keep_folders=False)
        for budget, run in zip(budgets, aggregated_runs):
            self.data[formatted_budgets[budget]] = self._preprocess_budget(
                    original_rh=run.original_runhistory,
                    validated_rh=run.validated_runhistory,
                    validator=run.validator,
                    scenario=run.scenario,
                    default=run.default, incumbent=run.incumbent,
                    param_imp=run.share_information["parameter_importance"],
                    output_dir=run.output_dir,
                    cs=run.scenario.cs,
                    runtime=(run.scenario.run_obj == 'runtime'))

    def _preprocess_budget(self,
                           original_rh: RunHistory,
                           validated_rh: RunHistory,
                           validator: Validator,
                           scenario: Scenario,
                           default: Configuration,
                           incumbent: Configuration,
                           param_imp: Union[None, Dict[str, float]],
                           output_dir: str,
                           cs: ConfigurationSpace,
                           runtime: bool = False,
                           ):
        """
        Preprocess the data of a single budget into a dataframe to enable fast replots

        Parameters:
        -----------
        original_rh: RunHistory
            runhistory that should contain only runs that were executed during search
        validated_rh: RunHistory
            runhistory that may contain as many runs as possible, also external runs.
            this runhistory will be used to build the EPM
        validator: Validator
            validator to be used to estimate costs for configurations
        scenario: Scenario
            scenario object to take instances from
        default, incumbent: Configuration
            default and incumbent, they are always kept when the set of configurations is limited
        param_imp: Union[None, Dict[str->float]
            if given, maps importance-method names to a mapping of parameter-names to importance
        output_dir: str
            output directory for plots (currently unused here)
        cs: ConfigurationSpace
            parameter configuration space to be visualized
        runtime: boolean
            runtime will be on logscale (currently unused here)

        Returns:
        --------
        df: pd.DataFrame
            one row per configuration; first column 'cost', followed by one column per hyperparameter,
            ordered by descending importance
        """
        # Importance information is optional - treat a missing mapping like an empty one.
        param_imp = param_imp or {}

        # Sorting parameters by importance, if possible (choose first executed parameter-importance)
        method, importance = "", {}
        if self.pc_sort_by == 'all':
            self.logger.debug("Sorting by average importance")
            method = 'average'
            collected = {}
            for scores in param_imp.values():
                if scores:
                    for name, score in scores.items():
                        collected.setdefault(name, []).append(score)
            importance = {name: sum(values) / len(values) for name, values in collected.items()}
        elif param_imp.get(self.pc_sort_by):
            # Only use the requested method if it actually produced results (entry may be None/empty).
            method, importance = self.pc_sort_by, param_imp[self.pc_sort_by]
        else:
            self.logger.debug("%s not evaluated.. choosing at random from: %s", self.pc_sort_by,
                              str(list(param_imp.keys())))
            for m, i in param_imp.items():
                if i:
                    method, importance = m, i
                    self.logger.debug("Chose %s", method)
                    break

        # Parameters without an importance score get 0; sorted() is stable, so they keep their original order.
        hp_names = sorted(cs.get_hyperparameter_names(), key=lambda x: importance.get(x, 0), reverse=True)
        self.logger.debug("Sorted hyperparameters by method '%s': %s", method, str(hp_names))

        # Define set of configurations (limiting to max and choosing most interesting ones)
        all_configs = original_rh.get_all_configs()
        # max_runs_epm is the maximum total number of runs considered for epm to limit maximum possible number configs
        n_instances = len(scenario.train_insts) + len(scenario.test_insts)
        max_configs = int(self.max_runs_epm / max(n_instances, 1))  # guard against an empty instance set
        if len(all_configs) > max_configs:
            self.logger.debug("Limiting number of configs to train epm from %d to %d (based on max runs %d) and "
                              "choosing the ones with the most runs (for parallel coordinates)",
                              len(all_configs), max_configs, self.max_runs_epm)
            # Descending order, so that the slice below keeps the configurations with the MOST runs.
            all_configs = sorted(all_configs,
                                 key=lambda c: len(original_rh.get_runs_for_config(c, only_max_observed_budget=False)),
                                 reverse=True)
            all_configs = all_configs[:max_configs]
            if default not in all_configs:
                all_configs = [default] + all_configs
            if incumbent not in all_configs:
                all_configs.append(incumbent)

        # Get costs for those configurations
        epm_rh = RunHistory()
        epm_rh.update(validated_rh)
        if scenario.feature_dict:  # if instances are available
            epm_rh.update(timing(validator.validate_epm)(all_configs, 'train+test', 1, runhistory=validated_rh))
        config_to_cost = OrderedDict((c, epm_rh.get_cost(c)) for c in all_configs)

        # Columns follow the importance ordering computed above, so the most important parameters come first.
        # NOTE(review): c[hp] is accessed without a guard for hyperparameters that are not set in a configuration.
        data = OrderedDict()
        data['cost'] = list(config_to_cost.values())
        for hp in hp_names:
            data[hp] = np.array([c[hp] for c in config_to_cost])
        return pd.DataFrame(data=data)

    def _plot_budget(self, df):
        """Create the bokeh layout (hint text + parallel-coordinates plot) for one budget's dataframe.

        Parameters
        ----------
        df: pd.DataFrame
            dataframe as created by _preprocess_budget; it is not modified

        Returns
        -------
        plot: bokeh layout
            column layout holding a short usage hint and the parallel plot

        Raises
        ------
        ValueError
            if a hyperparameter is neither numerical nor categorical
        """
        # Work on a copy: `df` is the frame cached in self.data and must stay untouched, otherwise a replot
        # would map the already-mapped categorical values again.
        df = df.copy()
        hyperparameters = {hp.name: hp for hp in self.runscontainer.scenario.cs.get_hyperparameters()}

        limits = OrderedDict([('cost', {'lower': df['cost'].min(),
                                        'upper': df['cost'].max()})])
        # Build the axis limits in the column order of the dataframe so both stay aligned.
        for name in df.columns:
            if name not in hyperparameters:
                continue  # e.g. the 'cost' column
            hp = hyperparameters[name]
            if isinstance(hp, NumericalHyperparameter):
                limits[hp.name] = {'lower': hp.lower, 'upper': hp.upper}
                if hp.log:
                    limits[hp.name]['log'] = True
            elif isinstance(hp, CategoricalHyperparameter):
                # We pass strings as numbers and overwrite the labels
                choice_to_index = {v: i for i, v in enumerate(hp.choices)}
                df[hp.name] = df[hp.name].replace(choice_to_index)
                limits[hp.name] = {'lower': 0, 'upper': len(hp.choices) - 1, 'choices': hp.choices}
            else:
                raise ValueError("Hyperparameter %s of type %s causes undefined behaviour." % (hp.name, type(hp)))
        # The first column ('cost') determines the line colors.
        p = parallel_plot(df=df, axes=limits, color=df[df.columns[0]], palette=Viridis256)
        div = Div(text="Select up and down column grid lines to define filters. Double click a filter to reset it.")
        plot = column(div, p)
        return plot

    def get_html(self, d=None, tooltip=None):
        """Create the bokeh components and optionally register them in the report dictionary `d`.

        Parameters
        ----------
        d: dict, optional
            if given, the result is stored under this analyzer's name and the class docstring is attached
            as 'tooltip'
        tooltip:
            unused

        Returns
        -------
        result:
            return value of plot_bokeh(return_components=True)
        """
        result = self.plot_bokeh(return_components=True)
        if d is not None:
            result["tooltip"] = self.__doc__
            d[self.get_name()] = result
        return result