Source code for cave.analyzer.configurator.parallel_coordinates

from collections import OrderedDict
from typing import Union, Dict, List

import numpy as np
import pandas as pd
from ConfigSpace.configuration_space import ConfigurationSpace, Configuration
from ConfigSpace.hyperparameters import NumericalHyperparameter, CategoricalHyperparameter
from bokeh.embed import components
from bokeh.layouts import column
from bokeh.models import Div
from bokeh.palettes import Viridis256
from smac.runhistory.runhistory import RunHistory
from smac.scenario.scenario import Scenario
from smac.utils.validate import Validator

from cave.analyzer.base_analyzer import BaseAnalyzer
from cave.plot.parallel_plot.parallel_plot import parallel_plot
from cave.utils.hpbandster_helpers import format_budgets
from cave.utils.timing import timing

__author__ = "Joshua Marben"
__copyright__ = "Copyright 2017, ML4AAD"
__license__ = "3-clause BSD"
__maintainer__ = "Joshua Marben"
__email__ = "joshua.marben@neptun.uni-freiburg.de"


class ParallelCoordinates(BaseAnalyzer):
    """
    Previously used by Golovin et al. to study the frequency of chosen parameter settings in
    black-box-optimization. Each line corresponds to one configuration in the runhistory and shows the
    parameter settings and the corresponding (estimated) average cost. To handle large configuration
    spaces with hundreds of parameters, the (at most) 10 most important parameters based on a fANOVA
    parameter importance analysis are plotted. To emphasize better configurations, the performance is
    encoded in the color of each line, ranging from blue to red. These plots provide insights into
    whether the configurator focused on specific parameter values and how these correlate to their costs.

    NOTE: the given RunHistory should contain only optimization and no validation to analyze the
    explored parameter-space.
    """

    def __init__(self,
                 runscontainer,
                 pc_sort_by: str = None,
                 params: Union[int, List[str]] = None,
                 n_configs: int = None,
                 max_runs_epm: int = None,
                 ):
        """Prepares the data from a SMAC-related format (using runhistories and parameters) into a more
        general format (using a dataframe). The resulting dataframe is passed to the
        parallel_coordinates-routine.

        Parameters
        ----------
        params: Union[int, List[str]]
            either directly the parameters to be displayed or the number of parameters (will try to
            select the most important ones)
        n_configs: int
            number of configs to be plotted
        pc_sort_by: str
            defines the pimp-method by which to choose the plotted parameters
        max_runs_epm: int
            maximum number of runs to train the epm with; this should prevent MemoryErrors
        """
        super().__init__(runscontainer,
                         pc_sort_by=pc_sort_by,
                         params=params,
                         n_configs=n_configs,
                         max_runs_epm=max_runs_epm)

        self.params = self.options.getint('params')
        self.n_configs = self.options.getint('n_configs')
        self.max_runs_epm = self.options.getint('max_runs_epm')
        self.pc_sort_by = self.options['pc_sort_by']

        self.data = None  # save data here so bokeh-plots can be recreated fast
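    # A minimal usage sketch (hypothetical; assumes `runscontainer` is the runs-container object that
    # CAVE builds from the parsed optimizer output and passes to every analyzer):
    #
    #     pc = ParallelCoordinates(runscontainer, pc_sort_by='all', n_configs=100)
    #     layout = pc.plot_bokeh()                             # bokeh layout(s), one per budget
    #     embeddable = pc.plot_bokeh(return_components=True)   # (script, div)-tuples for HTML embedding
    #
    # The keyword arguments mirror the options that are read back via ``self.options`` above.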
    def get_name(self):
        return "Parallel Coordinates"
    def plot_bokeh(self, return_components=False):
        """ If return_components is specified, will return (script, div)-tuples instead of bokeh layouts """
        if not self.data:
            self._preprocess()

        result = self.result if return_components else {}
        for budget, dataframe in self.data.items():
            plot = self._plot_budget(dataframe)
            if return_components:
                result[budget] = {'bokeh': components(plot)}
            else:
                result[budget] = plot

        # If only one budget, we don't need an extra tab...
        if len(result) == 1:
            result = list(result.values())[0]

        return result
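    # Note on ``components(plot)`` above: ``bokeh.embed.components`` returns a ``(script, div)`` pair of
    # HTML snippets, so with ``return_components=True`` each budget maps to ``{'bokeh': (script, div)}``.
    # Minimal illustration (sketch; ``fig`` stands for any finished bokeh figure or layout):
    #
    #     from bokeh.embed import components
    #     script, div = components(fig)   # paste both snippets into an HTML template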
    def _preprocess(self):
        if self.data:
            raise ValueError("Data seems to be already initialized, undefined behaviour.")
        else:
            self.data = OrderedDict()

        formatted_budgets = format_budgets(self.runscontainer.get_budgets())
        for budget, run in zip(self.runscontainer.get_budgets(),
                               self.runscontainer.get_aggregated(keep_budgets=True, keep_folders=False)):
            self.data[formatted_budgets[budget]] = self._preprocess_budget(
                original_rh=run.original_runhistory,
                validated_rh=run.validated_runhistory,
                validator=run.validator,
                scenario=run.scenario,
                default=run.default,
                incumbent=run.incumbent,
                param_imp=run.share_information["parameter_importance"],
                output_dir=run.output_dir,
                cs=run.scenario.cs,
                runtime=(run.scenario.run_obj == 'runtime'))
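    # After ``_preprocess``, ``self.data`` maps one formatted budget label to one DataFrame each,
    # roughly of this shape (labels and values are illustrative only):
    #
    #     OrderedDict([('budget 6.25',  <DataFrame: 'cost' + one column per hyperparameter>),
    #                  ('budget 100.0', <DataFrame: ...>)])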
    def _preprocess_budget(self,
                           original_rh: RunHistory,
                           validated_rh: RunHistory,
                           validator: Validator,
                           scenario: Scenario,
                           default: Configuration,
                           incumbent: Configuration,
                           param_imp: Union[None, Dict[str, float]],
                           output_dir: str,
                           cs: ConfigurationSpace,
                           runtime: bool = False,
                           ):
        """
        Preprocess data and save in self.data to enable fast replots

        Parameters
        ----------
        original_rh: RunHistory
            runhistory that should contain only runs that were executed during search
        validated_rh: RunHistory
            runhistory that may contain as many runs as possible, also external runs.
            this runhistory will be used to build the EPM
        validator: Validator
            validator to be used to estimate costs for configurations
        scenario: Scenario
            scenario object to take instances from
        default, incumbent: Configuration
            default and incumbent, they will surely be displayed
        param_imp: Union[None, Dict[str, float]]
            if given, maps parameter-names to importance
        output_dir: str
            output directory for plots
        cs: ConfigurationSpace
            parameter configuration space to be visualized
        runtime: bool
            runtime will be on logscale
        """
        # Sort parameters by importance, if possible (choose the first executed parameter-importance method)
        method, importance = "", {}
        if self.pc_sort_by == 'all':
            self.logger.debug("Sorting by average importance")
            method = 'average'
            for m, i in param_imp.items():
                if i:
                    for p, imp in i.items():
                        if p in importance:
                            importance[p].append(imp)
                        else:
                            importance[p] = [imp]
            importance = {k: sum(v) / len(v) for k, v in importance.items()}
        elif self.pc_sort_by in param_imp:
            method, importance = self.pc_sort_by, param_imp[self.pc_sort_by]
        else:
            self.logger.debug("%s not evaluated.. choosing the first evaluated method from: %s",
                              self.pc_sort_by, str(list(param_imp.keys())))
            for m, i in param_imp.items():
                if i:
                    method, importance = m, i
                    self.logger.debug("Chose %s", method)
                    break
        hp_names = sorted([p for p in cs.get_hyperparameter_names()],
                          key=lambda x: importance.get(x, 0),
                          reverse=True)
        self.logger.debug("Sorted hyperparameters by method '%s': %s", method, str(hp_names))

        # Define the set of configurations (limiting to max and choosing the most interesting ones)
        all_configs = original_rh.get_all_configs()
        # max_runs_epm is the maximum total number of runs considered for the epm,
        # which limits the maximum possible number of configs
        max_configs = int(self.max_runs_epm / (len(scenario.train_insts) + len(scenario.test_insts)))
        if len(all_configs) > max_configs:
            self.logger.debug("Limiting number of configs to train epm from %d to %d (based on max runs %d) and "
                              "choosing the ones with the most runs (for parallel coordinates)",
                              len(all_configs), max_configs, self.max_runs_epm)
            all_configs = sorted(all_configs,
                                 key=lambda c: len(original_rh.get_runs_for_config(c, only_max_observed_budget=False)),
                                 reverse=True)  # most runs first, so the slice keeps the best-estimated configs
            all_configs = all_configs[:max_configs]
        if default not in all_configs:
            all_configs = [default] + all_configs
        if incumbent not in all_configs:
            all_configs.append(incumbent)

        # Get costs for those configurations
        epm_rh = RunHistory()
        epm_rh.update(validated_rh)
        if scenario.feature_dict:  # if instances are available
            epm_rh.update(timing(validator.validate_epm)(all_configs, 'train+test', 1, runhistory=validated_rh))
        config_to_cost = OrderedDict({c: epm_rh.get_cost(c) for c in all_configs})

        data = OrderedDict()
        data['cost'] = list(config_to_cost.values())
        for hp in self.runscontainer.scenario.cs.get_hyperparameter_names():
            data[hp] = np.array([c[hp]  # if hp in c.get_dictionary() and not isinstance(c[hp], str) else np.nan
                                 for c in config_to_cost.keys()])
        df = pd.DataFrame(data=data)
        return df
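    # Standalone sketch of the DataFrame layout produced above, using a hypothetical toy configuration
    # space and made-up costs (this is not CAVE's actual data flow, just the target format):
    #
    #     from ConfigSpace.configuration_space import ConfigurationSpace
    #     from ConfigSpace.hyperparameters import UniformFloatHyperparameter, CategoricalHyperparameter
    #     import pandas as pd
    #
    #     cs = ConfigurationSpace()
    #     cs.add_hyperparameters([UniformFloatHyperparameter('lr', 1e-4, 1e-1, log=True),
    #                             CategoricalHyperparameter('kernel', ['rbf', 'linear'])])
    #     configs = cs.sample_configuration(5)
    #     costs = [0.3, 0.5, 0.2, 0.9, 0.4]   # e.g. epm-estimated average costs
    #     data = {'cost': costs}
    #     data.update({hp: [c[hp] for c in configs] for hp in cs.get_hyperparameter_names()})
    #     df = pd.DataFrame(data)             # one row per configuration, as _plot_budget expects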
    def _plot_budget(self, df):
        limits = OrderedDict([('cost', {'lower': df['cost'].min(), 'upper': df['cost'].max()})])
        for hp in self.runscontainer.scenario.cs.get_hyperparameters():
            if isinstance(hp, NumericalHyperparameter):
                limits[hp.name] = {'lower': hp.lower, 'upper': hp.upper}
                if hp.log:
                    limits[hp.name]['log'] = True
            elif isinstance(hp, CategoricalHyperparameter):
                # We pass strings as numbers and overwrite the labels
                df[hp.name].replace({v: i for i, v in enumerate(hp.choices)}, inplace=True)
                limits[hp.name] = {'lower': 0, 'upper': len(hp.choices) - 1, 'choices': hp.choices}
            else:
                raise ValueError("Hyperparameter %s of type %s causes undefined behaviour." % (hp.name, type(hp)))

        p = parallel_plot(df=df, axes=limits, color=df[df.columns[0]], palette=Viridis256)
        div = Div(text="Select up and down column grid lines to define filters. Double click a filter to reset it.")
        plot = column(div, p)
        return plot
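    # The categorical branch above re-encodes string choices as their index so that every axis stays
    # numeric; the original labels survive in ``limits[hp.name]['choices']``. Minimal pandas
    # illustration with a toy column (not CAVE data):
    #
    #     import pandas as pd
    #     df = pd.DataFrame({'kernel': ['rbf', 'linear', 'rbf']})
    #     df['kernel'].replace({'rbf': 0, 'linear': 1}, inplace=True)   # -> 0, 1, 0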
    def get_html(self, d=None, tooltip=None):
        result = self.plot_bokeh(return_components=True)
        if d is not None:
            result["tooltip"] = self.__doc__
            d["Parallel Coordinates"] = result
        return result