# Source code for cave.analyzer.budgets.bohb_learning_curves

from collections import OrderedDict

import numpy as np
from bokeh.embed import components
from bokeh.io import output_notebook
from bokeh.layouts import column, row, widgetbox
from bokeh.models import HoverTool, Range1d, LinearColorMapper, CustomJS
from bokeh.models.filters import GroupFilter
from bokeh.models.sources import CDSView
from bokeh.models.widgets import Select
from bokeh.palettes import Spectral11
from bokeh.plotting import show, figure, ColumnDataSource

from cave.analyzer.base_analyzer import BaseAnalyzer
from cave.utils.bokeh_routines import get_checkbox


class BohbLearningCurves(BaseAnalyzer):
    """
    Visualizing the learning curves of all individual HyperBand-iterations. Model-based picks are marked with a
    cross. The config-id tuple denotes (iteration, stage, id_within_stage), where the iteration is the hyperband
    iteration and the stage is the index of the budget in which the configuration was first sampled (should be 0).
    The third index is just a sequential enumeration. This id can be interpreted as a nested index-identifier.
    """
    # NOTE: the class docstring above is handed out verbatim as the report tooltip by get_html() (self.__doc__),
    # so it is user-facing text and must only be changed deliberately.

    def __init__(self,
                 runscontainer,
                 ):
        """
        Collect the hpbandster result and its learning curves from the runs in *runscontainer*.

        Parameters
        ----------
        runscontainer:
            container that provides ``scenario.cs`` (for the hyperparameter names) and ``get_all_runs()``; every
            run is expected to carry an ``'hpbandster_result'`` entry in its ``share_information``

        Raises
        ------
        ImportError
            if hpbandster is not installed
        """
        super().__init__(runscontainer)
        try:
            from hpbandster.core.result import extract_HBS_learning_curves
        except ImportError as err:
            self.logger.exception(err)
            # Chain the original error so the real import failure stays visible in the traceback
            raise ImportError(
                "You need to install hpbandster (e.g. 'pip install hpbandster') to analyze bohb-results."
            ) from err

        self.hp_names = runscontainer.scenario.cs.get_hyperparameter_names()
        self.result_objects = [cr.share_information['hpbandster_result'] for cr in self.runscontainer.get_all_runs()]
        # Only the first result is visualized
        self.result_object = self.result_objects[0]
        # TODO extend to support parallel runs (?)
        self.lcs = self.result_object.get_learning_curves(lc_extractor=extract_HBS_learning_curves)

    def get_name(self):
        """Return the display name of this analyzer."""
        return "BOHB Learning Curves"

    def plot(self, reset_times=False):
        """
        Build the bokeh layout for the result and learning curves collected in ``__init__``.

        Parameters
        ----------
        reset_times: bool
            if True, every learning curve is shifted so that it starts at time 0
        """
        return self._plot(self.result_object, self.lcs, self.hp_names, reset_times=reset_times)

    def _plot(self, result_object, learning_curves, hyperparameter_names, reset_times=False):
        """
        Create an interactive bokeh plot (cost over time) with one line per learning curve.

        Parameters
        ----------
        result_object:
            object providing ``get_id2config_mapping()`` and ``get_runs_by_id(config_id)``
        learning_curves: dict
            maps a config-id to a list of learning curves, each curve being a sequence of (time, loss) pairs
        hyperparameter_names: list
            names of the hyperparameters that are added as columns (and thereby to the hover-tooltips)
        reset_times: bool
            if True, every learning curve is shifted so that it starts at time 0

        Returns
        -------
        layout:
            bokeh layout holding the figure, the per-iteration checkboxes and the color-selector

        Raises
        ------
        ValueError
            if no learning curve remains after filtering, or if times and losses of a curve differ in length
        """
        # Extract information from learning-curve-dict
        times, losses, config_ids = [], [], []
        for conf_id, curves in learning_curves.items():
            for lc in curves:
                if len(lc) == 0:
                    continue
                # Test for None *before* calling np.isfinite, which raises a TypeError on None
                tmp = list(zip(*[(time, loss) for time, loss in lc if loss is not None and np.isfinite(loss)]))
                if len(tmp) == 0:
                    self.logger.debug("Probably filtered NaNs or None's.., skipping %s, data %s", str(conf_id), str(lc))
                    continue
                times.append(tmp[0])
                losses.append(tmp[1])
                config_ids.append(conf_id)

        if reset_times:
            times = [np.array(ts) - ts[0] for ts in times]

        # Prepare ColumnDataSource
        data = OrderedDict([
                            ('config_id', []),
                            ('config_info', []),
                            ('times', []),
                            ('losses', []),
                            ('duration', []),
                            ('HB_iteration', []),
                            ('colors', []),
                            ('colors_performance', []),
                            ('colors_iteration', []),
                           ])
        for hp in hyperparameter_names:
            data[hp] = []

        # Populate
        id2conf = result_object.get_id2config_mapping()
        for counter, c_id in enumerate(config_ids):
            if len(times[counter]) != len(losses[counter]):
                raise ValueError("Number of time stamps (%d) and losses (%d) differ for config %s"
                                 % (len(times[counter]), len(losses[counter]), str(c_id)))
            longest_run = self.get_longest_run(c_id, result_object)
            if not longest_run:
                # No run with a valid loss for this configuration, nothing to report
                continue
            data['config_id'].append(str(c_id))
            try:
                conf_info = '\n'.join([str(k) + "=" + str(v) for k, v in sorted(id2conf[c_id]['config_info'].items())])
            except KeyError:
                conf_info = 'Not Available'
            data['config_info'].append(conf_info)
            data['times'].append(times[counter])
            data['losses'].append(losses[counter])
            data['duration'].append(longest_run['time_stamps']['finished'] - longest_run['time_stamps']['started'])
            data['HB_iteration'].append(str(c_id[0]))
            for hp in hyperparameter_names:
                try:
                    data[hp].append(id2conf[c_id]['config'][hp])
                except KeyError:
                    data[hp].append("None")
            # 'colors' is the column that is actually rendered, the callback below swaps its content
            data['colors'].append(losses[counter][-1])
            data['colors_performance'].append(losses[counter][-1])
            data['colors_iteration'].append(c_id[0])

        if not data['config_id']:
            # Fail with an explicit message instead of an opaque "min() arg is an empty sequence" further down
            raise ValueError("No learning curves available to plot (all curves were empty, contained no finite "
                             "loss or belong to configurations without a run that reports a loss).")

        # Tooltips
        tooltips = [(key, '@' + key) for key in data.keys() if key not in ['times', 'duration', 'colors',
                                                                           'colors_performance', 'colors_iteration']]
        tooltips.insert(4, ('duration (sec)', '@duration'))
        tooltips.insert(5, ('Configuration', ' '))
        hover = HoverTool(tooltips=tooltips)

        # Create sources
        source_multiline = ColumnDataSource(data=data)
        # Special source for scattering points, since times and losses for multi_line are nested lists
        scatter_data = {key: [] for key in data.keys()}
        flat_keys = [key for key in data.keys() if key not in ['times', 'losses']]
        for idx in range(len(data['config_id'])):
            for t, l in zip(data['times'][idx], data['losses'][idx]):
                scatter_data['times'].append(t)
                scatter_data['losses'].append(l)
                # All other columns are repeated for every single point of the curve
                for key in flat_keys:
                    scatter_data[key].append(data[key][idx])
        source_scatter = ColumnDataSource(data=scatter_data)

        # Color
        final_losses = [loss[-1] for loss in data['losses']]
        min_perf, max_perf = min(final_losses), max(final_losses)
        iteration_numbers = [int(i) for i in data['HB_iteration']]
        min_iter, max_iter = min(iteration_numbers), max(iteration_numbers)
        color_mapper = LinearColorMapper(palette=Spectral11, low=min_perf, high=max_perf)

        # Create plot (a log-axis is only possible if every loss is strictly positive)
        y_axis_type = "log" if all(a > 0 for a in scatter_data['losses']) else 'linear'

        x_min, x_max = min(scatter_data['times']), max(scatter_data['times'])
        x_pad = (x_max - x_min) / 10
        x_min -= x_pad
        x_max += x_pad
        y_min, y_max = min(scatter_data['losses']), max(scatter_data['losses'])
        y_pad = (y_max - y_min) / 10
        y_min -= (y_min / 10) if y_axis_type == 'log' else y_pad  # because this must not be below 0 if it's a logscale
        y_max += y_pad * 10 if y_axis_type == 'log' else y_pad
        p = figure(plot_height=500, plot_width=600,
                   y_axis_type=y_axis_type,
                   tools=[hover, 'save', 'pan', 'wheel_zoom', 'box_zoom', 'reset'],
                   x_axis_label='Time', y_axis_label='Cost',
                   x_range=Range1d(x_min, x_max, bounds='auto'),
                   y_range=Range1d(y_min, y_max, bounds='auto'),
                   )

        # Plot per HB_iteration, each config individually
        HB_iterations = sorted(set(data['HB_iteration']))
        max_label_len = max([len(iteration) for iteration in HB_iterations])
        HB_handles, HB_labels = [], []
        self.logger.debug("Assuming config_info to be either \"model_based_pick=True\" or \"model_based_pick=False\"")
        for it in HB_iterations:
            line_handles = []
            view = CDSView(source=source_multiline, filters=[GroupFilter(column_name='HB_iteration', group=str(it))])
            line_handles.append(p.multi_line(xs='times', ys='losses',
                                             source=source_multiline,
                                             view=view,
                                             color={'field': 'colors', 'transform': color_mapper},
                                             alpha=0.5,
                                             line_width=5,
                                             ))
            # Separate model-based and random picks
            # NOTE(review): line_color='colors' refers to the numeric 'colors' column without the color_mapper
            # transform (unlike fill_color) - confirm whether that is intended.
            view = CDSView(source=source_scatter, filters=[GroupFilter(column_name='HB_iteration', group=str(it)),
                                                           GroupFilter(column_name='config_info',
                                                                       group="model_based_pick=True")])
            line_handles.append(p.circle_x(x='times', y='losses',
                                           source=source_scatter,
                                           view=view,
                                           fill_color={'field': 'colors', 'transform': color_mapper},
                                           fill_alpha=0.5,
                                           line_color='colors',
                                           size=20,
                                           ))
            view = CDSView(source=source_scatter, filters=[GroupFilter(column_name='HB_iteration', group=str(it)),
                                                           GroupFilter(column_name='config_info',
                                                                       group="model_based_pick=False")])
            line_handles.append(p.circle(x='times', y='losses',
                                         source=source_scatter,
                                         view=view,
                                         fill_color={'field': 'colors', 'transform': color_mapper},
                                         fill_alpha=0.5,
                                         line_color='colors',
                                         size=20,
                                         ))
            HB_handles.append(line_handles)
            # Zero-pad the labels so that sorting them as strings equals sorting them numerically
            HB_labels.append('warmstart data' if it in [-1, '-1'] else '{number:0{width}d}'.format(width=max_label_len,
                                                                                                   number=int(it)))

        # Sort all lists according to label
        HB_iterations, HB_handles, HB_labels = zip(*sorted(zip(HB_iterations, HB_handles, HB_labels),
                                                           key=lambda tup: tup[2]))
        HB_iterations, HB_handles, HB_labels = list(HB_iterations), list(HB_handles), list(HB_labels)
        self.logger.debug("HB_iterations to labels: %s", str(list(zip(HB_iterations, HB_labels))))

        chckboxes, select_all, select_none = get_checkbox(HB_handles, HB_labels, max_checkbox_length=10)

        # Swap the rendered 'colors' column and rescale the color-mapper, then notify BOTH sources. Only the
        # names passed via args exist inside the JS snippet, so the change has to be emitted on those.
        callback_color = CustomJS(args=dict(source_multiline=source_multiline,
                                            source_scatter=source_scatter,
                                            cm=color_mapper), code="""
            var data_multiline = source_multiline.data;
            var data_scatter = source_scatter.data;
            var min_perf = {0};
            var max_perf = {1};
            var min_iter = {2};
            var max_iter = {3};
            if (cb_obj.value == 'performance') {{
                data_multiline['colors'] = data_multiline['colors_performance'];
                data_scatter['colors'] = data_scatter['colors_performance'];
                cm.low = min_perf;
                cm.high = max_perf;
            }} else {{
                data_multiline['colors'] = data_multiline['colors_iteration'];
                data_scatter['colors'] = data_scatter['colors_iteration'];
                cm.low = min_iter;
                cm.high = max_iter;
            }}
            source_multiline.change.emit();
            source_scatter.change.emit();
            """.format(min_perf, max_perf, min_iter, max_iter))

        select_color = Select(title="Select colors",
                              value="performance",
                              options=["performance", "iteration"],
                              callback=callback_color)

        # Put it all together in a layout (width of checkbox-field scales with number of elements)
        chkbox_width = 650 if len(HB_labels) > 100 else 500 if len(HB_labels) > 70 else 400
        layout = row(p, column(*[widgetbox(chkbox, max_width=chkbox_width, width_policy="min") for chkbox in chckboxes],
                               row(widgetbox(select_all, width=50),
                                   widgetbox(select_none, width=50)),
                               widgetbox(select_color, width=200)))
        return layout

    @staticmethod
    def get_longest_run(c_id, result_object):
        """
        Return the last run of configuration *c_id* that reports a loss.

        The runs are taken in the order returned by ``result_object.get_runs_by_id(c_id)`` and searched from the
        back; the returned list itself is not modified.

        Returns
        -------
        run or False
            the last run whose ``loss`` is not None, False if there is no such run (including no runs at all)
        """
        for run in reversed(result_object.get_runs_by_id(c_id)):
            if run.loss is not None:
                return run
        return False

    def get_jupyter(self):
        """Show the plot inline in a jupyter notebook."""
        output_notebook()
        show(self.plot())

    def get_html(self, d=None, tooltip=None):
        """
        Render the plot into embeddable bokeh components.

        Parameters
        ----------
        d: dict, optional
            if given, the components and the tooltip (the class docstring) are stored under "BOHB Learning Curves"
        tooltip:
            not used by this method

        Returns
        -------
        script, div: str, str
            the bokeh components of the plot
        """
        script, div = components(self.plot())
        if d is not None:
            d["BOHB Learning Curves"] = {"bokeh": (script, div), "tooltip": self.__doc__}
        return script, div