from collections import OrderedDict
import numpy as np
from bokeh.embed import components
from bokeh.io import output_notebook
from bokeh.layouts import column, row, widgetbox
from bokeh.models import HoverTool, Range1d, LinearColorMapper, CustomJS
from bokeh.models.filters import GroupFilter
from bokeh.models.sources import CDSView
from bokeh.models.widgets import Select
from bokeh.palettes import Spectral11
from bokeh.plotting import show, figure, ColumnDataSource
from cave.analyzer.base_analyzer import BaseAnalyzer
from cave.utils.bokeh_routines import get_checkbox
class BohbLearningCurves(BaseAnalyzer):
"""
Visualizing the learning curves of all individual HyperBand-iterations. Model-based picks are marked with a
cross. The config-id tuple denotes (iteration, stage, id_within_stage), where the iteration is the hyperband
iteration and the stage is the index of the budget in which the configuration was first sampled (should be 0).
The third index is just a sequential enumeration. This id can be interpreted as a nested index-identifier.
"""
def __init__(self,
runscontainer,
):
super().__init__(runscontainer)
try:
from hpbandster.core.result import logged_results_to_HBS_result
from hpbandster.core.result import extract_HBS_learning_curves
except ImportError as err:
self.logger.exception(err)
raise ImportError("You need to install hpbandster (e.g. 'pip install hpbandster') to analyze bohb-results.")
self.hp_names = runscontainer.scenario.cs.get_hyperparameter_names()
self.result_objects = [cr.share_information['hpbandster_result'] for cr in self.runscontainer.get_all_runs()]
self.result_object = self.result_objects[0]
# TODO extend to support parallel runs (?)
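        # Maps config-id -> list of learning curves, each curve being a list of (time, loss) tuples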
self.lcs = self.result_object.get_learning_curves(lc_extractor=extract_HBS_learning_curves)
    def get_name(self):
return "BOHB Learning Curves"
    def plot(self, reset_times=False):
return self._plot(self.result_object, self.lcs, self.hp_names, reset_times=reset_times)
    def _plot(self, result_object, learning_curves, hyperparameter_names, reset_times=False):
# Extract information from learning-curve-dict
        times, losses, config_ids = [], [], []
        for conf_id, curves in learning_curves.items():
            # self.logger.debug("Config ID: %s, learning curves: %s", str(conf_id), str(curves))
            for lc in curves:
                if len(lc) == 0:
                    continue
                # Check for None before np.isfinite, since np.isfinite(None) raises a TypeError
                tmp = list(zip(*[(time, loss) for time, loss in lc if loss is not None and np.isfinite(loss)]))
                if len(tmp) == 0:
                    self.logger.debug("Probably filtered out NaNs or Nones..., skipping %s, data %s", str(conf_id), str(lc))
                    continue
                times.append(tmp[0])
                losses.append(tmp[1])
                config_ids.append(conf_id)
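        # Optionally shift each curve so that it starts at time 0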
if reset_times:
times = [np.array(ts) - ts[0] for ts in times]
# Prepare ColumnDataSource
data = OrderedDict([
('config_id', []),
('config_info', []),
('times', []),
('losses', []),
('duration', []),
('HB_iteration', []),
('colors', []),
('colors_performance', []),
('colors_iteration', []),
])
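        # One additional column per hyperparameter, so the values appear in the hover tooltips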
for hp in hyperparameter_names:
data[hp] = []
# Populate
id2conf = result_object.get_id2config_mapping()
for counter, c_id in enumerate(config_ids):
            if len(times[counter]) != len(losses[counter]):
                raise ValueError("Lengths of times and losses differ for config %s" % str(c_id))
longest_run = self.get_longest_run(c_id, result_object)
if not longest_run:
continue
data['config_id'].append(str(c_id))
try:
conf_info = '\n'.join([str(k) + "=" + str(v) for k, v in sorted(id2conf[c_id]['config_info'].items())])
except KeyError:
conf_info = 'Not Available'
data['config_info'].append(conf_info)
data['times'].append(times[counter])
data['losses'].append(losses[counter])
            # longest_run is guaranteed to be truthy here (configs without one were skipped above)
            data['duration'].append(longest_run['time_stamps']['finished'] - longest_run['time_stamps']['started'])
data['HB_iteration'].append(str(c_id[0]))
for hp in hyperparameter_names:
try:
data[hp].append(id2conf[c_id]['config'][hp])
except KeyError:
data[hp].append("None")
data['colors'].append(losses[counter][-1])
data['colors_performance'].append(losses[counter][-1])
data['colors_iteration'].append(c_id[0])
# Tooltips
tooltips = [(key, '@' + key) for key in data.keys() if key not in ['times', 'duration', 'colors',
'colors_performance', 'colors_iteration']]
tooltips.insert(4, ('duration (sec)', '@duration'))
tooltips.insert(5, ('Configuration', ' '))
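        # The blank 'Configuration' entry acts as a separator before the hyperparameter values in the hover box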
hover = HoverTool(tooltips=tooltips)
# Create sources
source_multiline = ColumnDataSource(data=data)
# Special source for scattering points, since times and losses for multi_line are nested lists
scatter_data = {key: [] for key in data.keys()}
for idx, c_id in enumerate(data['config_id']):
for t, l in zip(data['times'][idx], data['losses'][idx]):
scatter_data['times'].append(t)
scatter_data['losses'].append(l)
for key in list(data.keys()):
if key in ['times', 'losses']:
continue
scatter_data[key].append(data[key][idx])
source_scatter = ColumnDataSource(data=scatter_data)
# Color
min_perf, max_perf = min([loss[-1] for loss in data['losses']]), max([loss[-1] for loss in data['losses']])
min_iter, max_iter = min([int(i) for i in data['HB_iteration']]), max([int(i) for i in data['HB_iteration']])
color_mapper = LinearColorMapper(palette=Spectral11, low=min_perf, high=max_perf)
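        # The mapper starts out encoding final performance; the CustomJS callback below switches its range for iteration-based coloring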
# Create plot
y_axis_type = "log" if len([a for a in scatter_data['losses'] if a <= 0]) == 0 else 'linear'
x_min, x_max = min(scatter_data['times']), max(scatter_data['times'])
x_pad = (x_max - x_min) / 10
x_min -= x_pad
x_max += x_pad
y_min, y_max = min(scatter_data['losses']), max(scatter_data['losses'])
y_pad = (y_max - y_min) / 10
        y_min -= (y_min / 10) if y_axis_type == 'log' else y_pad  # the lower bound must stay above 0 on a log scale
y_max += y_pad * 10 if y_axis_type == 'log' else y_pad
p = figure(plot_height=500, plot_width=600,
y_axis_type=y_axis_type,
tools=[hover, 'save', 'pan', 'wheel_zoom', 'box_zoom', 'reset'],
x_axis_label='Time', y_axis_label='Cost',
x_range=Range1d(x_min, x_max, bounds='auto'),
y_range=Range1d(y_min, y_max, bounds='auto'),
)
# Plot per HB_iteration, each config individually
HB_iterations = sorted(set(data['HB_iteration']))
max_label_len = max([len(iteration) for iteration in HB_iterations])
HB_handles, HB_labels = [], []
self.logger.debug("Assuming config_info to be either \"model_based_pick=True\" or \"model_based_pick=False\"")
for it in HB_iterations:
line_handles = []
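            # Restrict the shared multi_line source to this HB iteration via a CDSView + GroupFilter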
view = CDSView(source=source_multiline, filters=[GroupFilter(column_name='HB_iteration', group=str(it))])
line_handles.append(p.multi_line(xs='times', ys='losses',
source=source_multiline,
view=view,
color={'field': 'colors', 'transform': color_mapper},
alpha=0.5,
line_width=5,
))
            # Separate model-based picks from random picks
view = CDSView(source=source_scatter, filters=[GroupFilter(column_name='HB_iteration', group=str(it)),
GroupFilter(column_name='config_info',
group="model_based_pick=True")])
line_handles.append(p.circle_x(x='times', y='losses',
source=source_scatter,
view=view,
fill_color={'field': 'colors', 'transform': color_mapper},
fill_alpha=0.5,
line_color='colors',
size=20,
))
view = CDSView(source=source_scatter, filters=[GroupFilter(column_name='HB_iteration', group=str(it)),
GroupFilter(column_name='config_info',
group="model_based_pick=False")])
line_handles.append(p.circle(x='times', y='losses',
source=source_scatter,
view=view,
fill_color={'field': 'colors', 'transform': color_mapper},
fill_alpha=0.5,
line_color='colors',
size=20,
))
HB_handles.append(line_handles)
HB_labels.append('warmstart data' if it in [-1, '-1'] else '{number:0{width}d}'.format(width=max_label_len,
number=int(it)))
# Sort all lists according to label
HB_iterations, HB_handles, HB_labels = zip(*sorted(zip(HB_iterations, HB_handles, HB_labels),
key=lambda tup: tup[2]))
HB_iterations, HB_handles, HB_labels = list(HB_iterations), list(HB_handles), list(HB_labels)
self.logger.debug("HB_iterations to labels: %s", str(list(zip(HB_iterations, HB_labels))))
        checkboxes, select_all, select_none = get_checkbox(HB_handles, HB_labels, max_checkbox_length=10)
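        # JS callback for the color-select widget: swap the 'colors' column and the mapper range between
        # performance- and iteration-based coloring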
callback_color = CustomJS(args=dict(source_multiline=source_multiline,
source_scatter=source_scatter,
cm=color_mapper), code="""
var data_multiline = source_multiline.data;
var data_scatter = source_scatter.data;
var min_perf = {0};
var max_perf = {1};
var min_iter = {2};
var max_iter = {3};
if (cb_obj.value == 'performance') {{
data_multiline['colors'] = data_multiline['colors_performance'];
data_scatter['colors'] = data_scatter['colors_performance'];
cm.low = min_perf;
cm.high = max_perf;
}} else {{
data_multiline['colors'] = data_multiline['colors_iteration'];
data_scatter['colors'] = data_scatter['colors_iteration'];
cm.low = min_iter;
cm.high = max_iter;
}}
            source_multiline.change.emit();
            source_scatter.change.emit();
""".format(min_perf, max_perf, min_iter, max_iter))
select_color = Select(title="Select colors",
value="performance",
options=["performance", "iteration"],
callback=callback_color)
        # Put it all together in a layout (the width of the checkbox field scales with the number of elements)
        chkbox_width = 650 if len(HB_labels) > 100 else 500 if len(HB_labels) > 70 else 400
        layout = row(p, column(*[widgetbox(chkbox, max_width=chkbox_width, width_policy="min") for chkbox in checkboxes],
                               row(widgetbox(select_all, width=50),
                                   widgetbox(select_none, width=50)),
                               widgetbox(select_color, width=200)))
return layout
    @staticmethod
def get_longest_run(c_id, result_object):
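        """Return the latest run of config `c_id` that reports a loss, or False if no such run exists."""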
        all_runs = result_object.get_runs_by_id(c_id)
        if not all_runs:
            return False
        longest_run = all_runs[-1]
while longest_run.loss is None:
all_runs.pop()
if len(all_runs) == 0:
return False
longest_run = all_runs[-1]
return longest_run
    def get_jupyter(self):
output_notebook()
show(self.plot())
    def get_html(self, d=None, tooltip=None):
script, div = components(self.plot())
if d is not None:
d["BOHB Learning Curves"] = {"bokeh": (script, div), "tooltip": self.__doc__}
return script, div