import logging
import os
import shutil
import tempfile
import typing
from collections import OrderedDict
from functools import wraps
from importlib import reload
from typing import Union, List
import numpy as np
from cave.__version__ import __version__ as cave_version
from cave.analyzer.apt.apt_overview import APTOverview
from cave.analyzer.apt.apt_tensorboard import APTTensorboard
from cave.analyzer.budgets.bohb_learning_curves import BohbLearningCurves
from cave.analyzer.budgets.budget_correlation import BudgetCorrelation
from cave.analyzer.budgets.incumbents_over_budgets import IncumbentsOverBudgets
from cave.analyzer.configurator.configurator_footprint import ConfiguratorFootprint
from cave.analyzer.configurator.parallel_coordinates import ParallelCoordinates
from cave.analyzer.feature_analysis.box_violin import BoxViolin
from cave.analyzer.feature_analysis.feature_clustering import FeatureClustering
from cave.analyzer.feature_analysis.feature_correlation import FeatureCorrelation
from cave.analyzer.feature_analysis.feature_importance import FeatureImportance
from cave.analyzer.parameter_importance.ablation import Ablation
from cave.analyzer.parameter_importance.fanova import Fanova
from cave.analyzer.parameter_importance.forward_selection import ForwardSelection
from cave.analyzer.parameter_importance.local_parameter_importance import LocalParameterImportance
from cave.analyzer.parameter_importance.pimp_comparison_table import PimpComparisonTable
from cave.analyzer.performance.algorithm_footprint import AlgorithmFootprint
from cave.analyzer.performance.compare_default_incumbent import CompareDefaultIncumbent
from cave.analyzer.performance.cost_over_time import CostOverTime
from cave.analyzer.performance.overview_table import OverviewTable
from cave.analyzer.performance.performance_table import PerformanceTable
from cave.analyzer.performance.plot_ecdf import PlotECDF
from cave.analyzer.performance.plot_scatter import PlotScatter
from cave.html.html_builder import HTMLBuilder
from cave.reader.runs_container import RunsContainer
from cave.utils.exceptions import Deactivated, NotApplicable
from cave.utils.helpers import load_default_options
from cave.utils.timing import timing
__author__ = "Joshua Marben"
__copyright__ = "Copyright 2017, ML4AAD"
__license__ = "3-clause BSD"
__maintainer__ = "Joshua Marben"
__email__ = "joshua.marben@neptun.uni-freiburg.de"
[docs]def _analyzer_type(f):
@wraps(f)
def wrap(self, *args, d=None, **kw):
self.logger.debug("Args: %s, Kwargs: %s", str(args), str(kw))
try:
analyzer = f(self, *args, **kw)
except Deactivated as err:
self.logger.info(err)
return
except NotApplicable as err:
self.logger.debug("Skipping analyzer ({})".format(err))
return
except Exception as err:
self.logger.exception(err)
raise
else:
# execute hooks, if possible and/or desired
if self.show_jupyter:
try:
analyzer.get_jupyter()
except ImportError as err:
self.logger.debug(err)
self.logger.info("Assuming that jupyter is not installed. Disable for rest of report.")
self.show_jupyter = False
if isinstance(d, dict):
analyzer.get_html(d)
self._build_website()
return analyzer
return wrap
class CAVE(object):
    """
    Facade to analyze configurator runs, create plots and compile them into an
    HTML-report (see ``analyze``). The heavy lifting is delegated to the
    imported analyzer-classes; this class mainly wires them together.
    """

    def __init__(self,
                 folders: typing.List[str],
                 output_dir: str,
                 ta_exec_dir: typing.List[str],
                 file_format: str='auto',
                 validation_format='NONE',
                 validation_method: str='epm',
                 seed: int=42,
                 show_jupyter: bool=True,
                 verbose_level: str='OFF',
                 analyzing_options=None,
                 **kwargs
                 ):
        """
        Initialize CAVE facade to handle analyzing, plotting and building the report-page easily.
        During initialization, the analysis-infrastructure is built and the data is validated, the overall best
        incumbent is found and default+incumbent are evaluated for all instances for all runs, by default using an EPM.

        Arguments
        ---------
        folders: list<strings>
            paths to relevant SMAC runs
        output_dir: string
            output for cave to write results (figures + report)
        ta_exec_dir: list<strings>
            execution directories for target algorithm (to find instance.txt specified in scenario, ..)
        file_format: string
            what format the rundata is in, options are [SMAC3, SMAC2, BOHB and CSV] ('auto' by default)
        validation_format: string
            what format the validation rundata is in, options are [SMAC3, SMAC2, CSV and None]
        validation_method: string
            from [validation, epm], how to estimate missing runs
        seed: int
            random seed for analysis (e.g. the random forests)
        show_jupyter: bool
            default True, tries to output plots and tables to jupyter-frontend, if available
        verbose_level: string
            from [OFF, INFO, DEBUG, DEV_DEBUG and WARNING]
        analyzing_options: string or dict
            options-dictionary following CAVE's options-syntax
        kwargs:
            not used in this constructor -- NOTE(review): presumably accepted for
            signature-compatibility; confirm before removing
        """
        self.show_jupyter = show_jupyter
        if self.show_jupyter:
            # Reset logging module (needs to happen before logger initialization)
            logging.shutdown()
            reload(logging)
        self.logger = logging.getLogger(self.__module__ + '.' + self.__class__.__name__)
        self.output_dir = output_dir
        self.output_dir_created = False  # this flag avoids multiple output-dir creations

        # Create output_dir and set verbosity (set_verbosity itself also creates the
        # output-dir, since it writes the debug-log below it)
        self.set_verbosity(verbose_level.upper())
        self._create_outputdir(self.output_dir)
        self.logger.debug("Running CAVE version %s", cave_version)

        self.verbose_level = verbose_level
        self.rng = np.random.RandomState(seed)
        self.folders = folders
        self.ta_exec_dir = ta_exec_dir
        self.file_format = file_format
        self.validation_format = validation_format
        self.validation_method = validation_method

        # Configuration of analyzers (works as a default for report generation)
        analyzing_options = load_default_options(analyzing_options, file_format)

        self.runscontainer = RunsContainer(folders=self.folders,
                                           ta_exec_dirs=self.ta_exec_dir,
                                           output_dir=self.output_dir,
                                           file_format=self.file_format,  # TODO remove?
                                           validation_format=self.validation_format,  # TODO remove?
                                           analyzing_options=analyzing_options,
                                           )

        # create builder for html-website, decide for suitable logo
        # (format-specific logos take precedence over a user-supplied custom logo)
        custom_logo = './custom_logo.png'
        if self.runscontainer.file_format == 'BOHB':
            logo_fn = 'BOHB_logo.png'
        elif self.runscontainer.file_format.startswith('SMAC'):
            logo_fn = 'SMAC_logo.png'
        elif os.path.exists(custom_logo):
            logo_fn = custom_logo
        else:
            logo_fn = 'automl-logo.png'
            self.logger.info("No suitable logo found. You can use a custom logo simply by having a file called '%s' "
                             "in the directory from which you run CAVE.", custom_logo)
        use_custom_logo = custom_logo == logo_fn
        self.builder = HTMLBuilder(self.output_dir, "CAVE", logo_fn=logo_fn, logo_custom=use_custom_logo)
        self.website = OrderedDict([])
@timing
def analyze(self,
            options=None):
    """
    Analyze the available data and build HTML-webpage as dict.
    Save webpage in 'self.output_dir/CAVE/report.html'.

    Parameters
    ----------
    options: Dict or str
        either a dictionary or a path to an ini-file.
    """
    # Save jupyter-flag (needs to be False while analyzing) and reset it later.
    flag_show_jupyter = self.show_jupyter
    self.show_jupyter = False

    # Process analyzing-options
    if isinstance(options, str):
        self.logger.debug("Loading \"{}\".".format(options))
        self.runscontainer.analyzing_options.read(options)
    elif isinstance(options, dict):
        for k, v in options.items():
            occurrences = sum([1 for x in self.runscontainer.analyzing_options.sections() if k in x])
            # Bugfix: the key was never substituted into these warnings' '{}'-placeholders
            if occurrences > 1:
                self.logger.warning("{} is an arbitrary option - to avoid collisions, "
                                    "consider a different name.".format(k))
            if occurrences == 0 and k not in self.runscontainer.analyzing_options.sections():
                self.logger.warning("{} is not found in default options. Are you sure you know "
                                    "what you are doing?".format(k))
            for s in self.runscontainer.analyzing_options.sections():
                if s == k:
                    # 'k' names a whole section -> toggle whether that analyzer runs at all
                    self.runscontainer.analyzing_options[k]['run'] = str(v)
                elif k in self.runscontainer.analyzing_options[s]:
                    # 'k' names an option within section 's'
                    self.runscontainer.analyzing_options[s][k] = str(v)

    # Invoke the analyzers one by one
    self.overview_table(d=self.website)
    self.compare_default_incumbent(d=self._get_dict(self.website, "Meta Data"))

    ###################################################
    # Performance Analysis                            #
    ###################################################
    title = "Performance Analysis"
    self.performance_table(d=self._get_dict(self.website, title))
    self.plot_ecdf(d=self._get_dict(self.website, title))
    self.plot_scatter(d=self._get_dict(self.website, title))
    self.algorithm_footprints(d=self._get_dict(self.website, title))

    ###################################################
    # Budget Analysis                                 #
    ###################################################
    title = "Budget Analysis"
    if self.runscontainer.get_budgets() is not None:
        self.bohb_incumbents_per_budget(d=self._get_dict(self.website, title))
        self.budget_correlation(d=self._get_dict(self.website, title))
        # Learning curves are only available for BOHB-results
        if self.runscontainer.file_format == "BOHB":
            self.bohb_learning_curves(d=self._get_dict(self.website, title))

    ###################################################
    # Parameter- and Feature-Analysis                 #
    ###################################################
    self.parameter_importance(self._get_dict(self.website, "Parameter Importance"))
    self.feature_analysis(self._get_dict(self.website, "Feature Analysis"))

    ###################################################
    # Configurator's Behaviour                        #
    ###################################################
    title = "Configurators Behavior"
    self.configurator_footprint(d=self._get_dict(self.website, title))
    self.cost_over_time(d=self._get_dict(self.website, title))
    # Parallel Coordinates should be after parameter importance, if performed.
    self.parallel_coordinates(d=self._get_dict(self.website, title))

    ###################################################
    # Auto-PyTorch specific                           #
    ###################################################
    title = "Auto-PyTorch specific"
    self.apt_overview(d=self._get_dict(self.website, title))
    self.apt_tensorboard(d=self._get_dict(self.website, title))

    self._build_website()
    self.logger.info("CAVE finished. Report is located in %s", os.path.join(self.output_dir, 'report.html'))

    # Set jupyter-flag as it was before.
    self.show_jupyter = flag_show_jupyter
[docs] def _get_dict(self, d, layername):
""" Get the appropriate sub-dict for this layer (or layer-run combination) and create it if necessary """
if not isinstance(d, dict):
raise ValueError("Pass a valid dict to _get_dict!")
if layername not in d:
d[layername] = OrderedDict()
return d[layername]
@_analyzer_type
def overview_table(self):
    """Create the ``OverviewTable`` analyzer for this run-container."""
    table = OverviewTable(self.runscontainer)
    return table
@_analyzer_type
def compare_default_incumbent(self):
    """Create the ``CompareDefaultIncumbent`` analyzer for this run-container."""
    analyzer = CompareDefaultIncumbent(self.runscontainer)
    return analyzer
@_analyzer_type
def plot_scatter(self):
    """Create the ``PlotScatter`` analyzer for this run-container."""
    scatter = PlotScatter(self.runscontainer)
    return scatter
@_analyzer_type
def plot_ecdf(self):
    """Create the ``PlotECDF`` analyzer for this run-container."""
    ecdf = PlotECDF(self.runscontainer)
    return ecdf
@_analyzer_type
def cost_over_time(self,
                   incumbent_trajectory=None):
    """Create the ``CostOverTime`` analyzer.

    Parameters
    ----------
    incumbent_trajectory:
        forwarded unchanged to ``CostOverTime``
    """
    analyzer = CostOverTime(self.runscontainer, incumbent_trajectory=incumbent_trajectory)
    return analyzer
@_analyzer_type
def parallel_coordinates(self,
                         pc_sort_by: str=None,
                         params: Union[int, List[str]]=None,
                         n_configs: int=None,
                         max_runs_epm: int=None,
                         ):
    """Create the ``ParallelCoordinates`` analyzer; all arguments are forwarded unchanged."""
    forwarded = dict(pc_sort_by=pc_sort_by,
                     params=params,
                     n_configs=n_configs,
                     max_runs_epm=max_runs_epm)
    return ParallelCoordinates(self.runscontainer, **forwarded)
@_analyzer_type
def cave_fanova(self):
    """Create the ``Fanova`` parameter-importance analyzer.

    Raises
    ------
    IndexError
        re-raised with a hint to use ``--pimp_no_fanova_pairs`` (known issue #192 with
        ints and bools in categorical hyperparameters).
    """
    try:
        fanova = Fanova(self.runscontainer)
    except IndexError as err:
        self.logger.debug("Error in fANOVA (%s)", err, exc_info=1)
        # Improvement: chain the original exception explicitly for easier debugging
        raise IndexError("Error in fANOVA - please run with --pimp_no_fanova_pairs (this might be due to a known "
                         "issue with ints and bools in categorical hyperparameters, see issue #192).") from err
    return fanova
@_analyzer_type
def cave_ablation(self):
    """Create the ``Ablation`` parameter-importance analyzer."""
    ablation = Ablation(self.runscontainer)
    return ablation
@_analyzer_type
def pimp_forward_selection(self):
    """Create the ``ForwardSelection`` parameter-importance analyzer."""
    selection = ForwardSelection(self.runscontainer)
    return selection
@_analyzer_type
def local_parameter_importance(self):
    """Create the ``LocalParameterImportance`` (LPI) analyzer."""
    lpi = LocalParameterImportance(self.runscontainer)
    return lpi
@_analyzer_type
def pimp_comparison_table(self,
                          pimp_sort_table_by="average"):
    """Create the ``PimpComparisonTable`` analyzer, sorted by ``pimp_sort_table_by``."""
    return PimpComparisonTable(self.runscontainer, sort_table_by=pimp_sort_table_by)
def parameter_importance(self,
                         d):
    """Perform the specified parameter importance procedures.

    Runs fANOVA, ablation, forward selection and LPI; if at least two of them are
    enabled in the analyzing-options, a comparison-table is prepended to ``d``.

    Parameters
    ----------
    d: OrderedDict
        website-subdict to write the results into
    """
    self.cave_fanova(d=d)
    self.cave_ablation(d=d)
    self.pimp_forward_selection(d=d)
    self.local_parameter_importance(d=d)
    # Comparing only makes sense with at least two importance-methods enabled
    num_enabled = sum([1 for x in ['fANOVA', 'Ablation', 'Forward Selection', 'Local Parameter Importance (LPI)']
                       if self.runscontainer.analyzing_options[x]['run'] != 'False'])
    if num_enabled >= 2:
        pct = self.pimp_comparison_table(d=d)
        # Bugfix: the analyzer may still be skipped (its decorator then returns None),
        # which used to crash here on `pct.name`
        if pct is not None:
            d.move_to_end(pct.name, last=False)
@_analyzer_type
def feature_importance(self):
    """Create the ``FeatureImportance`` analyzer for this run-container."""
    return FeatureImportance(self.runscontainer)
@_analyzer_type
def box_violin(self):
    """Create the ``BoxViolin`` feature-analysis plot."""
    plot = BoxViolin(self.runscontainer)
    return plot
@_analyzer_type
def feature_correlation(self):
    """Create the ``FeatureCorrelation`` analyzer for this run-container."""
    correlation = FeatureCorrelation(self.runscontainer)
    return correlation
@_analyzer_type
def feature_clustering(self):
    """Create the ``FeatureClustering`` analyzer for this run-container."""
    clustering = FeatureClustering(self.runscontainer)
    return clustering
def feature_analysis(self, d):
    """Run all feature-analysis steps into the website-subdict ``d``."""
    steps = (self.feature_importance,  # feature importance using forward selection
             self.box_violin,
             self.feature_correlation,
             self.feature_clustering)
    for step in steps:
        step(d=d)
@_analyzer_type
def bohb_learning_curves(self):
    """Create the ``BohbLearningCurves`` analyzer for this run-container."""
    curves = BohbLearningCurves(self.runscontainer)
    return curves
@_analyzer_type
def bohb_incumbents_per_budget(self):
    """Create the ``IncumbentsOverBudgets`` analyzer for this run-container."""
    incumbents = IncumbentsOverBudgets(self.runscontainer)
    return incumbents
@_analyzer_type
def budget_correlation(self):
    """Create the ``BudgetCorrelation`` analyzer for this run-container."""
    correlation = BudgetCorrelation(self.runscontainer)
    return correlation
@_analyzer_type
def apt_overview(self):
    """Create the ``APTOverview`` (Auto-PyTorch) analyzer for this run-container."""
    overview = APTOverview(self.runscontainer)
    return overview
@_analyzer_type
def apt_tensorboard(self):
    """Create the ``APTTensorboard`` (Auto-PyTorch) analyzer for this run-container."""
    tensorboard = APTTensorboard(self.runscontainer)
    return tensorboard
###########################################################################
# HELPERS HELPERS HELPERS HELPERS HELPERS HELPERS HELPERS HELPERS HELPERS #
###########################################################################
def print_budgets(self):
    """If the analyzed configurator uses budgets, print a list of available budgets."""
    budgets = self.runscontainer.get_budgets()
    print(budgets)
[docs] def _build_website(self):
self.builder.generate_webpage(self.website)
def set_verbosity(self, level):
    """
    Set verbosity level and configure loggers (to stdout and to file), create output-dir if necessary.

    Parameters
    ----------
    level: str
        From [INFO, WARNING, OFF, DEBUG, DEV_DEBUG]

    Raises
    ------
    ValueError
        if ``level`` is not a supported verbosity level.
    """
    # Root logger collects everything; the individual handlers filter what is shown/written.
    logging.getLogger().setLevel(logging.DEBUG)
    # Log to stream (console)
    formatter = logging.Formatter('%(levelname)s:%(name)s:%(message)s')
    stdout_handler = logging.StreamHandler()
    stdout_handler.setFormatter(formatter)
    if level == "INFO":
        stdout_handler.setLevel(logging.INFO)
    elif level == "WARNING":
        stdout_handler.setLevel(logging.WARNING)
    elif level == "OFF":
        # "OFF" still surfaces errors on the console
        stdout_handler.setLevel(logging.ERROR)
    elif level in ["DEBUG", "DEV_DEBUG"]:
        stdout_handler.setLevel(logging.DEBUG)
        if level == "DEV_DEBUG":
            # Disable annoying boilerplate-debug-logs from foreign modules
            disable_loggers = ["smac.scenario",
                               # pimp logging
                               "pimp.epm.unlogged_epar_x_rfwi.UnloggedEPARXrfi",
                               "Forward-Selection",
                               "LPI",
                               # Other (mostly bokeh)
                               "PIL.PngImagePlugin",
                               "matplotlib",
                               "urllib3.connectionpool",
                               "selenium.webdriver.remote.remote_connection"]
            for logger in disable_loggers:
                logging.getLogger('cave.settings').debug("Setting logger \'%s\' on level INFO", logger)
                logging.getLogger(logger).setLevel(logging.INFO)
    else:
        # Bugfix: message was missing a space after the first sentence and used a period
        # instead of a comma in the option-list
        raise ValueError("{} not recognized as a verbosity level. "
                         "Choose from DEBUG, DEV_DEBUG, INFO, WARNING, OFF.".format(level))
    logging.getLogger().addHandler(stdout_handler)
    # Log to file is always debug
    debug_path = os.path.join(self.output_dir, "debug", "debug.log")
    logging.getLogger('cave.settings').debug("Output-file for debug-log: '%s'", debug_path)
    self._create_outputdir(self.output_dir)
    # Bugfix: exist_ok avoids a FileExistsError when the debug-directory already exists
    # (e.g. when set_verbosity is called more than once)
    os.makedirs(os.path.split(debug_path)[0], exist_ok=True)
    fh = logging.FileHandler(debug_path, "a")
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logging.getLogger().addHandler(fh)
    self.logger.debug("Existing loggers: %s", str(logging.root.manager.loggerDict))
[docs] def _create_outputdir(self, output_dir):
""" Creates output-dir, if necessary. Also sets the 'self.output_dir_created'-flag, so this only happens once.
If there is a directory already, zip this into an archive in the output_dir called '.OLD.zip'. """
if self.output_dir_created:
if not os.path.exists(output_dir):
raise RuntimeError("'%s' should exist, but doesn't. Any raceconditions? "
"Please report to github.com/automl/CAVE/issues with debug/debug.log")
self.logger.debug("Output-dir '%s' was already created, call ignored", output_dir)
return
self.logger.info("Saving results to '%s'", output_dir)
if not os.path.exists(output_dir):
self.logger.debug("Output-dir '%s' does not exist, creating", output_dir)
os.makedirs(output_dir)
else:
archive_path = shutil.make_archive(os.path.join(tempfile.mkdtemp(), '.OLD'), 'zip', output_dir)
shutil.rmtree(output_dir)
os.makedirs(output_dir)
shutil.move(archive_path, output_dir)
self.logger.debug("Output-dir '%s' exists, moving old content to '%s'", self.output_dir,
os.path.join(self.output_dir, '.OLD.zip'))
self.output_dir_created = True