Source code for cave.cavefacade

import logging
import os
import shutil
import tempfile
import typing
from collections import OrderedDict
from functools import wraps
from importlib import reload
from typing import Union, List

import numpy as np

from cave.__version__ import __version__ as cave_version
from cave.analyzer.apt.apt_overview import APTOverview
from cave.analyzer.apt.apt_tensorboard import APTTensorboard
from cave.analyzer.budgets.bohb_learning_curves import BohbLearningCurves
from cave.analyzer.budgets.budget_correlation import BudgetCorrelation
from cave.analyzer.budgets.incumbents_over_budgets import IncumbentsOverBudgets
from cave.analyzer.configurator.configurator_footprint import ConfiguratorFootprint
from cave.analyzer.configurator.parallel_coordinates import ParallelCoordinates
from cave.analyzer.feature_analysis.box_violin import BoxViolin
from cave.analyzer.feature_analysis.feature_clustering import FeatureClustering
from cave.analyzer.feature_analysis.feature_correlation import FeatureCorrelation
from cave.analyzer.feature_analysis.feature_importance import FeatureImportance
from cave.analyzer.parameter_importance.ablation import Ablation
from cave.analyzer.parameter_importance.fanova import Fanova
from cave.analyzer.parameter_importance.forward_selection import ForwardSelection
from cave.analyzer.parameter_importance.local_parameter_importance import LocalParameterImportance
from cave.analyzer.parameter_importance.pimp_comparison_table import PimpComparisonTable
from cave.analyzer.performance.algorithm_footprint import AlgorithmFootprint
from cave.analyzer.performance.compare_default_incumbent import CompareDefaultIncumbent
from cave.analyzer.performance.cost_over_time import CostOverTime
from cave.analyzer.performance.overview_table import OverviewTable
from cave.analyzer.performance.performance_table import PerformanceTable
from cave.analyzer.performance.plot_ecdf import PlotECDF
from cave.analyzer.performance.plot_scatter import PlotScatter
from cave.html.html_builder import HTMLBuilder
from cave.reader.runs_container import RunsContainer
from cave.utils.exceptions import Deactivated, NotApplicable
from cave.utils.helpers import load_default_options
from cave.utils.timing import timing

# Module-level metadata: authorship, copyright, license and maintainer contact.
__author__ = "Joshua Marben"
__copyright__ = "Copyright 2017, ML4AAD"
__license__ = "3-clause BSD"
__maintainer__ = "Joshua Marben"
__email__ = "joshua.marben@neptun.uni-freiburg.de"


def _analyzer_type(f):
    """Decorate a method that builds and returns an analyzer object.

    The decorated method gains an extra keyword-only argument ``d``. The
    wrapper

    * logs the positional and keyword arguments on debug level,
    * calls the wrapped method *without* ``d``,
    * returns ``None`` if the wrapped method raises ``Deactivated`` (logged on
      info level) or ``NotApplicable`` (logged on debug level),
    * logs and re-raises any other exception,
    * otherwise calls ``analyzer.get_jupyter()`` if ``self.show_jupyter`` is
      truthy (an ``ImportError`` there switches ``self.show_jupyter`` off),
      calls ``analyzer.get_html(d)`` if ``d`` is a dict, rebuilds the website
      via ``self._build_website()`` and returns the analyzer.
    """

    def _emit_outputs(host, analyzer, target):
        # Run the output hooks of a successfully created analyzer.
        if host.show_jupyter:
            try:
                analyzer.get_jupyter()
            except ImportError as exc:
                host.logger.debug(exc)
                host.logger.info("Assuming that jupyter is not installed. Disable for rest of report.")
                host.show_jupyter = False
        if isinstance(target, dict):
            analyzer.get_html(target)

    @wraps(f)
    def decorated(self, *positional, d=None, **keywords):
        self.logger.debug("Args: %s, Kwargs: %s", str(positional), str(keywords))
        try:
            result = f(self, *positional, **keywords)
        except Deactivated as exc:
            # Analyzer was switched off on purpose.
            self.logger.info(exc)
            return None
        except NotApplicable as exc:
            # Analyzer cannot be used here.
            self.logger.debug("Skipping analyzer ({})".format(exc))
            return None
        except Exception as exc:
            self.logger.exception(exc)
            raise

        # execute hooks, if possible and/or desired
        _emit_outputs(self, result, d)
        self._build_website()
        return result

    return decorated
class CAVE(object):
    """Facade that wires together data loading, the analyzers and the HTML report.

    Every method decorated with ``_analyzer_type`` builds one analyzer on top of
    ``self.runscontainer`` and additionally accepts the keyword argument ``d``
    (a dict that receives the analyzer's HTML output).
    """

    def __init__(self,
                 folders: typing.List[str],
                 output_dir: str,
                 ta_exec_dir: typing.List[str],
                 file_format: str='auto',
                 validation_format='NONE',
                 validation_method: str='epm',
                 seed: int=42,
                 show_jupyter: bool=True,
                 verbose_level: str='OFF',
                 analyzing_options=None,
                 **kwargs
                 ):
        """
        Initialize CAVE facade to handle analyzing, plotting and building the report-page easily.
        During initialization, the analysis-infrastructure is built and the data is validated, the overall best
        incumbent is found and default+incumbent are evaluated for all instances for all runs, by default using an EPM.

        Arguments
        ---------
        folders: list<strings>
            paths to relevant SMAC runs
        output_dir: string
            output for cave to write results (figures + report)
        ta_exec_dir: list<strings>
            execution directories for target algorithm (to find instance.txt specified in scenario, ..),
            forwarded to the RunsContainer as ``ta_exec_dirs``
        file_format: string
            what format the rundata is in, options are [SMAC3, SMAC2, BOHB and CSV]
        validation_format: string
            what format the validation rundata is in, options are [SMAC3, SMAC2, CSV and None]
        validation_method: string
            from [validation, epm], how to estimate missing runs
        seed: int
            random seed for analysis (e.g. the random forests)
        show_jupyter: bool
            default True, tries to output plots and tables to jupyter-frontend, if available
        verbose_level: string
            from [OFF, INFO, DEBUG, DEV_DEBUG and WARNING] (upper-cased before use)
        analyzing_options: string or dict
            options-dictionary following CAVE's options-syntax
        **kwargs:
            accepted for backward compatibility and ignored by this constructor
        """
        self.show_jupyter = show_jupyter
        if self.show_jupyter:
            # Reset logging module (needs to happen before logger initialization)
            logging.shutdown()
            reload(logging)

        self.logger = logging.getLogger(self.__module__ + '.' + self.__class__.__name__)
        self.output_dir = output_dir
        self.output_dir_created = False  # this flag avoids multiple output-dir creations

        # Create output_dir and set verbosity
        self.set_verbosity(verbose_level.upper())
        self._create_outputdir(self.output_dir)

        self.logger.debug("Running CAVE version %s", cave_version)

        self.verbose_level = verbose_level
        self.rng = np.random.RandomState(seed)
        self.folders = folders
        self.ta_exec_dir = ta_exec_dir
        self.file_format = file_format
        self.validation_format = validation_format
        self.validation_method = validation_method

        # Configuration of analyzers (works as a default for report generation)
        analyzing_options = load_default_options(analyzing_options, file_format)

        self.runscontainer = RunsContainer(folders=self.folders,
                                           ta_exec_dirs=self.ta_exec_dir,
                                           output_dir=self.output_dir,
                                           file_format=self.file_format,  # TODO remove?
                                           validation_format=self.validation_format,  # TODO remove?
                                           analyzing_options=analyzing_options,
                                           )

        # create builder for html-website, decide for suitable logo
        # (a custom logo is only considered if the format is neither BOHB nor SMAC*)
        custom_logo = './custom_logo.png'
        if self.runscontainer.file_format == 'BOHB':
            logo_fn = 'BOHB_logo.png'
        elif self.runscontainer.file_format.startswith('SMAC'):
            logo_fn = 'SMAC_logo.png'
        elif os.path.exists(custom_logo):
            logo_fn = custom_logo
        else:
            logo_fn = 'automl-logo.png'
            self.logger.info("No suitable logo found. You can use a custom logo simply by having a file called '%s' "
                             "in the directory from which you run CAVE.", custom_logo)
        use_custom_logo = custom_logo == logo_fn
        self.builder = HTMLBuilder(self.output_dir, "CAVE", logo_fn=logo_fn, logo_custom=use_custom_logo)
        self.website = OrderedDict([])

    @timing
    def analyze(self, options=None):
        """
        Analyze the available data and build HTML-webpage as dict.
        The report is generated by the HTML builder; the final log message points to
        '<self.output_dir>/report.html'.

        Parameters
        ----------
        options: Dict or str
            either a dictionary or a path to an ini-file.
        """
        # Save jupyter-flag (needs to be False while analyzing) and reset it later.
        flag_show_jupyter = self.show_jupyter
        self.show_jupyter = False
        try:
            self._apply_analyzing_options(options)

            # Invoke the analyzers one by one
            self.overview_table(d=self.website)
            self.compare_default_incumbent(d=self._get_dict(self.website, "Meta Data"))

            ###################################################
            #  Performance Analysis                           #
            ###################################################
            title = "Performance Analysis"
            self.performance_table(d=self._get_dict(self.website, title))
            self.plot_ecdf(d=self._get_dict(self.website, title))
            self.plot_scatter(d=self._get_dict(self.website, title))
            self.algorithm_footprints(d=self._get_dict(self.website, title))

            ###################################################
            #  Budget Analysis                                #
            ###################################################
            title = "Budget Analysis"
            if self.runscontainer.get_budgets() is not None:
                self.bohb_incumbents_per_budget(d=self._get_dict(self.website, title))
                self.budget_correlation(d=self._get_dict(self.website, title))
                if self.runscontainer.file_format == "BOHB":
                    self.bohb_learning_curves(d=self._get_dict(self.website, title))

            ###################################################
            #  Parameter- and Feature-Analysis                #
            ###################################################
            self.parameter_importance(self._get_dict(self.website, "Parameter Importance"))
            self.feature_analysis(self._get_dict(self.website, "Feature Analysis"))

            ###################################################
            #  Configurator's Behaviour                       #
            ###################################################
            title = "Configurators Behavior"
            self.configurator_footprint(d=self._get_dict(self.website, title))
            self.cost_over_time(d=self._get_dict(self.website, title))
            # Parallel Coordinates should be after parameter importance, if performed.
            self.parallel_coordinates(d=self._get_dict(self.website, title))

            ###################################################
            #  Auto-PyTorch specific                          #
            ###################################################
            title = "Auto-PyTorch specific"
            self.apt_overview(d=self._get_dict(self.website, title))
            self.apt_tensorboard(d=self._get_dict(self.website, title))

            self._build_website()

            self.logger.info("CAVE finished. Report is located in %s",
                             os.path.join(self.output_dir, 'report.html'))
        finally:
            # Set jupyter-flag as it was before, even if an analyzer raised.
            self.show_jupyter = flag_show_jupyter

    def _apply_analyzing_options(self, options):
        """Merge user-supplied options into ``self.runscontainer.analyzing_options``.

        Parameters
        ----------
        options: Dict, str or None
            * str: treated as a path and handed to ``analyzing_options.read``.
            * dict: a key equal to a section name sets that section's ``'run'`` entry; a key that is an
              option of a section sets that option. Values are stored as ``str(value)``.
            * anything else (e.g. None): no change.
        """
        analyzing_options = self.runscontainer.analyzing_options
        if isinstance(options, str):
            self.logger.debug("Loading \"{}\".".format(options))
            analyzing_options.read(options)
            return
        if not isinstance(options, dict):
            return

        sections = analyzing_options.sections()
        for key, value in options.items():
            # A key contained in several section names might be matched ambiguously.
            occurrences = sum(1 for section in sections if key in section)
            if occurrences > 1:
                self.logger.warning("%s is an arbitrary option - to avoid collisions, consider a different name.",
                                    key)
            applied = False
            for section in sections:
                if section == key:
                    analyzing_options[key]['run'] = str(value)
                    applied = True
                elif key in analyzing_options[section]:
                    analyzing_options[section][key] = str(value)
                    applied = True
            if not applied:
                # Neither a section nor an option of any section carries this name.
                self.logger.warning("%s is not found in default options. Are you sure you know what you are doing?",
                                    key)

    def _get_dict(self, d, layername):
        """Get the appropriate sub-dict for this layer (or layer-run combination) and create it if necessary.

        Raises
        ------
        ValueError
            if ``d`` is not a dict
        """
        if not isinstance(d, dict):
            raise ValueError("Pass a valid dict to _get_dict!")
        if layername not in d:
            d[layername] = OrderedDict()
        return d[layername]

    @_analyzer_type
    def overview_table(self):
        """Build the OverviewTable analyzer."""
        return OverviewTable(self.runscontainer)

    @_analyzer_type
    def compare_default_incumbent(self):
        """Build the CompareDefaultIncumbent analyzer."""
        return CompareDefaultIncumbent(self.runscontainer)

    @_analyzer_type
    def performance_table(self):
        """Build the PerformanceTable analyzer."""
        return PerformanceTable(self.runscontainer)

    @_analyzer_type
    def plot_scatter(self):
        """Build the PlotScatter analyzer."""
        return PlotScatter(self.runscontainer)

    @_analyzer_type
    def plot_ecdf(self):
        """Build the PlotECDF analyzer."""
        return PlotECDF(self.runscontainer)

    @_analyzer_type
    def algorithm_footprints(self):
        """Build the AlgorithmFootprint analyzer."""
        return AlgorithmFootprint(self.runscontainer)

    @_analyzer_type
    def cost_over_time(self, incumbent_trajectory=None):
        """Build the CostOverTime analyzer; ``incumbent_trajectory`` is forwarded unchanged."""
        return CostOverTime(self.runscontainer,
                            incumbent_trajectory=incumbent_trajectory)

    @_analyzer_type
    def parallel_coordinates(self,
                             pc_sort_by: str=None,
                             params: Union[int, List[str]]=None,
                             n_configs: int=None,
                             max_runs_epm: int=None,
                             ):
        """Build the ParallelCoordinates analyzer; all arguments are forwarded unchanged."""
        return ParallelCoordinates(self.runscontainer,
                                   pc_sort_by=pc_sort_by,
                                   params=params,
                                   n_configs=n_configs,
                                   max_runs_epm=max_runs_epm,
                                   )

    @_analyzer_type
    def configurator_footprint(self,
                               time_slider=None,
                               max_configurations_to_plot=None,
                               number_quantiles=None):
        """Build the ConfiguratorFootprint analyzer; all arguments are forwarded unchanged."""
        return ConfiguratorFootprint(self.runscontainer,
                                     time_slider=time_slider,
                                     max_configurations_to_plot=max_configurations_to_plot,
                                     number_quantiles=number_quantiles)

    @_analyzer_type
    def cave_fanova(self):
        """Build the Fanova analyzer.

        Raises
        ------
        IndexError
            re-raised with a hint on a known issue if constructing Fanova raises an IndexError
        """
        try:
            fanova = Fanova(self.runscontainer)
        except IndexError as err:
            self.logger.debug("Error in fANOVA (%s)", err, exc_info=1)
            # Chain explicitly so the original traceback is reported as the cause.
            raise IndexError("Error in fANOVA - please run with --pimp_no_fanova_pairs (this might be due to a known "
                             "issue with ints and bools in categorical hyperparameters, see issue #192).") from err
        return fanova

    @_analyzer_type
    def cave_ablation(self):
        """Build the Ablation analyzer."""
        return Ablation(self.runscontainer)

    @_analyzer_type
    def pimp_forward_selection(self):
        """Build the ForwardSelection analyzer."""
        return ForwardSelection(self.runscontainer)

    @_analyzer_type
    def local_parameter_importance(self):
        """Build the LocalParameterImportance analyzer."""
        return LocalParameterImportance(self.runscontainer)

    @_analyzer_type
    def pimp_comparison_table(self,
                              pimp_sort_table_by="average"):
        """Build the PimpComparisonTable analyzer, sorted by ``pimp_sort_table_by``."""
        return PimpComparisonTable(self.runscontainer,
                                   sort_table_by=pimp_sort_table_by,
                                   )

    def parameter_importance(self, d):
        """Perform the specified parameter importance procedures.

        If at least two of the four procedures are not deactivated (their ``'run'`` option is not the
        string ``'False'``), a comparison table is built as well and moved to the front of ``d``.

        Parameters
        ----------
        d: OrderedDict
            receives the HTML output of the analyzers
        """
        self.cave_fanova(d=d)
        self.cave_ablation(d=d)
        self.pimp_forward_selection(d=d)
        self.local_parameter_importance(d=d)

        methods = ['fANOVA', 'Ablation', 'Forward Selection', 'Local Parameter Importance (LPI)']
        active = sum(1 for name in methods
                     if self.runscontainer.analyzing_options[name]['run'] != 'False')
        if active >= 2:
            pct = self.pimp_comparison_table(d=d)
            # The decorated call returns None if the table is deactivated or not applicable,
            # and the entry only exists in d if the analyzer registered itself there.
            if pct is not None and pct.name in d:
                d.move_to_end(pct.name, last=False)

    @_analyzer_type
    def feature_importance(self):
        """Build the FeatureImportance analyzer."""
        res = FeatureImportance(self.runscontainer)
        return res

    @_analyzer_type
    def box_violin(self):
        """Build the BoxViolin analyzer."""
        return BoxViolin(self.runscontainer)

    @_analyzer_type
    def feature_correlation(self):
        """Build the FeatureCorrelation analyzer."""
        return FeatureCorrelation(self.runscontainer)

    @_analyzer_type
    def feature_clustering(self):
        """Build the FeatureClustering analyzer."""
        return FeatureClustering(self.runscontainer)

    def feature_analysis(self, d):
        """Run all feature analyzers and collect their HTML output in ``d``."""
        # feature importance using forward selection
        self.feature_importance(d=d)
        self.box_violin(d=d)
        self.feature_correlation(d=d)
        self.feature_clustering(d=d)

    @_analyzer_type
    def bohb_learning_curves(self):
        """Build the BohbLearningCurves analyzer."""
        return BohbLearningCurves(self.runscontainer)

    @_analyzer_type
    def bohb_incumbents_per_budget(self):
        """Build the IncumbentsOverBudgets analyzer."""
        return IncumbentsOverBudgets(self.runscontainer)

    @_analyzer_type
    def budget_correlation(self):
        """Build the BudgetCorrelation analyzer."""
        return BudgetCorrelation(self.runscontainer)

    @_analyzer_type
    def apt_overview(self):
        """Build the APTOverview analyzer."""
        return APTOverview(self.runscontainer)

    @_analyzer_type
    def apt_tensorboard(self):
        """Build the APTTensorboard analyzer."""
        return APTTensorboard(self.runscontainer)

    ###########################################################################
    # HELPERS HELPERS HELPERS HELPERS HELPERS HELPERS HELPERS HELPERS HELPERS #
    ###########################################################################

    def print_budgets(self):
        """If the analyzed configurator uses budgets, print a list of available budgets."""
        print(self.runscontainer.get_budgets())

    def _build_website(self):
        """Let the HTML builder generate the webpage from ``self.website``."""
        self.builder.generate_webpage(self.website)

    def set_verbosity(self, level):
        """
        Set verbosity level and configure loggers (to stdout and to file), create output-dir if necessary.

        Parameters
        ----------
        level: str
            From [INFO, WARNING, OFF, DEBUG, DEV_DEBUG]

        Raises
        ------
        ValueError
            if ``level`` is none of the names above
        """
        # NOTE(review): every call adds another stream- and file-handler to the root logger,
        # so calling this repeatedly duplicates log output.
        logging.getLogger().setLevel(logging.DEBUG)

        # Log to stream (console)
        formatter = logging.Formatter('%(levelname)s:%(name)s:%(message)s')
        stdout_handler = logging.StreamHandler()
        stdout_handler.setFormatter(formatter)
        if level == "INFO":
            stdout_handler.setLevel(logging.INFO)
        elif level == "WARNING":
            stdout_handler.setLevel(logging.WARNING)
        elif level == "OFF":
            stdout_handler.setLevel(logging.ERROR)
        elif level in ["DEBUG", "DEV_DEBUG"]:
            stdout_handler.setLevel(logging.DEBUG)
            # NOTE(review): foreign loggers are silenced for DEV_DEBUG but not for DEBUG;
            # this looks inverted - confirm the intended behaviour before changing it.
            if level == "DEV_DEBUG":
                # Disable annoying boilerplate-debug-logs from foreign modules
                disable_loggers = ["smac.scenario",
                                   # pimp logging
                                   "pimp.epm.unlogged_epar_x_rfwi.UnloggedEPARXrfi",
                                   "Forward-Selection",
                                   "LPI",
                                   # Other (mostly bokeh)
                                   "PIL.PngImagePlugin",
                                   "matplotlib",
                                   "urllib3.connectionpool",
                                   "selenium.webdriver.remote.remote_connection"]
                for logger in disable_loggers:
                    logging.getLogger('cave.settings').debug("Setting logger \'%s\' on level INFO", logger)
                    logging.getLogger(logger).setLevel(logging.INFO)
        else:
            raise ValueError("{} not recognized as a verbosity level. "
                             "Choose from DEBUG, DEV_DEBUG, INFO, WARNING, OFF.".format(level))

        logging.getLogger().addHandler(stdout_handler)

        # Log to file is always debug
        debug_path = os.path.join(self.output_dir, "debug", "debug.log")
        logging.getLogger('cave.settings').debug("Output-file for debug-log: '%s'", debug_path)
        self._create_outputdir(self.output_dir)
        # exist_ok: the debug-directory already exists if the verbosity is set more than once
        os.makedirs(os.path.split(debug_path)[0], exist_ok=True)
        fh = logging.FileHandler(debug_path, "a")
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)
        logging.getLogger().addHandler(fh)

        self.logger.debug("Existing loggers: %s", str(logging.root.manager.loggerDict))

    def _create_outputdir(self, output_dir):
        """
        Creates output-dir, if necessary. Also sets the 'self.output_dir_created'-flag, so this only happens once.

        If there is a directory already, zip this into an archive in the output_dir called '.OLD.zip'.

        Raises
        ------
        RuntimeError
            if the directory was created by an earlier call but does not exist anymore
        """
        if self.output_dir_created:
            if not os.path.exists(output_dir):
                raise RuntimeError("'%s' should exist, but doesn't. Any raceconditions? "
                                   "Please report to github.com/automl/CAVE/issues with debug/debug.log"
                                   % output_dir)
            self.logger.debug("Output-dir '%s' was already created, call ignored", output_dir)
            return

        self.logger.info("Saving results to '%s'", output_dir)
        if not os.path.exists(output_dir):
            self.logger.debug("Output-dir '%s' does not exist, creating", output_dir)
            os.makedirs(output_dir)
        else:
            # Build the archive outside of output_dir (which is wiped below); the temporary
            # directory is removed again once the archive has been moved.
            with tempfile.TemporaryDirectory() as tmp_dir:
                archive_path = shutil.make_archive(os.path.join(tmp_dir, '.OLD'), 'zip', output_dir)
                shutil.rmtree(output_dir)
                os.makedirs(output_dir)
                shutil.move(archive_path, output_dir)
            self.logger.debug("Output-dir '%s' exists, moving old content to '%s'", output_dir,
                              os.path.join(output_dir, '.OLD.zip'))

        self.output_dir_created = True