Source code for cave.reader.configurator_run

import copy
import logging
import os
import tempfile
from collections import OrderedDict
from contextlib import contextmanager

import numpy as np
from pimp.importance.importance import Importance
from smac.runhistory.runhistory import RunHistory, DataOrigin
from smac.utils.io.input_reader import InputReader
from smac.utils.validate import Validator
from smac import __version__ as smac_version

from cave.reader.smac2_reader import SMAC2Reader
from cave.reader.smac3_reader import SMAC3Reader
from cave.utils.helpers import scenario_sanity_check
from cave.utils.timing import timing


[docs]class ConfiguratorRun(object): """ ConfiguratorRuns load and maintain information about individual configurator runs. There are different supported formats, like: BOHB, SMAC3, SMAC2 and CSV This class is responsible for providing a scenario, a runhistory and a trajectory and handling original/validated data appropriately. To create a ConfiguratorRun from a folder, use Configurator.from_folder() """ def __init__(self, scenario, original_runhistory, validated_runhistory, trajectory, options, path_to_folder=None, ta_exec_dir=None, file_format=None, validation_format=None, reduced_to_budgets=None, output_dir=None, ): """ Parameters ---------- scenario: Scenario scenario original_runhistory, validated_runhistory: RunHistory runhistores containing only the original evaluated data (during optimization process) or the validated data where points of interest are reevaluated after the optimization process trajectory: List[dict] a trajectory of the best performing configurations at each point in time options: dict options can define a number of custom settings path_to_folder: str path to the physical folder containing the data ta_exec_dir: str path to the target-algorithm-execution-directory. This is only important for SMAC-optimized data file_format, validation_format: str will be autodetected some point soon, until then, specify the file-format (SMAC2, SMAC3, BOHB, etc...) reduced_to_budgets: List str int or float budgets, with which this cr is associated output_dir: str where to save analysis-data for this cr """ self.logger = logging.getLogger("cave.ConfiguratorRun.{}".format(path_to_folder)) self.rng = np.random.RandomState(42) self.options = options self.path_to_folder = path_to_folder self.reduced_to_budgets = [None] if reduced_to_budgets is None else reduced_to_budgets self.scenario = scenario self.original_runhistory = original_runhistory self.validated_runhistory = validated_runhistory self.trajectory = trajectory self.ta_exec_dir = ta_exec_dir self.file_format = file_format self.validation_format = validation_format if not output_dir: self.logger.debug("New outputdir") output_dir = tempfile.mkdtemp() self.output_dir = os.path.join(output_dir, 'analysis_data', self.get_identifier()) os.makedirs(self.output_dir, exist_ok=True) self.default = self.scenario.cs.get_default_configuration() self.incumbent = self.trajectory[-1]['incumbent'] if self.trajectory else None self.feature_names = self._get_feature_names() # Create combined runhistory to collect all "real" runs self.combined_runhistory = RunHistory() self.combined_runhistory.update(self.original_runhistory, origin=DataOrigin.INTERNAL) if self.validated_runhistory is not None: self.combined_runhistory.update(self.validated_runhistory, origin=DataOrigin.EXTERNAL_SAME_INSTANCES) # Create runhistory with estimated runs (create Importance-object of pimp and use epm-model for validation) self.epm_runhistory = RunHistory() self.epm_runhistory.update(self.combined_runhistory) # Initialize importance and validator self._init_pimp_and_validator() try: self._validate_default_and_incumbents("epm", self.ta_exec_dir) except KeyError as err: self.logger.debug(err, exc_info=True) msg = "Validation of default and incumbent failed. SMAC (v: {}) does not support validation of budgets+ins"\ "tances yet, if you use budgets but no instances ignore this warning.".format(str(smac_version)) if self.feature_names: self.logger.warning(msg) else: self.logger.debug(msg) # Set during execution, to share information between Analyzers self.share_information = {'parameter_importance': OrderedDict(), 'feature_importance': OrderedDict(), 'evaluators': OrderedDict(), 'validator': None, 'hpbandster_result': None, # Only for file-format BOHB }
[docs] def get_identifier(self): return self.identify(self.path_to_folder, self.reduced_to_budgets)
[docs] @classmethod def identify(cls, path, budget): path = path if path is not None else "all_folders" budget = str(budget) if budget is not None else "all_budgets" res = "_".join([path, budget]).replace('/', '_') if len(res) > len(str(hash(res))): res = str(hash(res)) return res
[docs] def get_budgets(self): return set([k.budget for k in self.original_runhistory.data.keys()])
[docs] @classmethod def from_folder(cls, folder: str, ta_exec_dir: str, options, file_format: str='SMAC3', validation_format: str='NONE', output_dir=None, ): """Initialize scenario, runhistory and incumbent from folder Parameters ---------- folder: string output-dir of this configurator-run -> this is also the 'id' for a single run in parallel optimization ta_exec_dir: string if the execution directory for the SMAC-run differs from the cwd, there might be problems loading instance-, feature- or PCS-files in the scenario-object. since instance- and PCS-files are necessary, specify the path to the execution-dir of SMAC here file_format: string from [SMAC2, SMAC3, BOHB, APT, CSV] validation_format: string from [SMAC2, SMAC3, APT, CSV, NONE], in which format to look for validated data """ logger = logging.getLogger("cave.ConfiguratorRun.{}".format(folder)) logger.debug("Loading from \'%s\' with ta_exec_dir \'%s\' with file-format '%s' and validation-format %s. ", folder, ta_exec_dir, file_format, validation_format) if file_format == 'BOHB' or file_format == "APT": logger.debug("File format is BOHB or APT, assmuming data was converted to SMAC3-format using " "HpBandSter2SMAC from cave.reader.converter.hpbandster2smac.") validation_format = validation_format if validation_format != 'NONE' else None # Read in data (scenario, runhistory & trajectory) reader = cls.get_reader(file_format, folder, ta_exec_dir) scenario = reader.get_scenario() scenario_sanity_check(scenario, logger) original_runhistory = reader.get_runhistory(scenario.cs) validated_runhistory = None if validation_format == "NONE" or validation_format is None: validation_format = None else: logger.debug('Using format %s for validation', validation_format) vali_reader = cls.get_reader(validation_format, folder, ta_exec_dir) vali_reader.scen = scenario validated_runhistory = vali_reader.get_validated_runhistory(scenario.cs) #self._check_rh_for_inc_and_def(self.validated_runhistory, 'validated runhistory') logger.info("Found validated runhistory for \"%s\" and using " "it for evaluation. #configs in validated rh: %d", folder, len(validated_runhistory.config_ids)) trajectory = reader.get_trajectory(scenario.cs) return cls(scenario, original_runhistory, validated_runhistory, trajectory, options, path_to_folder=folder, ta_exec_dir=ta_exec_dir, file_format=file_format, validation_format=validation_format, output_dir=output_dir, )
[docs] def get_incumbent(self): return self.incumbent
[docs] def _init_pimp_and_validator(self, alternative_output_dir=None, ): """ Create ParameterImportance-object and use it's trained model for validation and further predictions. We pass a combined (original + validated) runhistory, so that the returned model will be based on as much information as possible Parameters ---------- alternative_output_dir: str e.g. for budgets we want pimp to use an alternative output-dir (subfolders per budget) """ self.logger.debug("Using '%s' as output for pimp", alternative_output_dir if alternative_output_dir else self.output_dir) self.pimp = Importance(scenario=copy.deepcopy(self.scenario), runhistory=self.combined_runhistory, incumbent=self.incumbent if self.incumbent else self.default, save_folder=alternative_output_dir if alternative_output_dir is not None else self.output_dir, seed=self.rng.randint(1, 100000), max_sample_size=self.options['fANOVA'].getint("pimp_max_samples"), fANOVA_pairwise=self.options['fANOVA'].getboolean("fanova_pairwise"), preprocess=False, verbose=False, # disable progressbars in pimp... ) # Validator (initialize without trajectory) self.validator = Validator(self.scenario, None, None) self.validator.epm = self.pimp.model
[docs] @timing def _validate_default_and_incumbents(self, method, ta_exec_dir, ): """Validate default and incumbent configurations on all instances possible. Either use validation (physically execute the target algorithm) or EPM-estimate and update according runhistory (validation -> self.global_validated_rh; epm -> self.global_epm_rh). Parameters ---------- method: str epm or validation ta_exec_dir: str path from where the target algorithm can be executed as found in scenario (only used for actual validation) """ # TODO maybe just validate whole trajectory? self.logger.debug("Validating %s using %s!", self.get_identifier(), method) self.validator.traj = self.trajectory if method == "validation": with _changedir(ta_exec_dir): # TODO determine # repetitions new_rh = self.validator.validate('def+inc', 'train+test', 1, -1, runhistory=self.combined_runhistory) self.validated_runhistory.update(new_rh) self.combined_runhistory_rh.update(new_rh) elif method == "epm": # Only do test-instances if features for test-instances are available instance_mode = 'train+test' if (any([i not in self.scenario.feature_dict for i in self.scenario.test_insts]) and any([i in self.scenario.feature_dict for i in self.scenario.train_insts])): # noqa self.logger.debug("No features provided for test-instances (but for train!). Cannot validate on \"epm\".") self.logger.warning("Features detected for train-instances, but not for test-instances. This is " "unintended usage and may lead to errors for some analysis-methods.") instance_mode = 'train' new_rh = self.validator.validate_epm('def+inc', instance_mode, 1, runhistory=self.combined_runhistory) self.epm_runhistory.update(new_rh) else: raise ValueError("Missing data method illegal (%s)", method) self.validator.traj = None # Avoid usage-mistakes
[docs] def _get_feature_names(self): if not self.scenario.feature_dict: self.logger.info("No features available. Skipping feature analysis.") return feat_fn = self.scenario.feature_fn if not self.scenario.feature_names: self.logger.debug("`scenario.feature_names` is not set. Loading from '%s'", feat_fn) with _changedir(self.ta_exec_dir if self.ta_exec_dir else '.'): if not feat_fn or not os.path.exists(feat_fn): self.logger.warning("Feature names are missing. Either provide valid feature_file in scenario " "(currently %s) or set `scenario.feature_names` manually." % feat_fn) self.logger.error("Skipping Feature Analysis.") return else: # Feature names are contained in feature-file and retrieved feat_names = InputReader().read_instance_features_file(feat_fn)[0] else: feat_names = copy.deepcopy(self.scenario.feature_names) return feat_names
[docs] def _check_rh_for_inc_and_def(self, rh, name=''): """ Check if default and incumbent are evaluated on all instances in this rh Parameters ---------- rh: RunHistory runhistory to be checked name: str name for logging-purposes Returns ------- return_value: bool False if either inc or def was not evaluated on all train/test-instances """ return_value = True for c_name, c in [("default", self.default), ("inc", self.incumbent)]: runs = rh.get_runs_for_config(c, only_max_observed_budget=False) evaluated = set([inst for inst, seed in runs]) for i_name, i in [("train", self.train_inst), ("test", self.test_inst)]: not_evaluated = set(i) - evaluated if len(not_evaluated) > 0: self.logger.debug("RunHistory %s only evaluated on %d/%d %s-insts for %s in folder %s", name, len(i) - len(not_evaluated), len(i), i_name, c_name, self.folder) return_value = False return return_value
[docs] @classmethod def get_reader(cls, name, folder, ta_exec_dir): """ Returns an appropriate reader for the specified format. """ # TODO make autodetect format (here? where?) if name == 'SMAC3': return SMAC3Reader(folder, ta_exec_dir) elif name == 'BOHB': return SMAC3Reader(folder, ta_exec_dir) elif name == 'APT': return SMAC3Reader(folder, ta_exec_dir) elif name == 'SMAC2': return SMAC2Reader(folder, ta_exec_dir) elif name == 'CSV': return SMAC3Reader(folder, ta_exec_dir) else: raise ValueError("%s not supported as file-format" % name)
[docs]@contextmanager def _changedir(newdir): """ Helper function to change directory, for example to create a scenario from file, where paths to the instance- and feature-files are relative to the original SMAC-execution-directory. Same with target algorithms that need be executed for validation. """ olddir = os.getcwd() os.chdir(os.path.expanduser(newdir)) try: yield finally: os.chdir(olddir)