import copy
import logging
import os
import tempfile
from collections import OrderedDict
from contextlib import contextmanager
import numpy as np
from pimp.importance.importance import Importance
from smac.runhistory.runhistory import RunHistory, DataOrigin
from smac.utils.io.input_reader import InputReader
from smac.utils.validate import Validator
from smac import __version__ as smac_version
from cave.reader.smac2_reader import SMAC2Reader
from cave.reader.smac3_reader import SMAC3Reader
from cave.utils.helpers import scenario_sanity_check
from cave.utils.timing import timing
class ConfiguratorRun(object):
    """
    ConfiguratorRuns load and maintain information about individual configurator
    runs. There are different supported formats, like: BOHB, SMAC3, SMAC2 and CSV
    This class is responsible for providing a scenario, a runhistory and a
    trajectory and handling original/validated data appropriately.
    To create a ConfiguratorRun from a folder, use Configurator.from_folder()
    """
    def __init__(self,
                 scenario,
                 original_runhistory,
                 validated_runhistory,
                 trajectory,
                 options,
                 path_to_folder=None,
                 ta_exec_dir=None,
                 file_format=None,
                 validation_format=None,
                 reduced_to_budgets=None,
                 output_dir=None,
                 ):
        """
        Parameters
        ----------
        scenario: Scenario
            scenario
        original_runhistory, validated_runhistory: RunHistory
            runhistories containing only the original evaluated data (during optimization process) or the validated
            data where points of interest are reevaluated after the optimization process
        trajectory: List[dict]
            a trajectory of the best performing configurations at each point in time
        options: dict
            options can define a number of custom settings
        path_to_folder: str
            path to the physical folder containing the data
        ta_exec_dir: str
            path to the target-algorithm-execution-directory. This is only important for SMAC-optimized data
        file_format, validation_format: str
            will be autodetected some point soon, until then, specify the file-format (SMAC2, SMAC3, BOHB, etc...)
        reduced_to_budgets: List str int or float
            budgets, with which this cr is associated
        output_dir: str
            where to save analysis-data for this cr
        """
        self.logger = logging.getLogger("cave.ConfiguratorRun.{}".format(path_to_folder))
        # Fixed seed so that sub-sampling / pimp-seeding is reproducible across runs
        self.rng = np.random.RandomState(42)
        self.options = options
        self.path_to_folder = path_to_folder
        # [None] acts as the sentinel for "not restricted to any particular budget"
        self.reduced_to_budgets = [None] if reduced_to_budgets is None else reduced_to_budgets
        self.scenario = scenario
        self.original_runhistory = original_runhistory
        self.validated_runhistory = validated_runhistory
        self.trajectory = trajectory
        self.ta_exec_dir = ta_exec_dir
        self.file_format = file_format
        self.validation_format = validation_format
        if not output_dir:
            # No output-dir given -> fall back to a fresh temporary directory
            self.logger.debug("New outputdir")
            output_dir = tempfile.mkdtemp()
        # Each ConfiguratorRun writes into its own subfolder, keyed by its identifier
        self.output_dir = os.path.join(output_dir, 'analysis_data', self.get_identifier())
        os.makedirs(self.output_dir, exist_ok=True)
        self.default = self.scenario.cs.get_default_configuration()
        # The last trajectory entry holds the final incumbent (None if no trajectory)
        self.incumbent = self.trajectory[-1]['incumbent'] if self.trajectory else None
        self.feature_names = self._get_feature_names()
        # Create combined runhistory to collect all "real" runs
        self.combined_runhistory = RunHistory()
        self.combined_runhistory.update(self.original_runhistory, origin=DataOrigin.INTERNAL)
        if self.validated_runhistory is not None:
            self.combined_runhistory.update(self.validated_runhistory, origin=DataOrigin.EXTERNAL_SAME_INSTANCES)
        # Create runhistory with estimated runs (create Importance-object of pimp and use epm-model for validation)
        self.epm_runhistory = RunHistory()
        self.epm_runhistory.update(self.combined_runhistory)
        # Initialize importance and validator (the validator reuses pimp's trained model)
        self._init_pimp_and_validator()
        # EPM-estimate default + incumbent performance; tolerate a KeyError, which is
        # raised by SMAC for budget-based runs where validation is not supported yet
        try:
            self._validate_default_and_incumbents("epm", self.ta_exec_dir)
        except KeyError as err:
            self.logger.debug(err, exc_info=True)
            msg = "Validation of default and incumbent failed. SMAC (v: {}) does not support validation of budgets+ins"\
                  "tances yet, if you use budgets but no instances ignore this warning.".format(str(smac_version))
            # Only warn loudly when instance features are actually in use
            if self.feature_names:
                self.logger.warning(msg)
            else:
                self.logger.debug(msg)
        # Set during execution, to share information between Analyzers
        self.share_information = {'parameter_importance': OrderedDict(),
                                  'feature_importance': OrderedDict(),
                                  'evaluators': OrderedDict(),
                                  'validator': None,
                                  'hpbandster_result': None,  # Only for file-format BOHB
                                  }
[docs] def get_identifier(self):
return self.identify(self.path_to_folder, self.reduced_to_budgets)
[docs] @classmethod
def identify(cls, path, budget):
path = path if path is not None else "all_folders"
budget = str(budget) if budget is not None else "all_budgets"
res = "_".join([path, budget]).replace('/', '_')
if len(res) > len(str(hash(res))):
res = str(hash(res))
return res
[docs] def get_budgets(self):
return set([k.budget for k in self.original_runhistory.data.keys()])
[docs] @classmethod
def from_folder(cls,
folder: str,
ta_exec_dir: str,
options,
file_format: str='SMAC3',
validation_format: str='NONE',
output_dir=None,
):
"""Initialize scenario, runhistory and incumbent from folder
Parameters
----------
folder: string
output-dir of this configurator-run -> this is also the 'id' for a single run in parallel optimization
ta_exec_dir: string
if the execution directory for the SMAC-run differs from the cwd,
there might be problems loading instance-, feature- or PCS-files
in the scenario-object. since instance- and PCS-files are necessary,
specify the path to the execution-dir of SMAC here
file_format: string
from [SMAC2, SMAC3, BOHB, APT, CSV]
validation_format: string
from [SMAC2, SMAC3, APT, CSV, NONE], in which format to look for validated data
"""
logger = logging.getLogger("cave.ConfiguratorRun.{}".format(folder))
logger.debug("Loading from \'%s\' with ta_exec_dir \'%s\' with file-format '%s' and validation-format %s. ",
folder, ta_exec_dir, file_format, validation_format)
if file_format == 'BOHB' or file_format == "APT":
logger.debug("File format is BOHB or APT, assmuming data was converted to SMAC3-format using "
"HpBandSter2SMAC from cave.reader.converter.hpbandster2smac.")
validation_format = validation_format if validation_format != 'NONE' else None
# Read in data (scenario, runhistory & trajectory)
reader = cls.get_reader(file_format, folder, ta_exec_dir)
scenario = reader.get_scenario()
scenario_sanity_check(scenario, logger)
original_runhistory = reader.get_runhistory(scenario.cs)
validated_runhistory = None
if validation_format == "NONE" or validation_format is None:
validation_format = None
else:
logger.debug('Using format %s for validation', validation_format)
vali_reader = cls.get_reader(validation_format, folder, ta_exec_dir)
vali_reader.scen = scenario
validated_runhistory = vali_reader.get_validated_runhistory(scenario.cs)
#self._check_rh_for_inc_and_def(self.validated_runhistory, 'validated runhistory')
logger.info("Found validated runhistory for \"%s\" and using "
"it for evaluation. #configs in validated rh: %d",
folder, len(validated_runhistory.config_ids))
trajectory = reader.get_trajectory(scenario.cs)
return cls(scenario,
original_runhistory,
validated_runhistory,
trajectory,
options,
path_to_folder=folder,
ta_exec_dir=ta_exec_dir,
file_format=file_format,
validation_format=validation_format,
output_dir=output_dir,
)
[docs] def get_incumbent(self):
return self.incumbent
    def _init_pimp_and_validator(self,
                                 alternative_output_dir=None,
                                 ):
        """
        Create ParameterImportance-object and use it's trained model for validation and further predictions.
        We pass a combined (original + validated) runhistory, so that the returned model will be based on as much
        information as possible

        Parameters
        ----------
        alternative_output_dir: str
            e.g. for budgets we want pimp to use an alternative output-dir (subfolders per budget)
        """
        self.logger.debug("Using '%s' as output for pimp", alternative_output_dir if alternative_output_dir else
                          self.output_dir)
        # Importance trains an EPM on the combined runhistory; deepcopy keeps this
        # run's scenario-object untouched by pimp-internal modifications
        self.pimp = Importance(scenario=copy.deepcopy(self.scenario),
                               runhistory=self.combined_runhistory,
                               # Fall back to the default config if no incumbent exists yet
                               incumbent=self.incumbent if self.incumbent else self.default,
                               save_folder=alternative_output_dir if alternative_output_dir is not None else self.output_dir,
                               seed=self.rng.randint(1, 100000),
                               max_sample_size=self.options['fANOVA'].getint("pimp_max_samples"),
                               fANOVA_pairwise=self.options['fANOVA'].getboolean("fanova_pairwise"),
                               preprocess=False,
                               verbose=False,  # disable progressbars in pimp...
                               )
        # Validator (initialize without trajectory; the trajectory is assigned on demand
        # in _validate_default_and_incumbents). Reuse pimp's trained model as the EPM.
        self.validator = Validator(self.scenario, None, None)
        self.validator.epm = self.pimp.model
[docs] @timing
def _validate_default_and_incumbents(self,
method,
ta_exec_dir,
):
"""Validate default and incumbent configurations on all instances possible.
Either use validation (physically execute the target algorithm) or EPM-estimate and update according runhistory
(validation -> self.global_validated_rh; epm -> self.global_epm_rh).
Parameters
----------
method: str
epm or validation
ta_exec_dir: str
path from where the target algorithm can be executed as found in scenario (only used for actual validation)
"""
# TODO maybe just validate whole trajectory?
self.logger.debug("Validating %s using %s!", self.get_identifier(), method)
self.validator.traj = self.trajectory
if method == "validation":
with _changedir(ta_exec_dir):
# TODO determine # repetitions
new_rh = self.validator.validate('def+inc', 'train+test', 1, -1, runhistory=self.combined_runhistory)
self.validated_runhistory.update(new_rh)
self.combined_runhistory_rh.update(new_rh)
elif method == "epm":
# Only do test-instances if features for test-instances are available
instance_mode = 'train+test'
if (any([i not in self.scenario.feature_dict for i in self.scenario.test_insts]) and
any([i in self.scenario.feature_dict for i in self.scenario.train_insts])): # noqa
self.logger.debug("No features provided for test-instances (but for train!). Cannot validate on \"epm\".")
self.logger.warning("Features detected for train-instances, but not for test-instances. This is "
"unintended usage and may lead to errors for some analysis-methods.")
instance_mode = 'train'
new_rh = self.validator.validate_epm('def+inc', instance_mode, 1, runhistory=self.combined_runhistory)
self.epm_runhistory.update(new_rh)
else:
raise ValueError("Missing data method illegal (%s)", method)
self.validator.traj = None # Avoid usage-mistakes
[docs] def _get_feature_names(self):
if not self.scenario.feature_dict:
self.logger.info("No features available. Skipping feature analysis.")
return
feat_fn = self.scenario.feature_fn
if not self.scenario.feature_names:
self.logger.debug("`scenario.feature_names` is not set. Loading from '%s'", feat_fn)
with _changedir(self.ta_exec_dir if self.ta_exec_dir else '.'):
if not feat_fn or not os.path.exists(feat_fn):
self.logger.warning("Feature names are missing. Either provide valid feature_file in scenario "
"(currently %s) or set `scenario.feature_names` manually." % feat_fn)
self.logger.error("Skipping Feature Analysis.")
return
else:
# Feature names are contained in feature-file and retrieved
feat_names = InputReader().read_instance_features_file(feat_fn)[0]
else:
feat_names = copy.deepcopy(self.scenario.feature_names)
return feat_names
[docs] def _check_rh_for_inc_and_def(self, rh, name=''):
"""
Check if default and incumbent are evaluated on all instances in this rh
Parameters
----------
rh: RunHistory
runhistory to be checked
name: str
name for logging-purposes
Returns
-------
return_value: bool
False if either inc or def was not evaluated on all
train/test-instances
"""
return_value = True
for c_name, c in [("default", self.default), ("inc", self.incumbent)]:
runs = rh.get_runs_for_config(c, only_max_observed_budget=False)
evaluated = set([inst for inst, seed in runs])
for i_name, i in [("train", self.train_inst),
("test", self.test_inst)]:
not_evaluated = set(i) - evaluated
if len(not_evaluated) > 0:
self.logger.debug("RunHistory %s only evaluated on %d/%d %s-insts for %s in folder %s",
name, len(i) - len(not_evaluated), len(i), i_name, c_name, self.folder)
return_value = False
return return_value
[docs] @classmethod
def get_reader(cls, name, folder, ta_exec_dir):
""" Returns an appropriate reader for the specified format. """
# TODO make autodetect format (here? where?)
if name == 'SMAC3':
return SMAC3Reader(folder, ta_exec_dir)
elif name == 'BOHB':
return SMAC3Reader(folder, ta_exec_dir)
elif name == 'APT':
return SMAC3Reader(folder, ta_exec_dir)
elif name == 'SMAC2':
return SMAC2Reader(folder, ta_exec_dir)
elif name == 'CSV':
return SMAC3Reader(folder, ta_exec_dir)
else:
raise ValueError("%s not supported as file-format" % name)
[docs]@contextmanager
def _changedir(newdir):
""" Helper function to change directory, for example to create a scenario from file, where paths to the instance-
and feature-files are relative to the original SMAC-execution-directory. Same with target algorithms that need
be executed for validation. """
olddir = os.getcwd()
os.chdir(os.path.expanduser(newdir))
try:
yield
finally:
os.chdir(olddir)