Source code for cave.reader.conversion.csv2rh

import logging
import warnings
from typing import Union

import numpy as np
import pandas as pd
from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter
from ConfigSpace.read_and_write import pcs
from ConfigSpace.util import deactivate_inactive_hyperparameters, fix_types
from smac.runhistory.runhistory import RunHistory, DataOrigin
from smac.tae import StatusType
from smac.utils.io.input_reader import InputReader

from cave.utils.io import load_csv_to_pandaframe


class CSV2RH(object):
    """Convert csv-formatted run data (file or DataFrame) into a SMAC RunHistory.

    The entry point is :meth:`read_csv_to_rh`; the remaining methods are the
    individual processing steps (config-space creation, status parsing,
    configuration- and instance-extraction).
    """

    def read_csv_to_rh(self,
                       data,
                       cs: 'Union[None, str, ConfigurationSpace]' = None,
                       id_to_config: Union[None, dict] = None,
                       train_inst: Union[None, str, list] = None,
                       test_inst: Union[None, str, list] = None,
                       instance_features: Union[None, str, dict] = None,
                       ):
        """ Interpreting a .csv-file as runhistory.
        Valid values for the header of the csv-file/DataFrame are:
        ['seed', 'cost', 'time', 'status', 'budget', 'config_id', 'instance_id'] or any
        parameter- or instance-feature-names.

        Parameters
        ----------
        data: str or pd.DataFrame
            either string to csv-formatted runhistory-file or DataFrame
            containing the same information (note: a passed DataFrame is
            modified in place, columns are cast and id-columns may be added)
        cs: str or ConfigurationSpace
            config-space to use for this runhistory (path to pcs-file or object)
        id_to_config: dict
            mapping ids to Configuration-objects
        train_inst: str or list[str]
            train instances or path to file
        test_inst: str or list[str]
            test instances or path to file
        instance_features: str or dict
            instance features as dict mapping instance-ids to feature-array or
            file to appropriately formatted instance-feature-file

        Returns
        -------
        rh: RunHistory
            runhistory with all the runs from the csv-file
        """
        self.logger = logging.getLogger(self.__module__ + '.' + self.__class__.__name__)
        self.input_reader = InputReader()
        # Strings are interpreted as paths to instance-files, everything else is used as-is
        self.train_inst = (self.input_reader.read_instance_file(train_inst)
                           if isinstance(train_inst, str) else train_inst)
        self.test_inst = (self.input_reader.read_instance_file(test_inst)
                          if isinstance(test_inst, str) else test_inst)
        feature_names = []  # names of instance-features
        if isinstance(instance_features, str):
            feature_names, instance_features = self.input_reader.read_instance_features_file(instance_features)

        # Read in data
        if isinstance(data, str):
            self.logger.debug("Detected path for csv-file ('%s')", data)
            data = load_csv_to_pandaframe(data, self.logger, apply_numeric=False)

        # Expecting header as described in docstring
        self.valid_values = ['seed', 'cost', 'time', 'status', 'budget', 'config_id', 'instance_id']

        if isinstance(cs, str):
            self.logger.debug("Reading PCS from %s", cs)
            with open(cs, 'r') as fh:
                cs = pcs.read(fh)
        elif not cs:
            self.logger.debug("No config-space provided, create from columns")
            if id_to_config:
                # All configurations are expected to share one config-space, so
                # take it from the first one (deterministic, and avoids handing
                # Configuration-objects to numpy for a random choice).
                cs = next(iter(id_to_config.values())).configuration_space
            else:
                # Every column that is neither a reserved name nor a known
                # instance-feature is interpreted as a parameter
                parameters = set(data.columns)
                parameters -= set(self.valid_values)
                parameters -= set(feature_names)
                parameters = list(parameters)
                cs = self.create_cs_from_pandaframe(data[parameters])

        parameters = cs.get_hyperparameter_names()
        if not feature_names and 'instance_id' not in data.columns:
            # Remaining unknown columns are interpreted as instance-features
            feature_names = [c for c in data.columns
                             if c.lower() not in self.valid_values and c not in parameters]

        for c in set(self.valid_values).intersection(set(data.columns)):
            # Cast to numeric where possible, keep the original value otherwise
            data[c] = data[c].apply(self._to_numeric_if_possible)

        data, id_to_config = self.extract_configs(data, cs, id_to_config)
        data, id_to_inst_feats = self.extract_instances(data, feature_names, instance_features)

        self.logger.debug("Found: seed=%s, cost=%s, time=%s, status=%s, budget=%s",
                          'seed' in data.columns, 'cost' in data.columns, 'time' in data.columns,
                          'status' in data.columns, 'budget' in data.columns)

        # Create RunHistory
        rh = RunHistory()

        def add_to_rh(row):
            new_status = self._interpret_status(row['status']) if 'status' in row else StatusType.SUCCESS
            rh.add(config=id_to_config[row['config_id']],
                   cost=row['cost'],
                   time=row['time'] if 'time' in row else -1,
                   status=new_status,
                   instance_id=row['instance_id'] if 'instance_id' in row else None,
                   seed=row['seed'] if 'seed' in row else None,
                   budget=row['budget'] if 'budget' in row else 0,
                   additional_info=None,
                   origin=DataOrigin.INTERNAL)

        # Plain loop instead of DataFrame.apply: only the side effect on rh is wanted
        for _, row in data.iterrows():
            add_to_rh(row)
        return rh

    @staticmethod
    def _to_numeric_if_possible(value):
        """Return `value` converted by pd.to_numeric, or `value` unchanged if it cannot be parsed."""
        try:
            return pd.to_numeric(value)
        except (ValueError, TypeError):
            return value

    def create_cs_from_pandaframe(self, data):
        """ Create a config-space from the columns of a DataFrame, treating
        every column as a float-parameter.

        Parameters
        ----------
        data: pd.DataFrame
            one column per parameter, values need to be numeric (or parseable as such)

        Returns
        -------
        cs: ConfigurationSpace
            one UniformFloatHyperparameter per column, ranging from
            (column-minimum - 1) to (column-maximum + 1)
        """
        # TODO use from pyimp after https://github.com/automl/ParameterImportance/issues/72 is implemented
        warnings.warn("No parameter configuration space (pcs) provided! "
                      "Interpreting all parameters as floats. This might lead "
                      "to suboptimal analysis.", RuntimeWarning)
        self.logger.debug("Interpreting as parameters: %s", data.columns)
        # Values may arrive as strings (csv is loaded without numeric conversion),
        # min/max on strings would be lexicographic and break the arithmetic below
        numeric = data.apply(pd.to_numeric)
        minima = numeric.min()  # to define ranges of hyperparameter
        maxima = numeric.max()
        cs = ConfigurationSpace(seed=42)
        for p in data.columns:
            cs.add_hyperparameter(UniformFloatHyperparameter(p, lower=minima[p] - 1, upper=maxima[p] + 1))
        return cs

    def _interpret_status(self, status, types=None):
        """ Map a status-value from the data to a StatusType.

        Parameters
        ----------
        status: str or StatusType
            status-string (case-insensitive, surrounding whitespace is ignored);
            StatusType-objects are returned unchanged
        types: dict[str:StatusType]
            optional, mapping to use (keys are expected in upper-case)

        Returns
        -------
        status: StatusType
            interpreted status-type, CRASHED if the value cannot be parsed
        """
        if isinstance(status, StatusType):
            return status
        if not types:
            types = {"SAT": StatusType.SUCCESS,
                     "UNSAT": StatusType.SUCCESS,
                     "SUCCESS": StatusType.SUCCESS,
                     "STATUSTYPE.SUCCESS": StatusType.SUCCESS,
                     "TIMEOUT": StatusType.TIMEOUT,
                     "STATUSTYPE.TIMEOUT": StatusType.TIMEOUT,
                     "CRASHED": StatusType.CRASHED,
                     "STATUSTYPE.CRASHED": StatusType.CRASHED,
                     "MEMOUT": StatusType.MEMOUT,
                     "STATUSTYPE.MEMOUT": StatusType.MEMOUT,
                     "ABORT": StatusType.ABORT,
                     "STATUSTYPE.ABORT": StatusType.ABORT,
                     }

        # str() so that non-string cells (e.g. numbers, NaN) end up in the
        # warning-branch below instead of raising an AttributeError
        status = str(status).strip().upper()
        if status in types:
            status = types[status]
        else:
            self.logger.warning("Could not parse %s as a status. Valid values "
                                "are: %s. Treating as CRASHED run.", status, types.keys())
            status = StatusType.CRASHED
        return status

    def extract_configs(self, data, cs: 'ConfigurationSpace', id_to_config=None):
        """
        After completion, every unique configuration in the data will have a
        corresponding id in the data-frame.
        The data-frame is expected to either contain a column for config-id OR
        columns for each individual hyperparameter. Parameter-names will be used
        from the provided configspace.
        If a mapping of ids to configurations already exists, it will be used.

        Parameters
        ----------
        data: pd.DataFrame
            pandas dataframe containing either a column called `config_id` or a
            column for every individual parameter
        cs: ConfigurationSpace
            config-space providing the parameter-names (and types) used to
            build configurations from the parameter-columns
        id_to_config: dict[int:Configuration]
            optional, mapping ids to Configurations (necessary when using
            `config_id`-column)

        Returns
        -------
        data: pd.DataFrame
            if no config-id-columns was there before, there is one now.
        id_to_config: dict
            mapping every id to a configuration

        Raises
        ------
        ValueError
            if a `config_id`-column is used without `id_to_config`, or if the
            data contains config-ids that are not in the mapping
        """
        if id_to_config:
            config_to_id = {conf: id_ for id_, conf in id_to_config.items()}
        else:
            id_to_config = {}
            config_to_id = {}
        parameters = cs.get_hyperparameter_names()

        if 'config_id' in data.columns and not id_to_config:
            raise ValueError("When defining configs with \"config_id\" "
                             "in header, you need to provide the argument "
                             "\"configurations\" to the CSV2RH-object - "
                             "either as a dict, mapping the id's to "
                             "Configurations or as a path to a csv-file "
                             "containing the necessary information.")

        if 'config_id' not in data.columns:
            # Map to configurations
            data['config_id'] = -1

            def add_config(row):
                # Empty cells mark parameters that are not set for this row
                values = {name: row[name] for name in parameters if row[name] != ''}
                config = deactivate_inactive_hyperparameters(fix_types(values, cs), cs)
                if config not in config_to_id:
                    # NOTE(review): len() as new id assumes existing ids are 0..n-1 - confirm
                    config_to_id[config] = len(config_to_id)
                row['config_id'] = config_to_id[config]
                return row

            data = data.apply(add_config, axis=1)
            id_to_config = {id_: conf for conf, id_ in config_to_id.items()}

        data["config_id"] = pd.to_numeric(data["config_id"])

        # Check whether all config-ids are present
        unknown_ids = set(data['config_id']) - set(id_to_config.keys())
        if len(unknown_ids) > 0:
            raise ValueError("config id {} cannot be identified (is your configurations.csv complete? Or maybe "
                             "this is a type-issue...".format(unknown_ids))

        return data, id_to_config

    def extract_instances(self, data, feature_names, features):
        """
        After completion, every unique instance in the data will have a
        corresponding id in the data-frame.
        The data-frame is expected to either contain a column for instance-id OR
        columns for each individual instance-feature.
        If a mapping of ids to instance-features already exists, it will be used.

        Parameters
        ----------
        data: pd.DataFrame
            pandas dataframe containing either a column called `instance_id` or a
            column for every individual instance-features
        feature_names: list[str]
            optional, list of feature-names (columns in data)
        features: dict[int:np.array]
            optional, mapping ids to instance-feature vectors (necessary when using
            `instance_id`-column)

        Returns
        -------
        data: pd.DataFrame
            if no instance_id-columns was there before (and feature-columns
            are available), there is one now.
        id_to_inst_feats: dict
            mapping every id to instance-features (float64-arrays)

        Raises
        ------
        ValueError
            if an `instance_id`-column is used without `features`
        """
        id_to_inst_feats = {}
        inst_feats_to_id = {}
        if features:
            # Feature-vectors are compared as tuples of strings (hashable and
            # independent of the numeric type they were read as)
            id_to_inst_feats = {i: tuple([str(f) for f in feat]) for i, feat in features.items()}
            inst_feats_to_id = {feat: i for i, feat in id_to_inst_feats.items()}

        if 'instance_id' in data.columns and not features:
            raise ValueError("Instances defined via 'instance_id'-column, but no instance features available.")
        elif 'instance_id' not in data.columns and feature_names:
            # Add new column for instance-ids
            data['instance_id'] = -1
            self.old = None

            def add_instance(row):
                row_features = tuple([str(row[idx]) for idx in feature_names])
                if row_features not in inst_feats_to_id:
                    new_id = len(inst_feats_to_id)
                    inst_feats_to_id[row_features] = new_id
                    # Store the features of this row (not the `features`-argument)
                    id_to_inst_feats[new_id] = row_features
                row['instance_id'] = inst_feats_to_id[row_features]
                self.old = row_features
                return row

            data = data.apply(add_instance, axis=1)
        else:
            self.logger.info("No instances detected.")

        id_to_inst_feats = {i: np.array(f).astype('float64') for i, f in id_to_inst_feats.items()}

        return data, id_to_inst_feats