Source code for cave.reader.smac2_reader

import csv
import os
import re
import shutil

import numpy as np
import pandas as pd
from ConfigSpace import Configuration
from ConfigSpace.util import deactivate_inactive_hyperparameters, fix_types
from smac.runhistory.runhistory import RunHistory
from smac.scenario.scenario import Scenario
from smac.utils.io.input_reader import InputReader

from cave.reader.base_reader import BaseReader, changedir
from cave.reader.conversion.csv2rh import CSV2RH
from cave.utils.io import load_csv_to_pandaframe


[docs]class SMAC2Reader(BaseReader): """Reader for SMAC2-output. The expected output-structure in the specified folder is: self.folder/ smac-output/ aclib/ state-run1/ - scenario.txt # scenario - runs_and_results(...).csv # runhistory - paramstrings(...).csv # runhistory - traj-run-(...) # trajectory validate-time-train/ # optional, if validation has been beformed - validationCallStrings(...).csv - validationRunResultLineMatrix(...).csv validate-time-test/ # optional, if validation has been beformed - validationCallStrings(...).csv - validationRunResultLineMatrix(...).csv """
[docs] def get_scenario(self): run_1_existed = os.path.exists('run_1') in_reader = InputReader() # Create Scenario (disable output_dir to avoid cluttering) scen_fn = os.path.join(self.folder, 'smac-output/aclib/state-run1/scenario.txt') if not scen_fn: scen_fn = self.get_glob_file(self.folder, 'scenario.txt') scen_dict = in_reader.read_scenario_file(scen_fn) scen_dict['output_dir'] = "" with changedir(self.ta_exec_dir): self.logger.debug("Creating scenario from \"%s\"", self.ta_exec_dir) scen = Scenario(scen_dict) if (not run_1_existed) and os.path.exists('run_1'): shutil.rmtree('run_1') self.scen = scen return scen
[docs] def get_runhistory(self, cs): """ Expects the following files: - `self.folder/smac-output/aclib/state-run1/runs_and_results(...).csv` - `self.folder/smac-output/aclib/state-run1/paramstrings(...).csv` Returns ------- rh: RunHistory runhistory """ rh_fn = self.get_glob_file(self.folder, 'runs_and_results*.csv') self.logger.debug("Runhistory loaded as csv from %s", rh_fn) configs_fn = self.get_glob_file(self.folder, 'paramstrings*.txt') self.logger.debug("Configurations loaded from %s", configs_fn) # Translate smac2 to csv csv_data = load_csv_to_pandaframe(rh_fn, self.logger) data = pd.DataFrame() data["config_id"] = csv_data["Run History Configuration ID"] data["instance_id"] = csv_data["Instance ID"].apply(lambda x: self.scen.train_insts[x-1]) data["seed"] = csv_data["Seed"] data["time"] = csv_data["Runtime"] if self.scen.run_obj == 'runtime': data["cost"] = csv_data["Runtime"] else: data["cost"] = csv_data["Run Quality"] data["status"] = csv_data["Run Result"] # Load configurations with open(configs_fn, 'r') as csv_file: csv_data = list(csv.reader(csv_file, delimiter=',', skipinitialspace=True)) id_to_config = {} for row in csv_data: config_id = int(re.match(r'^(\d*):', row[0]).group(1)) params = [re.match(r'^\d*: (.*)', row[0]).group(1)] params.extend(row[1:]) #self.logger.debug(params) matches = [re.match(r'(.*)=\'(.*)\'', p) for p in params] values = {m.group(1) : m.group(2) for m in matches} values = deactivate_inactive_hyperparameters(fix_types(values, cs), cs).get_dictionary() id_to_config[config_id] = Configuration(cs, values=values) self.id_to_config = id_to_config names, feats = self.scen.feature_names, self.scen.feature_dict rh = CSV2RH().read_csv_to_rh(data, cs=cs, id_to_config=id_to_config, train_inst=self.scen.train_insts, test_inst=self.scen.test_insts, instance_features=feats) return rh
[docs] def get_validated_runhistory(self, cs): """ Expects the following files: - `self.folder/validate-time-train/validationCallStrings(...).csv` - `self.folder/validate-time-train/validationRunResultLineMatrix(...).csv` - `self.folder/validate-time-test/validationCallStrings(...).csv` - `self.folder/validate-time-test/validationRunResultLineMatrix(...).csv` Returns ------- validated_rh: RunHistory validated runhistory """ self.logger.debug("Loading validation-data") folder = os.path.join(self.folder, 'validate-time-train') configs_fn = re.search(r'validationCallStrings.*?\.csv', str(os.listdir(folder))) if not configs_fn: self.logger.warning("Specified validation_format is \'SMAC2\', but no " "\'validationCallStrings(...).csv\'-file could be found " "in %s" % folder) return configs_fn = os.path.join(folder, configs_fn.group()) results_fn = re.search(r'validationRunResultLineMatrix.*?\.csv', str(os.listdir(folder))) if not results_fn: self.logger.warning("Specified validation_format is \'SMAC2\', but no " "\'validationRunResultLineMatrix(...).csv\'-file could be found " "in %s" % folder) return results_fn = os.path.join(folder, results_fn.group()) self.logger.debug("Configurations loaded from %s", configs_fn) self.logger.debug("Runhistory loaded as csv from %s", results_fn) # Load configurations csv_data = load_csv_to_pandaframe(configs_fn, self.logger, False) id_to_config = {} for idx, row in csv_data.iterrows(): config_id = int(row[0]) configuration = row[1].split() params = [p.lstrip('-') for p in configuration[::2]] # all odds values = [v.strip('\'') for v in configuration[1::2]] # all evens param_values = dict(zip(params, values)) param_values = deactivate_inactive_hyperparameters(fix_types(param_values, cs), cs).get_dictionary() id_to_config[config_id] = Configuration(cs, values=param_values) names, feats = self.scen.feature_names, self.scen.feature_dict # Translate smac2-validation (RunResultString-matrix) to csv csv_data = load_csv_to_pandaframe(results_fn, self.logger, delimiter='\",\"') data = pd.DataFrame() for idx, row in csv_data.iterrows(): instance, seed = row[0], row[1] for column in csv_data.columns[2:]: config_id = int(re.match(r'^Run result line of validation config #(\d*)$', column).group(1)) result = [e.strip() for e in row[column].split(',')] data = data.append({"config_id" : config_id, "instance_id" : instance, "seed" : seed, "time" : result[1], "cost" : result[1] if self.scen.run_obj == 'runtime' else result[3], "status" : result[0]}, ignore_index=True) rh = CSV2RH().read_csv_to_rh(data, cs=cs, id_to_config=id_to_config, train_inst=self.scen.train_insts, test_inst=self.scen.test_insts, instance_features=feats) self.logger.debug("%d datapoints for %d configurations found in validated rh.", len(rh.data), len(rh.get_all_configs())) return rh
[docs] def get_trajectory(self, cs): """Expects the following files: - `self.folder/smac-output/aclib/state-run1/traj-run-(...).csv` """ traj_fn = self.get_glob_file(self.folder, 'traj-run-*.txt') with open(traj_fn, 'r') as csv_file: csv_data = list(csv.reader(csv_file, delimiter=',', skipinitialspace=True)) header, csv_data = csv_data[0][:-1], np.array([csv_data[1:]])[0] csv_data = pd.DataFrame(np.delete(csv_data, np.s_[5:], axis=1), columns=header) csv_data = csv_data.apply(pd.to_numeric, errors='ignore') traj = [] def add_to_traj(row): new_entry = {} new_entry['cpu_time'] = row['CPU Time Used'] new_entry['total_cpu_time'] = None new_entry["wallclock_time"] = row['Wallclock Time'] new_entry["evaluations"] = -1 new_entry["cost"] = row["Estimated Training Performance"] new_entry["incumbent"] = self.id_to_config[row["Incumbent ID"]] new_entry["budget"] = 0 # No budget-support for SMAC2! traj.append(new_entry) csv_data.apply(add_to_traj, axis=1) return traj
[docs] @classmethod def check_for_files(cls, path): if ((os.path.isfile(os.path.join(path, 'smac-output/aclib/state-run1/scenario.txt')) or cls.get_glob_file(path, 'scenario.txt')) and cls.get_glob_file(path, 'runs_and_results*.csv', 0) and cls.get_glob_file(path, 'paramstrings*.txt', 0) and cls.get_glob_file(path, 'traj-run-*.txt', 0) ): return True return False