Source code for cave.reader.conversion.csv2smac

import os
import shutil
import tempfile
from collections import OrderedDict

from ConfigSpace.read_and_write import json as pcs_json
from smac.runhistory.runhistory import RunHistory
from smac.scenario.scenario import Scenario
from smac.stats.stats import Stats
from smac.utils.io.input_reader import InputReader
from smac.utils.io.traj_logging import TrajLogger

from cave.reader.base_reader import changedir
from cave.reader.conversion.base_converter import BaseConverter
from cave.reader.conversion.csv2rh import CSV2RH
from cave.utils.helpers import get_folder_basenames
from cave.utils.io import load_config_csv, load_csv_to_pandaframe


class CSV2SMAC(BaseConverter):
    """ Converting `CSV-data <manualdoc/fileformats.html>`_ to SMAC3-format. """

    def convert(self, folders, ta_exec_dirs=None, output_dir=None, converted_dest='converted_input_data'):
        """Convert every CSV-formatted folder into a SMAC3-formatted folder and load the converted data.

        For each folder a scenario, a configuration space (written as `configspace.json`), a runhistory
        (`runhistory.json`), optionally a validated runhistory (`validated_runhistory.json`) and a trajectory
        are read and written to `output_dir/converted_dest/<basename of folder>`.

        Parameters
        ----------
        folders: List[str]
            folders containing the CSV-data (the parallel runs)
        ta_exec_dirs: List[str], optional
            directories from which the scenarios are created. Defaults to `['.']`. If the number of entries
            differs from the number of folders, the first entry is used for every folder.
        output_dir: str, optional
            directory to write the converted data to. If not specified, a temporary directory is created.
        converted_dest: str
            name of the sub-directory of `output_dir` that receives the converted folders

        Returns
        -------
        result: OrderedDict
            maps each entry of `folders` to a dict with the keys 'new_path', 'config_space', 'runhistory',
            'validated_runhistory' (None if no `validated_runhistory.csv` exists), 'scenario' and 'trajectory'
        """
        self.logger.debug("Converting CSV-data to SMAC3-data. Called with: folders=%s, ta_exec_dirs=%s, output_dir=%s, "
                          "converted_dest=%s", str(folders), str(ta_exec_dirs), str(output_dir), str(converted_dest))

        # Using temporary files for the intermediate smac-result-like format if no output_dir specified
        if not output_dir:
            output_dir = tempfile.mkdtemp()
            self.logger.debug("Temporary directory for intermediate SMAC3-results: %s", output_dir)
        if not ta_exec_dirs:
            ta_exec_dirs = ['.']
        if len(ta_exec_dirs) != len(folders):
            # Fall back to using the first execution directory for all folders
            ta_exec_dirs = [ta_exec_dirs[0] for _ in folders]

        #####################
        # Actual conversion #
        #####################
        folder_basenames = get_folder_basenames(folders)

        result = OrderedDict()
        for f, f_base, ta_exec_dir in zip(folders, folder_basenames, ta_exec_dirs):  # Those are the parallel runs
            converted_folder_path = os.path.join(output_dir, converted_dest, f_base)
            self.logger.debug("Processing folder=%s, f_base=%s, ta_exec_dir=%s. Saving to %s.",
                              f, f_base, ta_exec_dir, converted_folder_path)
            if not os.path.exists(converted_folder_path):
                self.logger.debug("%s doesn't exist. Creating...", converted_folder_path)
            # exist_ok: do not fail if the folder appears between the check above and the creation
            os.makedirs(converted_folder_path, exist_ok=True)

            # Get and write scenario
            # (todo: enhancement: make scenario-file optional (build from scratch))
            scenario_file_path = os.path.join(converted_folder_path, 'scenario.txt')
            scenario = self.get_scenario(f, ta_exec_dir=ta_exec_dir, out_path=scenario_file_path)

            # Read Configuration Space and write it as json next to the converted data
            config_space = scenario.cs
            scenario.paramfile = os.path.join(converted_folder_path, 'configspace.json')
            with open(scenario.paramfile, 'w') as new_file:
                new_file.write(pcs_json.write(config_space))

            # Read runhistory.csv and write runhistory.json(s)
            runhistory = self.get_runhistory(f, scenario, 'runhistory.csv')
            runhistory.save_json(os.path.join(converted_folder_path, 'runhistory.json'))
            try:
                validated_runhistory = self.get_runhistory(f, scenario, 'validated_runhistory.csv')
                validated_runhistory.save_json(os.path.join(converted_folder_path, 'validated_runhistory.json'))
            except FileNotFoundError:
                # The validated runhistory is optional
                validated_runhistory = None
                self.logger.debug("No file detected at \"%s\"", os.path.join(f, 'validated_runhistory.csv'))

            # Read trajectory.
            # (todo: enhancement: make trajectory-file (read it from runhistory?))
            trajectory = self.get_trajectory(f, config_space, scenario, converted_folder_path)

            # After (possibly) changing paths and options (or creating the object), (over)write to new location
            scenario.output_dir_for_this_run = converted_folder_path
            scenario.write()

            result[f] = {
                'new_path': converted_folder_path,
                'config_space': config_space,
                'runhistory': runhistory,
                'validated_runhistory': validated_runhistory,
                'scenario': scenario,
                'trajectory': trajectory,
            }

        return result

    def get_runhistory(self, folder, scenario, filename='runhistory.csv'):
        """Reads runhistory in csv-format:

        +--------------------+--------------------+------+------+------+--------+----------+
        | config_id          | instance_id        | cost | time | seed | status | (budget) |
        +====================+====================+======+======+======+========+==========+
        | name of config 1   | name of instance 1 | ...  | ...  | ...  | ...    | ...      |
        +--------------------+--------------------+------+------+------+--------+----------+
        | ...                | ...                | ...  | ...  | ...  | ...    | ...      |
        +--------------------+--------------------+------+------+------+--------+----------+

        where config_id and instance_id can also be replaced by columns for the
        individual parameters/instance features

        Parameters
        ----------
        folder: str
            folder containing the csv-file (and optionally a `configurations.csv`)
        scenario: Scenario
            scenario providing the configuration space, train- and test-instances and instance-features
        filename: str
            name of the runhistory-file within `folder`

        Sideeffect
        ----------
        Sets self.id_to_config (dict)

        Returns
        -------
        rh: RunHistory
            runhistory

        Raises
        ------
        FileNotFoundError
            if `folder/filename` does not exist
        """
        cs = scenario.cs

        rh_fn = os.path.join(folder, filename)
        if not os.path.exists(rh_fn):
            raise FileNotFoundError("Specified format is 'CSV', but no '%s'-file could be found "
                                    "in '%s'" % (filename, folder))
        self.logger.debug("Load Runhistory as csv from %s", rh_fn)

        configs_fn = os.path.join(folder, 'configurations.csv')
        if os.path.exists(configs_fn):
            self.logger.debug("Found 'configurations.csv' in %s.", folder)
            self.id_to_config = load_config_csv(configs_fn, cs, self.logger)[1]
        else:
            self.logger.debug("No 'configurations.csv' in %s.", folder)
            self.id_to_config = {}

        rh = CSV2RH().read_csv_to_rh(rh_fn,
                                     cs=cs,
                                     id_to_config=self.id_to_config,
                                     train_inst=scenario.train_insts,
                                     test_inst=scenario.test_insts,
                                     instance_features=scenario.feature_dict,
                                     )
        if not self.id_to_config:
            # No explicit mapping was provided, so use the one of the runhistory
            self.id_to_config = rh.ids_config
        return rh

    def get_trajectory(self, folder, cs, scenario, output_path):
        """Reads `folder/trajectory.csv`, expected format:

        +----------+------+----------------+-------------+-----------+
        | cpu_time | cost | wallclock_time | evaluations | config_id |
        +==========+======+================+=============+===========+
        | ...      | ...  | ...            | ...         | ...       |
        +----------+------+----------------+-------------+-----------+

        or

        +----------+------+----------------+-------------+------------+------------+-----+
        | cpu_time | cost | wallclock_time | evaluations | parameter1 | parameter2 | ... |
        +==========+======+================+=============+============+============+=====+
        | ...      | ...  | ...            | ...         | ...        | ...        | ... |
        +----------+------+----------------+-------------+------------+------------+-----+

        An optional `budget`-column is used if present (otherwise the budget of every entry is 0).

        Reads self.id_to_config, which is set by `get_runhistory`.

        Parameters
        ----------
        folder: str
            folder containing `trajectory.csv`
        cs: ConfigurationSpace
            configuration space, passed on to the extraction of the configurations
        scenario: Scenario
            scenario used to create the Stats-object of the trajectory-logger
        output_path: str
            output-directory passed to the trajectory-logger

        Sideeffect
        ----------
        Writes trajectory to trajectory-file in output-dir

        Returns
        -------
        traj: List[TrajEntry]
            Returns trajectory as list

        Raises
        ------
        FileNotFoundError
            if `folder/trajectory.csv` does not exist
        """
        traj_fn = os.path.join(folder, 'trajectory.csv')
        if not os.path.exists(traj_fn):
            raise FileNotFoundError("Specified format is 'CSV', but no '../trajectory'"
                                    "-file at '{}'.".format(traj_fn))
        csv_data = load_csv_to_pandaframe(traj_fn, self.logger, apply_numeric=False)
        traj_logger = TrajLogger(output_path, Stats(scenario))
        # Only the returned data is needed here, configurations are looked up in self.id_to_config
        csv_data, _ = CSV2RH().extract_configs(csv_data, cs, self.id_to_config)

        def add_to_traj(row):
            """ Adds a new entry to the trajectory (and logs it to the trajectory file)"""
            config_id = row['config_id']
            new_entry = {
                'cpu_time': float(row['cpu_time']),
                'total_cpu_time': None,
                'wallclock_time': float(row['wallclock_time']),
                'evaluations': int(row['evaluations']),
                'cost': float(row['cost']),
                'incumbent': self.id_to_config[config_id],
                'budget': float(row['budget']) if 'budget' in row else 0,
            }
            traj_logger.trajectory.append(new_entry)
            traj_logger._add_in_alljson_format(train_perf=new_entry['cost'],
                                               incumbent_id=config_id,
                                               incumbent=new_entry['incumbent'],
                                               budget=new_entry['budget'],
                                               ta_time_used=new_entry['cpu_time'],
                                               wallclock_time=new_entry['wallclock_time'],
                                               )

        csv_data.apply(add_to_traj, axis=1)
        return traj_logger.trajectory

    def get_scenario(self, path, ta_exec_dir=None, out_path=None):
        """Create a scenario from `path/scenario.txt`.

        Parameters
        ----------
        path: str
            folder containing `scenario.txt`
        ta_exec_dir: str, optional
            directory that is used as working directory while the Scenario-object is created, defaults to '.'
        out_path: str, optional
            value used for the `output_dir`-option of the scenario

        Returns
        -------
        scen: Scenario
            scenario
        """
        # Remember whether a 'run_1'-folder was already there, so only a newly appeared one is removed afterwards
        run_1_existed = os.path.exists('run_1')
        if ta_exec_dir is None:
            ta_exec_dir = '.'

        # Create Scenario
        in_reader = InputReader()
        scen_fn = os.path.join(path, 'scenario.txt')
        scen_dict = in_reader.read_scenario_file(scen_fn)
        scen_dict['output_dir'] = out_path

        with changedir(ta_exec_dir):
            self.logger.debug("Creating scenario from \"%s\"", ta_exec_dir)
            scen = Scenario(scen_dict)

        if (not run_1_existed) and os.path.exists('run_1'):
            shutil.rmtree('run_1')
        return scen

    @classmethod
    def check_for_files(cls, path):
        """
        Returns True if all files needed for CSV formatted results are detected in target folder

        Parameters
        ----------
        path: str
            folder to be checked for `scenario.txt`, `runhistory.csv` and `trajectory.csv`

        Returns
        -------
        bool
            True if all three files exist in `path`, False otherwise
        """
        required_files = ('scenario.txt', 'runhistory.csv', 'trajectory.csv')
        return all(os.path.isfile(os.path.join(path, fn)) for fn in required_files)