Source code for deepcave.runs.run

# Copyright 2021-2024 The DeepCAVE Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#  noqa: D400
"""
# Run

This module provides utilities to create a new run and get its attributes.

## Classes
    - Run: Create a new run.
"""

from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Union

import json
from pathlib import Path

import ConfigSpace
import jsonlines
import numpy as np
from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
from ConfigSpace.hyperparameters.hp_components import ROUND_PLACES

from deepcave.runs import AbstractRun, Status, Trial
from deepcave.runs.objective import Objective
from deepcave.utils.compression import Encoder
from deepcave.utils.files import make_dirs
from deepcave.utils.hash import string_to_hash
from deepcave.utils.util import config_to_tuple



[docs]
class Run(AbstractRun, ABC):
    """
    Create a new run and get its attributes.

    If path is given, runs are loaded from the path.

    Properties
    ----------
    configspace : ConfigurationSpace
        The configuration space of the run.
    path : Optional[Union[str, Path]]
        The path of a run to be loaded. If path is not None, the given parameters are used.
        If path is None, the run is created from the given parameters.
    meta : Dict[str, Any]
        Contains serialized objectives and budgets.
    prefix : str
        The prefix for the id.
    meta_fn : Path
        The path to the meta data.
    configspace_fn : Path
        The path to the configuration space file.
    configs_fn : Path
        The path to the configurations file.
    origins_fn : Path
        The path to the origins file.
    history_fn : Path
        The path to the history file.
    models_dir : Path
        The path to the models directory.
    configs : Dict[int, Configuration]
        Containing the configurations.
    models : Dict[int, Optional[Union[str, "torch.nn.Module"]]]
        Contains the models.
    """

    prefix = "run"
    _initial_order: int

    def __init__(
        self,
        name: str,
        configspace: Optional[ConfigSpace.ConfigurationSpace] = None,
        objectives: Optional[Union[Objective, List[Objective]]] = None,
        meta: Optional[Dict[str, Any]] = None,
        path: Optional[Path] = None,
    ) -> None:
        super(Run, self).__init__(name)
        if objectives is None:
            objectives = []
        if meta is None:
            meta = {}

        # Reset and load configspace/path
        self.reset()
        if configspace is not None:
            self.configspace = configspace
        self.path = path
        if self.path is not None:
            self.load()
            return

        if configspace is None and path is None:
            raise RuntimeError(
                "Please provide a configspace or specify a path to load existing trials."
            )

        # Objectives
        if not isinstance(objectives, List):
            objectives = [objectives]

        serialized_objectives = []
        for objective in objectives:
            assert isinstance(objective, Objective)
            serialized_objectives += [objective.to_json()]

        # Meta
        self.meta = {"objectives": serialized_objectives, "budgets": [], "seeds": []}
        self.meta.update(meta)


[docs]
    @classmethod
    @abstractmethod
    def from_path(cls, path: Path) -> "Run":
        """
        Based on a path, return a new Run object.

        Parameters
        ----------
        path : Path
            The path to get the run from.

        Returns
        -------
        "Run"
            The run loaded from the path.
        """
        pass



[docs]
    @classmethod
    @abstractmethod
    def is_valid_run(cls, path_name: str) -> bool:
        """
        Check whether the path name belongs to a valid run.

        Parameters
        ----------
        path_name: str
            The path to check.

        Returns
        -------
        bool
            True if path is valid run.
            False otherwise.
        """
        pass


    @property
    def id(self) -> str:
        """
        Get a hash as id.

        Returns
        -------
        str
            The hashed id.
        """
        return string_to_hash(f"{self.prefix}:{self.path}")

    @property
    def path(self) -> Optional[Path]:
        """
        Return the path of the run if it exists.

        Returns
        -------
        Optional[Path]
            The path of the run.
        """
        return self._path

    @path.setter
    def path(self, value: Optional[Union[str, Path]]) -> None:
        """
        Set the paths of the run and the JSON files.

        Parameters
        ----------
        value : Optional[Union[str, Path]]
            The path for the directory.
        """
        if value is None:
            self._path = None
            return

        self._path = Path(value)
        make_dirs(self._path)

        self.meta_fn = self._path / "meta.json"
        self.configspace_fn = self._path / "configspace.json"
        self.configs_fn = self._path / "configs.json"
        self.origins_fn = self._path / "origins.json"
        self.history_fn = self._path / "history.jsonl"
        self.models_dir = self._path / "models"


[docs]
    def exists(self) -> bool:
        """
        Check if the run exists based on the internal path.

        Returns
        -------
        bool
            If run exists.
        """
        if self._path is None:
            return False

        return all(
            f.is_file()
            for f in (
                self.meta_fn,
                self.configspace_fn,
                self.configs_fn,
                self.origins_fn,
                self.history_fn,
            )
        )



[docs]
    def add(
        self,
        costs: Union[List[float], float],
        config: Union[Dict, Configuration],
        seed: int,
        budget: float = np.inf,
        start_time: float = 0.0,
        end_time: float = 0.0,
        status: Status = Status.SUCCESS,
        origin: Optional[str] = None,
        model: Union[str, "torch.nn.Module"] = None,  # type: ignore # noqa: F821
        additional: Optional[Dict] = None,
    ) -> None:
        """
        Add a trial to the run.

        If combination of config, seed, and budget already exists, it will be overwritten.
        Not successful runs are added with `None` costs.

        Parameters
        ----------
        costs : Union[List[float], float]
            Costs of the run. In case of multi-objective, a list of costs is expected.
        config : Union[Dict, Configuration]
            The corresponding configuration.
        seed : int
            Seed of the run.
        budget : float, optional
            Budget of the run. By default np.inf
        start_time : float, optional
            Start time. By default, 0.0
        end_time : float, optional
            End time. By default, 0.0
        status : Status, optional
            Status of the trial. By default, Status.SUCCESS
        origin : str, optional
            Origin of the trial. By default, None
        model : Union[str, "torch.nn.Module"], optional
            Model of the trial. By default, None
        additional : Optional[Dict], optional
            Additional information of the trial. By default, None.
            Following information is used by DeepCAVE:
            * traceback

        Raises
        ------
        RuntimeError
            If number of costs does not match number of objectives.
        ValueError
            If config id is None.
        """
        if additional is None:
            additional = {}

        if not isinstance(costs, list):
            costs = [costs]

        if len(costs) != len(self.get_objectives()):
            raise RuntimeError("Number of costs does not match number of objectives.")

        updated_objectives = []
        for i in range(len(costs)):
            cost = costs[i]
            objective = self.get_objectives()[i]

            # Update time objective here
            if objective.name == "time" and cost is None:
                costs[i] = end_time - start_time
                cost = costs[i]

            # If cost is none, replace it later with the highest cost
            if cost is not None:
                # Update bounds here
                if not objective.lock_lower and objective.lower is not None:
                    if cost < objective.lower:
                        objective.lower = cost

                if not objective.lock_upper and objective.upper is not None:
                    if cost > objective.upper:
                        objective.upper = cost

            updated_objectives += [objective.to_json()]

        self.meta["objectives"] = updated_objectives

        if isinstance(config, Configuration):
            config = dict(config)

        if config not in self.configs.values():
            config_id_len = len(self.configs)
            self.configs[config_id_len] = config
            self.origins[config_id_len] = origin
            # Use same rounding as ConfigSpace does
            self.config_id_mapping[config_to_tuple(config, ROUND_PLACES)] = config_id_len

        config_id = self.get_config_id(config)
        if config_id is None:
            raise ValueError("Config id is None.")

        trial = Trial(
            config_id=config_id,
            budget=budget,
            seed=seed,
            costs=costs,
            start_time=np.round(start_time, 2),
            end_time=np.round(end_time, 2),
            status=status,
            additional=additional,
        )

        trial_key = trial.get_key()
        if trial_key not in self.trial_keys:
            self.trial_keys[trial_key] = len(self.history)
            self.history += [trial]
        else:
            # Overwrite
            self.history[self.trial_keys[trial_key]] = trial

        # Update budgets
        if budget not in self.meta["budgets"]:
            self.meta["budgets"].append(budget)
            self.meta["budgets"].sort()

        self._update_highest_budget(config_id, budget, status)

        # Update seeds
        if seed not in self.meta["seeds"]:
            self.meta["seeds"].append(seed)
            self.meta["seeds"].sort()

        # Update models
        # Problem: The model should not be in the cache.
        # Therefore, first the model is kept as it is,
        # but remove it from the dict and save it to the disk later on.
        if model is not None:
            self.models[config_id] = model



[docs]
    def save(self, path: Union[str, Path]) -> None:
        """
        Save the run and its information.

        Parameters
        ----------
        path : Optional[Union[str, Path]]
            The path in which to save the trials.

        Raises
        ------
        RuntimeError
            If the path is not specified.
        """
        if path is None:
            raise RuntimeError("Please specify a path to save the trials.")

        self.path = Path(path)

        # Save configspace
        self.configspace.to_json(self.configspace_fn)

        # Save meta data (could be changed)
        self.meta_fn.write_text(json.dumps(self.meta, cls=Encoder, indent=4))
        self.configs_fn.write_text(json.dumps(self.configs, cls=Encoder, indent=4))
        self.origins_fn.write_text(json.dumps(self.origins, cls=Encoder, indent=4))

        # Save history
        with jsonlines.open(self.history_fn, mode="w") as f:
            for trial in self.history:
                f.write(trial.to_json())

        # TODO: Update general cache file and tell him that self.path was used
        # to save the run.
        # Then, DeepCAVE can show direct suggestions in the select path dialog.

        # Models
        if len(self.models) > 0:
            # torch is imported here, because it is not wanted as requirement.
            import torch

            # Iterate over models and save them if they are a module.
            for config_id in list(self.models.keys()):
                filename = self.models_dir / f"{str(config_id)}.pth"
                if not filename.exists():
                    make_dirs(filename)

                    model = self.models[config_id]
                    if isinstance(model, torch.nn.Module):
                        torch.save(model, filename)
                    else:
                        raise RuntimeError("Unknown model type.")

                # Remove from dict
                del self.models[config_id]



[docs]
    def load(self, path: Optional[Union[str, Path]] = None) -> None:
        """
        Load the run.

        Parameters
        ----------
        path : Optional[Union[str, Path]], optional
            The path where to load the run from.
            Default is None.

        Raises
        ------
        RuntimeError
            If the path is None.
            If the trials were not found.
        """
        self.reset()

        if path is None and self.path is None:
            raise RuntimeError("Could not load trials because path is None.")

        if path is not None:
            self.path = Path(path)

        if not self.exists():
            raise RuntimeError("Could not load trials because trials were not found.")

        # Load meta data
        self.meta = json.loads(self.meta_fn.read_text())

        # Load configspace
        self.configspace = ConfigurationSpace.from_json(self.configspace_fn)

        # Load configs
        configs = json.loads(self.configs_fn.read_text())
        # Make sure all keys are integers
        self.configs = {int(k): v for k, v in configs.items()}

        # Load origins
        origins = json.loads(self.origins_fn.read_text())
        self.origins = {int(k): v for k, v in origins.items()}

        # Make sure there is a config_id_mapping
        if not self.config_id_mapping:
            for config_id, config in self.configs.items():
                if isinstance(config, Configuration):
                    config = dict(config)
                self.config_id_mapping[config_to_tuple(config, ROUND_PLACES)] = config_id

        # Load history
        with jsonlines.open(self.history_fn) as f:
            self.history = []
            for obj in f:
                # Create trial object here
                trial = Trial(*obj)
                self.history.append(trial)

                # Also create trial_keys
                self.trial_keys[trial.get_key()] = len(self.history) - 1

                # Update highest budget
                self._update_highest_budget(trial.config_id, trial.budget, trial.status)
DeepCAVE Documentation

Source code for deepcave.runs.run