# Source code for deepcave.runs.converters.amltk

# Copyright 2021-2024 The DeepCAVE Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#  noqa: D400
"""
# AMLTKRun

This module provides utilities to create an AMLTK (AutoML Toolkit) run.

## Classes
    - AMLTKRun: Define an AMLTK run object.
"""

from typing import Optional, Sequence, Union

import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
from ConfigSpace.configuration_space import ConfigurationSpace

from deepcave.runs import Status
from deepcave.runs.objective import Objective
from deepcave.runs.run import Run
from deepcave.utils.converters import extract_config, extract_costs, extract_value
from deepcave.utils.hash import file_to_hash



# [docs]
class AMLTKRun(Run):
    """
    Define an AMLTK (AutoML Toolkit) run object.

    The run is backed by a directory that contains a ``history.parquet`` and a
    ``configspace.json`` file.

    Properties
    ----------
    path : Path
        The path to the run.
    """

    # Identifier of this run type.
    # NOTE(review): presumably consumed by the `Run` base class to label runs — confirm.
    prefix = "AMLTK"
    # NOTE(review): looks like a default ordering/priority among the available converters —
    # confirm where `_initial_order` is read.
    _initial_order = 2

    @property
    def hash(self) -> str:
        """
        Return the hash identifying the current state of the run.

        The hash is computed from the run's ``history.parquet`` file. A changed hash
        signals that the cache has to be cleared, so the cache always reflects the
        latest results of the run.

        Returns
        -------
        str
            The hash of the run, or an empty string if the run has no path.
        """
        run_dir = self.path
        if run_dir is not None:
            # The history file serves as the identity of the run
            return file_to_hash(run_dir / "history.parquet")

        return ""

    @property
    def latest_change(self) -> Union[float, int]:
        """
        Return the modification timestamp of the run's history file.

        Returns
        -------
        Union[float, int]
            The modification time of ``history.parquet``, or 0 if the run has no path.
        """
        run_dir = self.path
        if run_dir is None:
            return 0

        history_file = Path(run_dir / "history.parquet")
        return history_file.stat().st_mtime


# [docs]
    @classmethod
    def from_path(cls, path: Union[Path, str]) -> "AMLTKRun":
        """
        Based on working_dir/run_name/*, return a new trials object.

        Reads ``configspace.json`` and ``history.parquet`` from ``path``, derives the
        objectives from the ``metric:*`` columns of the history (plus an additional
        ``Time`` objective) and adds one trial per history row to a new run.

        Parameters
        ----------
        path : Union[Path, str]
            The path to base the trial object on.

        Returns
        -------
        AMLTKRun
            The AMLTK run.

        Raises
        ------
        ValueError
            If the history lacks the ``deepcave:time:start`` or ``deepcave:time:end``
            column, or if a ``metric:*`` column name does not follow the expected format.
        """
        path = Path(path)

        # Read configspace
        configspace = ConfigurationSpace.from_json(path / "configspace.json")

        history = pd.read_parquet(path / "history.parquet")

        # Start and end time of the trial need to be given via a deepcave:time:start and
        # deepcave:time:end column. This is validated once up front, not once per row.
        starttime_col = "deepcave:time:start"
        endtime_col = "deepcave:time:end"
        if starttime_col not in history.columns:
            raise ValueError(
                f"Missing DeepCAVE start time column '{starttime_col}' in history.parquet."
            )
        if endtime_col not in history.columns:
            raise ValueError(
                f"Missing DeepCAVE end time column '{endtime_col}' in history.parquet."
            )

        # The budget is encoded in the index entries of the history
        history["budget"] = history.index.map(
            lambda x: float(value) if (value := extract_value(x, "budget")) is not None else None
        )

        # Extract the objectives from the dataframe. Metric columns are named like
        # "metric:<name> [<lower>, <upper>] (<minimize|maximize>)".
        metric_pattern = re.compile(
            r"metric:(\w+) \[(-?\d+\.?\d*|[-+]inf), (-?\d+\.?\d*|[-+]inf)\] \((\w+)\)"
        )
        obj_list = []
        for metric_string in history.columns:
            if not metric_string.startswith("metric:"):
                continue

            match = metric_pattern.match(metric_string)
            if match is None:
                # Raise explicitly: an assert would be stripped when running with -O
                raise ValueError(
                    f"Cannot parse objective from history column '{metric_string}'. Expected "
                    f"format: 'metric:<name> [<lower>, <upper>] (<minimize|maximize>)'."
                )

            metric_name, lower, upper, direction = match.groups()
            obj_list.append(
                Objective(
                    name=metric_name,
                    lower=float(lower),
                    upper=float(upper),
                    optimize="upper" if direction == "maximize" else "lower",
                )
            )

        obj_list.append(Objective("Time"))

        # Let's create a new run object (via cls, so subclasses get their own type)
        run = cls(name=path.stem, configspace=configspace, objectives=obj_list, meta=None)

        # The path has to be set manually
        run._path = path

        # Every status string not listed here is mapped to Status.UNKNOWN
        status_by_name = {
            "success": Status.SUCCESS,
            "fail": Status.FAILED,
            "crashed": Status.CRASHED,
        }

        first_starttime = None
        for _, trial in history.iterrows():
            config = extract_config(trial, configspace)

            # All times are reported relative to the start of the first trial in the history
            if first_starttime is None:
                first_starttime = trial[starttime_col]

            starttime = trial[starttime_col] - first_starttime
            endtime = trial[endtime_col] - first_starttime

            status = status_by_name.get(trial["status"], Status.UNKNOWN)

            cost: Sequence[Optional[float]] = extract_costs(trial)

            if status != Status.SUCCESS:
                # Costs which failed, should not be included
                cost = [None] * len(cost)
                time = None
            else:
                time = float(endtime - starttime)

            # Round budget. A missing budget may surface as None or as NaN (pandas converts
            # None to NaN when the column also holds floats), so test with pd.isna.
            raw_budget = trial["budget"]
            budget = 0.0 if pd.isna(raw_budget) else np.round(raw_budget, 2)

            # Only attach a traceback if one was actually recorded
            traceback = trial["traceback"]
            additional_info = None if pd.isna(traceback) else {"traceback": traceback}

            run.add(
                costs=[*cost, time],
                config=config,
                budget=budget,
                seed=trial["trial_seed"],
                start_time=starttime,
                end_time=endtime,
                status=status,
                origin=None,
                additional=additional_info,
            )

        return run



# [docs]
    @classmethod
    def is_valid_run(cls, path_name: str) -> bool:
        """
        Check whether the path name belongs to a valid amltk run.

        Parameters
        ----------
        path_name: str
            The path to check.

        Returns
        -------
        bool
            True if path is valid run.
            False otherwise.
        """
        if os.path.isfile(path_name + "/history.parquet") and os.path.isfile(
            path_name + "/configspace.json"
        ):
            return True
        return False
# DeepCAVE Documentation
#
# Source code for deepcave.runs.converters.amltk