training

def dehb_target_function(config, budget, X, y, seed=None, default_budget=MAX_ESTIMATORS, cv=5, scoring=('r2',))

Target function to run while training an XGBoost model.

PARAMETER DESCRIPTION
config

The configuration to use for the XGBoost model

TYPE: Configuration

budget

The number of estimators to use for the XGBoost model

TYPE: int | float | None

X

The data to train on

TYPE: DataFrame

y

The target to train on

TYPE: Series

seed

The seed to use for the XGBoost model

TYPE: int | None DEFAULT: None

default_budget

The default budget to use if budget is None

TYPE: int DEFAULT: MAX_ESTIMATORS

cv

The number of folds to use for cross validation

TYPE: int DEFAULT: 5

scoring

The scoring metrics to use for cross validation; the first entry is treated as the primary metric

TYPE: tuple[str, ...] DEFAULT: ('r2',)

RETURNS DESCRIPTION
dict[str, Any]

A dict with the fitness, cost and info entries that DEHB expects from a target function
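
The shape of that dict, with illustrative values:

result = {
    "fitness": -0.87,  # negated primary CV score, since DEHB minimizes
    "cost": 1.92,      # wallclock seconds spent in this evaluation
    "info": {
        "score": 0.87,       # mean test score of the primary metric
        "cv_scores": {...},  # full cross_validate output, per-fold lists
        "budget": 16,        # number of estimators actually used
        "config": {...},     # the evaluated configuration as a dict
    },
}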

Source code in src/mfpbench/pd1/surrogate/training.py
def dehb_target_function(
    config: Configuration,
    budget: int | float | None,
    X: pd.DataFrame,
    y: pd.Series,
    seed: int | None = None,
    default_budget: int = MAX_ESTIMATORS,
    cv: int = 5,
    scoring: tuple[str, ...] = ("r2",),
) -> dict[str, Any]:
    """Target function to run while training an xgboost model.

    Args:
        config: The configuration to use for the XGBoost model
        budget: The number of estimators to use for the XGBoost model
        X: The data to train on
        y: The target to train on
        seed: The seed to use for the XGBoost model
        default_budget: The default budget to use if budget is None
        cv: The number of folds to use for cross validation
        scoring: The scoring metrics to use for cross validation; the first is the primary metric

    Returns:
        A dict with the fitness, cost and info entries that DEHB expects
    """
    start = time.time()

    # Not sure if this is really needed but it's in example code for dehb
    budget = default_budget if budget is None else int(budget)

    if y.name == "train_cost":
        model = XGBRegressor(
            **config,
            seed=seed,
            n_estimators=budget,
            monotone_constraints={"epoch": 1},
            n_jobs=1,
        )
    else:
        model = XGBRegressor(**config, seed=seed, n_estimators=budget, n_jobs=1)

    scores = cross_validate(
        estimator=model,
        X=X,
        y=y,
        cv=KFold(shuffle=True, random_state=seed, n_splits=cv),
        scoring=scoring,
        return_train_score=True,
    )

    primary_eval_metric = scoring[0]
    primary = np.mean(scores[f"test_{primary_eval_metric}"])

    cost = time.time() - start
    for k, v in scores.items():
        scores[k] = list(v) if isinstance(v, np.ndarray) else v

    return {
        "fitness": -primary,  # DEHB minimized
        "cost": cost,
        "info": {
            "score": primary,
            "cv_scores": scores,
            "budget": budget,
            "config": dict(config),
        },
    }
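
For a sense of the expected inputs and outputs, here is a minimal sketch of calling the target function directly on synthetic data. The hyperparameter space and column names are illustrative stand-ins; the real space comes from space() in this module:

import numpy as np
import pandas as pd
from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter

from mfpbench.pd1.surrogate.training import dehb_target_function

# Toy space standing in for the module's space(); one XGBoost parameter.
cs = ConfigurationSpace(seed=1)
cs.add_hyperparameter(UniformFloatHyperparameter("learning_rate", 0.01, 0.3))
config = cs.sample_configuration()

# Synthetic regression data; "epoch" mirrors the column named in the
# monotone constraint above, though it only matters for train_cost targets.
rng = np.random.default_rng(1)
X = pd.DataFrame({"epoch": rng.integers(1, 50, size=64), "lr": rng.uniform(size=64)})
y = pd.Series(rng.normal(size=64), name="valid_error")

result = dehb_target_function(config, budget=16, X=X, y=y, seed=1, cv=3)
print(result["fitness"], result["info"]["score"])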

def find_xgboost_surrogate(X, y, *, cv=5, seed=None, opt_time=30.0, output_path=None, n_workers=1)

Find the best XGBoost surrogate for the given data.

PARAMETER DESCRIPTION
X

The data to train on

TYPE: DataFrame

y

The target to train on

TYPE: Series

cv

The number of folds to use for cross validation

TYPE: int DEFAULT: 5

seed

The seed to use for the XGBoost model

TYPE: int | None DEFAULT: None

opt_time

The time in seconds to spend optimizing the surrogate

TYPE: float DEFAULT: 30.0

output_path

The path to save the results to

TYPE: Path | None DEFAULT: None

n_workers

The number of workers to use for DEHB

TYPE: int DEFAULT: 1

RETURNS DESCRIPTION
XGBRegressor

The XGBoost model refit on all the data with the best configuration found

Source code in src/mfpbench/pd1/surrogate/training.py
def find_xgboost_surrogate(
    X: pd.DataFrame,
    y: pd.Series,
    *,
    cv: int = 5,
    seed: int | None = None,
    opt_time: float = 30.0,
    output_path: Path | None = None,
    n_workers: int = 1,
) -> XGBRegressor:
    """Find the best XGBoost surrogate for the given data.

    Args:
        X: The data to train on
        y: The target to train on
        cv: The number of folds to use for cross validation
        seed: The seed to use for the XGBoost model
        opt_time: The time in seconds to spend optimizing the surrogate
        output_path: The path to save the results to
        n_workers: The number of workers to use for DEHB

    Returns:
        The XGBoost model refit on all the data with the best configuration found
    """
    cs = space(seed=seed)
    if output_path is None:
        timestamp = datetime.now().isoformat()
        output_path = Path(f"surrogate-opt-{y.name}-{timestamp}")

    output_path.mkdir(exist_ok=True)

    dehb_path = output_path / "dehb"
    dehb_path.mkdir(exist_ok=True)

    dehb = DEHB(
        f=dehb_target_function,
        cs=cs,
        dimensions=len(cs.get_hyperparameters()),
        min_budget=MIN_ESTIMATORS,
        max_budget=MAX_ESTIMATORS,
        n_workers=n_workers,
        output_path=str(dehb_path),
    )

    _, _, hist = dehb.run(
        total_cost=opt_time,
        verbose=True,
        save_intermediate=False,
        # kwargs
        X=X,
        y=y,
        cv=cv,
    )

    # Find the entry with the highest mean CV score and use its
    # configuration to train the final model
    infos = [info for *_, info in hist]
    best = max(infos, key=lambda i: i["score"])

    # Write out the info
    info_path = output_path / "info.json"
    best_path = output_path / "best.json"
    with info_path.open("w") as f:
        json.dump(infos, f)

    with best_path.open("w") as f:
        json.dump(best, f)

    best_config = best["config"]
    best_budget = best["budget"]

    # Train
    model = XGBRegressor(**best_config, seed=seed, n_estimators=best_budget)
    model.fit(X, y)
    return model  # type: ignore
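
An end-to-end sketch on synthetic data, assuming the defaults above. The column names and the output directory are placeholders:

from pathlib import Path

import numpy as np
import pandas as pd

from mfpbench.pd1.surrogate.training import find_xgboost_surrogate

rng = np.random.default_rng(0)
X = pd.DataFrame({"epoch": rng.integers(1, 50, size=200), "lr": rng.uniform(size=200)})
y = pd.Series(X["lr"] * X["epoch"] + rng.normal(scale=0.1, size=200), name="valid_error")

model = find_xgboost_surrogate(
    X,
    y,
    cv=3,
    seed=0,
    opt_time=30.0,  # seconds of DEHB optimization
    output_path=Path("surrogate-opt-demo"),
)
preds = model.predict(X)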