DQN

mighty.mighty_agents.dqn #

DQN agent.

MightyDQNAgent #

MightyDQNAgent(
    output_dir: str,
    env: MIGHTYENV,
    seed: int | None = None,
    eval_env: MIGHTYENV = None,
    learning_rate: float = 0.01,
    gamma: float = 0.9,
    epsilon: float = 0.1,
    batch_size: int = 64,
    learning_starts: int = 1,
    render_progress: bool = True,
    log_wandb: bool = False,
    wandb_kwargs: dict | None = None,
    replay_buffer_class: str
    | DictConfig
    | type[MightyReplay]
    | None = None,
    replay_buffer_kwargs: TypeKwargs | None = None,
    meta_methods: list[str | type] | None = None,
    meta_kwargs: list[TypeKwargs] | None = None,
    use_target: bool = True,
    n_units: int = 8,
    soft_update_weight: float = 0.01,
    policy_class: str
    | DictConfig
    | type[MightyExplorationPolicy]
    | None = None,
    policy_kwargs: TypeKwargs | None = None,
    q_class: str | DictConfig | type[DQN] | None = None,
    q_kwargs: TypeKwargs | None = None,
    td_update_class: type[QLearning] = QLearning,
    td_update_kwargs: TypeKwargs | None = None,
    save_replay: bool = False,
)

Bases: MightyAgent

Mighty DQN agent.

This agent implements the DQN algorithm and its extensions, as first proposed in "Playing Atari with Deep Reinforcement Learning" by Mnih et al. (2013). The Double DQN (DDQN) variant was proposed by van Hasselt et al. in "Deep Reinforcement Learning with Double Q-learning" (2016). Like all Mighty agents, it is meant to be used via the train method. By default, this agent uses an epsilon-greedy exploration policy.

Creates all relevant class variables and calls agent-specific init function

:param env: Train environment
:param eval_env: Evaluation environment
:param learning_rate: Learning rate for training
:param epsilon: Exploration factor for training
:param batch_size: Batch size for training
:param render_progress: Render progress
:param log_tensorboard: Log to tensorboard as well as to file
:param replay_buffer_class: Replay buffer class from coax replay buffers
:param replay_buffer_kwargs: Arguments for the replay buffer
:param tracer_class: Reward tracing class from coax tracers
:param tracer_kwargs: Arguments for the reward tracer
:param n_units: Number of units for Q network
:param soft_update_weight: Size of soft updates for target network
:param policy_class: Policy class from coax value-based policies
:param policy_kwargs: Arguments for the policy
:param td_update_class: Kind of TD update used from coax TD updates
:param td_update_kwargs: Arguments for the TD update
:return:
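
A minimal usage sketch based on the signature above. The environment construction is an assumption (a Gymnasium vector environment standing in for MIGHTYENV), the hyperparameter values are illustrative, and training is started via the run method documented further down this page.

import gymnasium as gym

from mighty.mighty_agents.dqn import MightyDQNAgent

# Assumption: a vectorized Gymnasium environment; the agent operates on
# batched observations, so a vector env is used here for illustration.
env = gym.vector.SyncVectorEnv([lambda: gym.make("CartPole-v1")])
eval_env = gym.vector.SyncVectorEnv([lambda: gym.make("CartPole-v1")])

agent = MightyDQNAgent(
    output_dir="runs/dqn_cartpole",
    env=env,
    eval_env=eval_env,
    seed=0,
    learning_rate=1e-3,
    gamma=0.99,
    epsilon=0.1,
    batch_size=64,
)

# Interleaves rollouts, TD updates and periodic evaluation (see run below).
results = agent.run(n_steps=10_000, eval_every_n_steps=1_000)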

Source code in mighty/mighty_agents/dqn.py
def __init__(
    self,
    output_dir: str,
    # MightyAgent Args
    env: MIGHTYENV,  # type: ignore
    seed: int | None = None,
    eval_env: MIGHTYENV = None,  # type: ignore
    learning_rate: float = 0.01,
    gamma: float = 0.9,
    epsilon: float = 0.1,
    batch_size: int = 64,
    learning_starts: int = 1,
    render_progress: bool = True,
    log_wandb: bool = False,
    wandb_kwargs: dict | None = None,
    replay_buffer_class: str | DictConfig | type[MightyReplay] | None = None,
    replay_buffer_kwargs: TypeKwargs | None = None,
    meta_methods: list[str | type] | None = None,
    meta_kwargs: list[TypeKwargs] | None = None,
    # DDQN Specific Args
    use_target: bool = True,
    n_units: int = 8,
    soft_update_weight: float = 0.01,
    policy_class: str | DictConfig | type[MightyExplorationPolicy] | None = None,
    policy_kwargs: TypeKwargs | None = None,
    q_class: str | DictConfig | type[DQN] | None = None,
    q_kwargs: TypeKwargs | None = None,
    td_update_class: type[QLearning] = QLearning,
    td_update_kwargs: TypeKwargs | None = None,
    save_replay: bool = False,
):
    # FIXME: the arguments are not complete. Double check all classes.
    """DQN initialization.

    Creates all relevant class variables and calls agent-specific init function

    :param env: Train environment
    :param eval_env: Evaluation environment
    :param learning_rate: Learning rate for training
    :param epsilon: Exploration factor for training
    :param batch_size: Batch size for training
    :param render_progress: Render progress
    :param log_tensorboard: Log to tensorboard as well as to file
    :param replay_buffer_class: Replay buffer class from coax replay buffers
    :param replay_buffer_kwargs: Arguments for the replay buffer
    :param tracer_class: Reward tracing class from coax tracers
    :param tracer_kwargs: Arguments for the reward tracer
    :param n_units: Number of units for Q network
    :param soft_update_weight: Size of soft updates for target network
    :param policy_class: Policy class from coax value-based policies
    :param policy_kwargs: Arguments for the policy
    :param td_update_class: Kind of TD update used from coax TD updates
    :param td_update_kwargs: Arguments for the TD update
    :return:
    """
    if meta_kwargs is None:
        meta_kwargs = []
    if meta_methods is None:
        meta_methods = []
    if wandb_kwargs is None:
        wandb_kwargs = {}
    self.n_units = n_units
    assert 0.0 <= soft_update_weight <= 1.0  # noqa: PLR2004
    self.soft_update_weight = soft_update_weight

    # Placeholder variables which are filled in self.initialize_agent
    self.q: DQN | None = None
    self.policy: MightyExplorationPolicy | None = None
    self.q_target: DQN | None = None
    self.qlearning: QLearning | None = None
    self.use_target = use_target

    # Q-function Class
    q_class = retrieve_class(cls=q_class, default_cls=DQN)  # type: ignore
    if q_kwargs is None:
        q_kwargs = {"n_layers": 0}  # type: ignore
    self.q_class = q_class
    self.q_kwargs = q_kwargs

    # Policy Class
    policy_class = retrieve_class(cls=policy_class, default_cls=EpsilonGreedy)  # type: ignore
    if policy_kwargs is None:
        policy_kwargs = {"epsilon": 0.1}  # type: ignore
    self.policy_class = policy_class
    self.policy_kwargs = policy_kwargs

    self.td_update_class = retrieve_class(
        cls=td_update_class, default_cls=DoubleQLearning
    )
    if td_update_kwargs is None:
        td_update_kwargs = {"gamma": gamma}  # type: ignore
    self.td_update_kwargs = td_update_kwargs
    self.save_replay = save_replay

    super().__init__(
        env=env,
        output_dir=output_dir,
        seed=seed,
        eval_env=eval_env,
        learning_rate=learning_rate,
        epsilon=epsilon,
        batch_size=batch_size,
        learning_starts=learning_starts,
        render_progress=render_progress,
        log_wandb=log_wandb,
        wandb_kwargs=wandb_kwargs,
        replay_buffer_class=replay_buffer_class,
        replay_buffer_kwargs=replay_buffer_kwargs,
        meta_methods=meta_methods,
        meta_kwargs=meta_kwargs,
    )

    self.loss_buffer = {
        "Update/loss": [],
        "Update/td_errors": [],
        "step": [],
    }
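
As the constructor shows, q_class, policy_class and td_update_class are resolved through retrieve_class and can be given as a class, a string, or a DictConfig; when omitted, the defaults above are used (a DQN head with {"n_layers": 0}, EpsilonGreedy with {"epsilon": 0.1}). A hedged sketch that only overrides the kwargs and the network width, with the environment setup assumed as in the first example:

import gymnasium as gym

from mighty.mighty_agents.dqn import MightyDQNAgent

env = gym.vector.SyncVectorEnv([lambda: gym.make("CartPole-v1")])

agent = MightyDQNAgent(
    output_dir="runs/dqn_custom",
    env=env,
    n_units=64,                      # width of the Q network
    q_kwargs={"n_layers": 1},        # default is {"n_layers": 0}
    policy_kwargs={"epsilon": 0.2},  # default is {"epsilon": 0.1}
    use_target=True,
    soft_update_weight=0.005,        # tau for the soft target update
)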

parameters property #

parameters: List

Q-function parameters.

value_function property #

value_function: DQN

Q-function.

__del__ #

__del__() -> None

Close wandb upon deletion.

Source code in mighty/mighty_agents/base_agent.py
def __del__(self) -> None:
    """Close wandb upon deletion."""
    self.env.close()  # type: ignore
    if self.log_wandb:
        wandb.finish()

adapt_hps #

adapt_hps(metrics: Dict) -> None

Set hyperparameters.

Source code in mighty/mighty_agents/dqn.py
def adapt_hps(self, metrics: Dict) -> None:
    """Set hyperparameters."""
    super().adapt_hps(metrics)
    if "hp/soft_update_weight" in metrics:
        self.soft_update_weight = metrics["hp/soft_update_weight"]
    for g in self.qlearning.optimizer.param_groups:  # type: ignore
        g["lr"] = self.learning_rate
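
A usage sketch, assuming an agent constructed as in the examples above. Hyperparameters arrive as "hp/"-prefixed entries in the metrics dictionary; this override additionally reads hp/soft_update_weight and writes the current learning rate into the optimizer's parameter groups. Which further keys the base class consumes is not shown here.

# Adjust the soft update weight (and refresh the optimizer's learning rate).
agent.adapt_hps({"hp/soft_update_weight": 0.005})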

apply_config #

apply_config(config: Dict) -> None

Apply config to agent.

Source code in mighty/mighty_agents/base_agent.py
def apply_config(self, config: Dict) -> None:
    """Apply config to agent."""
    for n in config:
        algo_name = n.split(".")[-1]
        if hasattr(self, algo_name):
            setattr(self, algo_name, config[n])
        elif hasattr(self, "_" + algo_name):
            setattr(self, "_" + algo_name, config[n])
        elif n in ["architecture", "n_units", "n_layers", "size"]:
            pass
        else:
            print(f"Trying to set hyperparameter {algo_name} which does not exist.")
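
A sketch of the config format this method expects: only the last dot-separated component of a key is used as the attribute name, with a fallback to the underscore-prefixed attribute; a handful of architecture keys are passed over, and unknown names are merely reported. The keys below are hypothetical.

config = {
    "algorithm.learning_rate": 1e-4,  # attribute name is the last component: learning_rate
    "algorithm.batch_size": 128,      # falls back to the private attribute _batch_size
}
agent.apply_config(config)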

evaluate #

evaluate(eval_env: MIGHTYENV | None = None) -> Dict

Eval agent on an environment. (Full rollouts).

:param env: The environment to evaluate on
:param episodes: The number of episodes to evaluate
:return:

Source code in mighty/mighty_agents/base_agent.py
def evaluate(self, eval_env: MIGHTYENV | None = None) -> Dict:  # type: ignore
    """Eval agent on an environment. (Full rollouts).

    :param env: The environment to evaluate on
    :param episodes: The number of episodes to evaluate
    :return:
    """

    terminated, truncated = False, False
    options: Dict = {}
    if eval_env is None:
        eval_env = self.eval_env

    state, _ = eval_env.reset(options=options)  # type: ignore
    rewards = np.zeros(eval_env.num_envs)  # type: ignore
    steps = np.zeros(eval_env.num_envs)  # type: ignore
    mask = np.zeros(eval_env.num_envs)  # type: ignore
    while not np.all(mask):
        action = self.policy(state, evaluate=True)  # type: ignore
        state, reward, terminated, truncated, _ = eval_env.step(action)  # type: ignore
        rewards += reward * (1 - mask)
        steps += 1 * (1 - mask)
        dones = np.logical_or(terminated, truncated)
        mask = np.where(dones, 1, mask)

    eval_env.close()  # type: ignore

    if isinstance(self.eval_env, DACENV) or isinstance(self.env, CARLENV):
        instance = eval_env.instance  # type: ignore
    else:
        instance = "None"

    eval_metrics = {
        "step": self.steps,
        "seed": self.seed,
        "eval_episodes": np.array(rewards) / steps,
        "mean_eval_step_reward": np.mean(rewards) / steps,
        "mean_eval_reward": np.mean(rewards),
        "instance": instance,
    }
    self.eval_buffer = update_buffer(self.eval_buffer, eval_metrics)

    # FIXME: this is the ugly I'm talking about
    if self.verbose:
        print("")
        print(
            "------------------------------------------------------------------------------"
        )
        print(
            f"""Evaluation performance after {self.steps} steps:
            {np.round(np.mean(rewards), decimals=2)}"""
        )
        print(
            f"""Evaluation performance per step after {self.steps} steps:
            {np.round(np.mean(rewards / steps), decimals=2)}"""
        )
        print(
            "------------------------------------------------------------------------------"
        )
        print("")

    if self.log_wandb:
        wandb.log(eval_metrics)

    return eval_metrics
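
A usage sketch, assuming the agent was constructed with an eval_env. The returned dictionary is also appended to the agent's eval_buffer and, if log_wandb is set, logged to wandb.

eval_metrics = agent.evaluate()
print(eval_metrics["mean_eval_reward"])       # mean return across the evaluation envs
print(eval_metrics["mean_eval_step_reward"])  # mean return normalized by episode length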

initialize_agent #

initialize_agent() -> None

General initialization of tracer and buffer for all agents.

Algorithm specific initialization like policies etc. are done in _initialize_agent

Source code in mighty/mighty_agents/base_agent.py
def initialize_agent(self) -> None:
    """General initialization of tracer and buffer for all agents.

    Algorithm specific initialization like policies etc.
    are done in _initialize_agent
    """
    self._initialize_agent()
    self.buffer = self.buffer_class(**self.buffer_kwargs)  # type: ignore

load #

load(path: str) -> None

Set the internal state of the agent, e.g. after loading.

Source code in mighty/mighty_agents/dqn.py
def load(self, path: str) -> None:
    """Set the internal state of the agent, e.g. after loading."""
    base_path = Path(path)
    q_path = base_path / "q.pt"
    q_state = torch.load(q_path)
    self.q.load_state_dict(q_state)  # type: ignore

    if self.q_target is not None:
        target_path = base_path / "q_target.pt"
        target_state = torch.load(target_path)
        self.q_target.load_state_dict(target_state)

    optimizer_path = base_path / "optimizer.pkl"
    optimizer_state_dict = torch.load(optimizer_path)["optimizer_state"]
    self.qlearning.optimizer.load_state_dict(optimizer_state_dict)  # type: ignore

    replay_path = base_path / "replay.pkl"
    if replay_path.exists():
        with replay_path.open("rb") as f:
            self.buffer = dill.load(f)
    if self.verbose:
        print(f"Loaded checkpoint at {path}")

make_checkpoint_dir #

make_checkpoint_dir(t: int) -> None

Checkpoint model.

:param T: Current timestep
:return:

Source code in mighty/mighty_agents/base_agent.py
def make_checkpoint_dir(self, t: int) -> None:
    """Checkpoint model.

    :param T: Current timestep
    :return:
    """
    self.upper_checkpoint_dir = Path(self.output_dir) / Path("checkpoints")
    if not self.upper_checkpoint_dir.exists():
        Path(self.upper_checkpoint_dir).mkdir()
    self.checkpoint_dir = self.upper_checkpoint_dir / f"{t}"
    if not self.checkpoint_dir.exists():
        Path(self.checkpoint_dir).mkdir()

run #

run(
    n_steps: int,
    eval_every_n_steps: int = 1000,
    human_log_every_n_steps: int = 5000,
    save_model_every_n_steps: int | None = 5000,
    env: MIGHTYENV = None,
) -> Dict

Run agent.

Source code in mighty/mighty_agents/base_agent.py
def run(  # noqa: PLR0915
    self,
    n_steps: int,
    eval_every_n_steps: int = 1_000,
    human_log_every_n_steps: int = 5000,
    save_model_every_n_steps: int | None = 5000,
    env: MIGHTYENV = None,  # type: ignore
) -> Dict:
    """Run agent."""
    episodes = 0
    if env is not None:
        self.env = env
    # FIXME: can we add the eval result here? Else the evals spam the command line in a pretty ugly way
    with Progress(
        "[progress.description]{task.description}",
        BarColumn(),
        "[progress.percentage]{task.percentage:>3.0f}%",
        "Remaining:",
        TimeRemainingColumn(),
        "Elapsed:",
        TimeElapsedColumn(),
        disable=not self.render_progress,
    ) as progress:
        steps_task = progress.add_task(
            "Train Steps",
            total=n_steps - self.steps,
            start=False,
            visible=False,
        )
        steps_since_eval = 0
        progress.start_task(steps_task)
        # FIXME: this is more of a question: are there cases where we don't want to reset this completely?
        # I can't think of any, can you? If yes, we should maybe add this as an optional argument
        metrics = {
            "env": self.env,
            "vf": self.value_function,  # type: ignore
            "policy": self.policy,
            "step": self.steps,
            "hp/lr": self.learning_rate,
            "hp/pi_epsilon": self._epsilon,
            "hp/batch_size": self._batch_size,
            "hp/learning_starts": self._learning_starts,
        }

        # Reset env and initialize reward sum
        curr_s, _ = self.env.reset()  # type: ignore
        if len(curr_s.squeeze().shape) == 0:
            episode_reward = [0]
        else:
            episode_reward = np.zeros(curr_s.squeeze().shape[0])  # type: ignore

        last_episode_reward = episode_reward
        if not torch.is_tensor(last_episode_reward):
            last_episode_reward = torch.tensor(last_episode_reward).float()
        progress.update(steps_task, visible=True)

        # Main loop: rollouts, training and evaluation
        while self.steps < n_steps:
            metrics["episode_reward"] = episode_reward

            # TODO Remove
            progress.stop()

            action, log_prob = self.step(curr_s, metrics)

            next_s, reward, terminated, truncated, _ = self.env.step(action)  # type: ignore
            dones = np.logical_or(terminated, truncated)

            transition_metrics = self.process_transition(
                curr_s, action, reward, next_s, dones, log_prob, metrics
            )

            metrics.update(transition_metrics)

            episode_reward += reward

            # Log everything
            t = {
                "seed": self.seed,
                "step": self.steps,
                "reward": reward,
                "action": action,
                "state": curr_s,
                "next_state": next_s,
                "terminated": terminated.astype(int),
                "truncated": truncated.astype(int),
                "mean_episode_reward": last_episode_reward.mean(),
            }
            metrics["episode_reward"] = episode_reward
            self.result_buffer = update_buffer(self.result_buffer, t)

            if self.log_wandb:
                wandb.log(t)

            for k in self.meta_modules:
                self.meta_modules[k].post_step(metrics)

            self.steps += len(action)
            metrics["step"] = self.steps
            steps_since_eval += len(action)
            for _ in range(len(action)):
                progress.advance(steps_task)

            # Update agent
            if (
                len(self.buffer) >= self._batch_size  # type: ignore
                and self.steps >= self._learning_starts
            ):
                update_kwargs = {"next_s": next_s, "dones": dones}

                metrics = self.update(metrics, update_kwargs)

            # End step
            self.last_state = curr_s
            curr_s = next_s

            # Evaluate
            if eval_every_n_steps and steps_since_eval >= eval_every_n_steps:
                steps_since_eval = 0
                self.evaluate()

            # Log to command line
            if self.steps % human_log_every_n_steps == 0 and self.verbose:
                mean_last_ep_reward = np.round(
                    np.mean(last_episode_reward), decimals=2
                )
                mean_last_step_reward = np.round(
                    np.mean(mean_last_ep_reward / len(last_episode_reward)),
                    decimals=2,
                )
                print(
                    f"""Steps: {self.steps}, Latest Episode Reward: {mean_last_ep_reward}, Latest Step Reward: {mean_last_step_reward}"""  # noqa: E501
                )

            # Save
            if (
                save_model_every_n_steps
                and self.steps % save_model_every_n_steps == 0
            ):
                self.save(self.steps)
                log_to_file(
                    self.output_dir,
                    self.result_buffer,
                    self.hp_buffer,
                    self.eval_buffer,
                    self.loss_buffer,
                )

            if np.any(dones):
                last_episode_reward = np.where(  # type: ignore
                    dones, episode_reward, last_episode_reward
                )
                episode_reward = np.where(dones, 0, episode_reward)  # type: ignore
                # End episode
                if isinstance(self.env, DACENV) or isinstance(self.env, CARLENV):
                    instance = self.env.instance  # type: ignore
                else:
                    instance = None
                metrics["instance"] = instance
                episodes += 1
                for k in self.meta_modules:
                    self.meta_modules[k].post_episode(metrics)

                # Remove rollout data from last episode
                # TODO: only do this for finished envs
                # FIXME: open todo, I think we need to use dones as a mask here
                # Proposed fix: metrics[k][:, dones] = 0
                # I don't think this is correct masking and I think we have to check the size of zeros
                for k in list(metrics.keys()):
                    if "rollout" in k:
                        del metrics[k]

                # Meta Module hooks
                for k in self.meta_modules:
                    self.meta_modules[k].pre_episode(metrics)
    log_to_file(
        self.output_dir,
        self.result_buffer,
        self.hp_buffer,
        self.eval_buffer,
        self.loss_buffer,
    )
    return metrics

save #

save(t: int) -> None

Return current agent state, e.g. for saving.

For DQN, this consists of:

- the Q network parameters
- the Q network function state
- the target network parameters
- the target network function state

:return: Agent state

Source code in mighty/mighty_agents/dqn.py
def save(self, t: int) -> None:
    """Return current agent state, e.g. for saving.

    For DQN, this consists of:
    - the Q network parameters
    - the Q network function state
    - the target network parameters
    - the target network function state

    :return: Agent state
    """
    super().make_checkpoint_dir(t)
    # Save q parameters
    q_path = self.checkpoint_dir / "q.pt"
    torch.save(self.q.state_dict(), q_path)  # type: ignore

    # Save target parameters
    if self.q_target is not None:
        target_path = self.checkpoint_dir / "q_target.pt"
        torch.save(self.q_target.state_dict(), target_path)

    # Save optimizer state
    optimizer_path = self.checkpoint_dir / "optimizer.pkl"
    torch.save(
        {"optimizer_state": self.qlearning.optimizer.state_dict()},  # type: ignore
        optimizer_path,
    )

    # Save replay buffer
    if self.save_replay:
        replay_path = self.checkpoint_dir / "replay.pkl"
        self.buffer.save(replay_path)  # type: ignore

    if self.verbose:
        print(f"Saved checkpoint at {self.checkpoint_dir}")
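
A checkpoint round trip, assuming the output_dir from the first sketch. save(t) writes into output_dir/checkpoints/<t>/ (see make_checkpoint_dir above), and load expects exactly that directory.

agent.save(t=agent.steps)  # writes q.pt and optimizer.pkl, plus q_target.pt / replay.pkl when enabled
agent.load(f"runs/dqn_cartpole/checkpoints/{agent.steps}")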

update #

update(metrics: Dict, update_kwargs: Dict) -> Dict

Update agent.

Source code in mighty/mighty_agents/base_agent.py
def update(self, metrics: Dict, update_kwargs: Dict) -> Dict:
    """Update agent."""
    for k in self.meta_modules:
        self.meta_modules[k].pre_update(metrics)

    agent_update_metrics = self.update_agent(**update_kwargs)
    metrics.update(agent_update_metrics)
    metrics = {k: np.array(v) for k, v in metrics.items()}
    metrics["step"] = self.steps

    if self.log_wandb:
        wandb.log(metrics)

    metrics["env"] = self.env
    metrics["vf"] = self.value_function  # type: ignore
    metrics["policy"] = self.policy
    for k in self.meta_modules:
        self.meta_modules[k].post_update(metrics)
    return metrics

update_agent #

update_agent(**kwargs) -> Any

Compute and apply TD update.

:param step: Current training step
:return:

Source code in mighty/mighty_agents/dqn.py
def update_agent(self, **kwargs) -> Any:  # type: ignore
    """Compute and apply TD update.

    :param step: Current training step
    :return:
    """

    transition_batch = self.buffer.sample(batch_size=self._batch_size)  # type: ignore
    preds, targets = self.qlearning.get_targets(  # type: ignore
        transition_batch, self.q, self.q_target
    )

    metrics_q = self.qlearning.apply_update(preds, targets)  # type: ignore
    metrics_q["Update/td_targets"] = targets.detach().numpy()
    metrics_q["Update/td_errors"] = (targets - preds).detach().numpy()
    loss_stats = {
        "step": self.steps,
        "Update/loss": metrics_q["Update/loss"],
        "Update/td_errors": metrics_q["Update/td_errors"].mean().item(),
        "batch_predictions": preds.mean(axis=1).detach().numpy().tolist(),
    }
    self.loss_buffer = update_buffer(self.loss_buffer, loss_stats)

    # sync target model
    if self.q_target is not None:
        for param, target_param in zip(
            self.q.parameters(),  # type: ignore
            self.q_target.parameters(),
            strict=False,
        ):
            target_param.data.copy_(
                self.soft_update_weight * param.data
                + (1 - self.soft_update_weight) * target_param.data
            )

    return metrics_q
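
For reference, the targets computed by the QLearning / DoubleQLearning updates and the soft target synchronization in the loop above correspond to the standard formulation below (a textbook summary, not taken verbatim from the source), with \tau equal to soft_update_weight:

y_t^{\mathrm{DQN}}  = r_t + \gamma \, (1 - d_t) \, \max_{a'} Q_{\theta^-}(s_{t+1}, a')
y_t^{\mathrm{DDQN}} = r_t + \gamma \, (1 - d_t) \, Q_{\theta^-}\big(s_{t+1}, \arg\max_{a'} Q_{\theta}(s_{t+1}, a')\big)
\theta^- \leftarrow \tau \, \theta + (1 - \tau) \, \theta^-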