alperenunlu committed
Commit 94e16fc · verified · 1 Parent(s): 49fb85d

Push model

Files changed (28)
  1. .gitattributes +11 -0
  2. README.md +63 -0
  3. dqn_atari.py +561 -0
  4. dqn_atari_1000000.safetensors +3 -0
  5. dqn_atari_2000000.safetensors +3 -0
  6. dqn_atari_3000000.safetensors +3 -0
  7. dqn_atari_4000000.safetensors +3 -0
  8. dqn_atari_5000000.safetensors +3 -0
  9. dqn_atari_6000000.safetensors +3 -0
  10. dqn_atari_7000000.safetensors +3 -0
  11. dqn_atari_8000000.safetensors +3 -0
  12. dqn_atari_9000000.safetensors +3 -0
  13. dqn_atari_final.safetensors +3 -0
  14. events.out.tfevents.1755876604.dc81b9906985.2178.0 +3 -0
  15. hyperparameters.json +25 -0
  16. pyproject.toml +16 -0
  17. replay.mp4 +3 -0
  18. uv.lock +0 -0
  19. videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-0.mp4 +3 -0
  20. videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-1.mp4 +3 -0
  21. videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-2.mp4 +3 -0
  22. videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-3.mp4 +3 -0
  23. videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-4.mp4 +3 -0
  24. videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-5.mp4 +3 -0
  25. videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-6.mp4 +3 -0
  26. videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-7.mp4 +3 -0
  27. videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-8.mp4 +3 -0
  28. videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-9.mp4 +3 -0
.gitattributes CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-8.mp4 filter=lfs diff=lfs merge=lfs -text
+ videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-2.mp4 filter=lfs diff=lfs merge=lfs -text
+ videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-7.mp4 filter=lfs diff=lfs merge=lfs -text
+ videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-4.mp4 filter=lfs diff=lfs merge=lfs -text
+ videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-0.mp4 filter=lfs diff=lfs merge=lfs -text
+ videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-3.mp4 filter=lfs diff=lfs merge=lfs -text
+ videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-6.mp4 filter=lfs diff=lfs merge=lfs -text
+ videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-5.mp4 filter=lfs diff=lfs merge=lfs -text
+ videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-1.mp4 filter=lfs diff=lfs merge=lfs -text
+ videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-9.mp4 filter=lfs diff=lfs merge=lfs -text
+ replay.mp4 filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,63 @@
+ ---
+ tags:
+ - reinforcement-learning
+ - deep-reinforcement-learning
+ - DQN
+ - SpaceInvadersNoFrameskip-v4
+ library_name: rlhell
+ model-index:
+ - name: DQN
+   results:
+   - task:
+       type: reinforcement-learning
+       name: reinforcement-learning
+     dataset:
+       name: SpaceInvadersNoFrameskip-v4
+       type: SpaceInvadersNoFrameskip-v4
+     metrics:
+     - type: mean_reward
+       value: 1813.50 +/- 579.77
+       name: mean_reward
+       verified: false
+ ---
+
+ # (RLHell) **DQN** Agent Playing **SpaceInvadersNoFrameskip-v4**
+
+ This model was trained using [rlhell](https://github.com/alperenunlu/rlhell).
+
+ ## Command to reproduce the training
+
+ ```bash
+ curl -OL https://huggingface.co/alperenunlu/SpaceInvadersNoFrameskip-v4-dqn_atari/raw/main/dqn_atari.py
+ curl -OL https://huggingface.co/alperenunlu/SpaceInvadersNoFrameskip-v4-dqn_atari/raw/main/pyproject.toml
+ curl -OL https://huggingface.co/alperenunlu/SpaceInvadersNoFrameskip-v4-dqn_atari/raw/main/uv.lock
+ uv run dqn_atari.py
+ ```
+
+ ## Hyperparameters
+ ```python
+ {'batch_size': 32,
+  'buffer_size': 1_000_000,
+  'env_id': 'SpaceInvadersNoFrameskip-v4',
+  'eval_episodes': 10,
+  'evaluate': True,
+  'exp_name': 'dqn_atari',
+  'exploration_fraction': 0.1,
+  'final_exploration': 0.01,
+  'gamma': 0.99,
+  'hf_entity': 'alperenunlu',
+  'initial_exploration': 1,
+  'learning_rate': 0.0001,
+  'learning_start': 80_000,
+  'log_interval': 100,
+  'n_envs': 8,
+  'push_model': True,
+  'save_times': 10,
+  'seed': 0,
+  'target_network_update_frequency': 1_000,
+  'tau': 1.0,
+  'total_timesteps': 10_000_000,
+  'train_frequency': 4,
+  'video_capture_frequency': 5}
+ ```
+
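A minimal sketch of how a pushed checkpoint could be loaded back for inference, assuming `dqn_atari.py` and a checkpoint such as `dqn_atari_final.safetensors` sit in the working directory (it reuses `QNetwork` and `make_env` from the script below):

```python
import gymnasium as gym
import torch
from safetensors.torch import load_model

from dqn_atari import QNetwork, make_env

# One vectorized env with the same preprocessing/wrappers used during training
# (video capture disabled, hypothetical run name "eval").
envs = gym.vector.SyncVectorEnv(
    [make_env("SpaceInvadersNoFrameskip-v4", seed=0, idx=0, video_capture_frequency=0, run_name="eval")]
)
q_network = QNetwork(envs)
load_model(q_network, "dqn_atari_final.safetensors")  # any of the pushed checkpoints
q_network.eval()

obs, _ = envs.reset(seed=0)
with torch.no_grad():
    # Greedy action from the Q-network; obs is uint8, the network rescales internally.
    actions = q_network(torch.from_numpy(obs)).argmax(dim=1).numpy()
obs, rewards, terminations, truncations, infos = envs.step(actions)
```

`SyncVectorEnv` stands in here for the `AsyncVectorEnv` used during training, purely to keep the sketch single-process; any of the intermediate `dqn_atari_*.safetensors` checkpoints can be loaded the same way.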
dqn_atari.py ADDED
@@ -0,0 +1,561 @@
+ import os
+ import random
+ import time
+ import warnings
+ from dataclasses import dataclass
+ from datetime import datetime
+ from typing import Any, Callable, SupportsFloat
+
+ import ale_py  # noqa: F401
+ import gymnasium as gym
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.optim as optim
+ import tyro
+ from safetensors.torch import save_model
+ from torch.utils.tensorboard import SummaryWriter
+ from tqdm.auto import tqdm
+
+ warnings.filterwarnings("ignore", category=UserWarning)
+
+ device = torch.device(
+     "cuda"
+     if torch.cuda.is_available()
+     else "mps"
+     if torch.backends.mps.is_available()
+     else "cpu"
+ )
+
+
+ @dataclass
+ class HyperParams:
+     env_id: str = "SpaceInvadersNoFrameskip-v4"
+     """The ID of the environment to train on."""
+     exp_name: str = os.path.basename(__file__)[: -len(".py")]
+     """The name of the experiment, used for saving models and logs."""
+     n_envs: int = 8
+     """The number of parallel environments to run."""
+     seed: int = 0
+     """The random seed for reproducibility."""
+     total_timesteps: int = 10_000_000
+     """The total number of timesteps to train the agent."""
+     buffer_size: int = 1_000_000
+     """The size of the replay buffer to store transitions."""
+     video_capture_frequency: int = 5
+     """The interval (in episodes) to record videos of the agent's performance."""
+
+     initial_exploration: float = 1
+     """The initial exploration rate for the epsilon-greedy policy."""
+     final_exploration: float = 0.01
+     """The final exploration rate after annealing."""
+     exploration_fraction: float = 0.1
+     """The fraction of total timesteps over which to anneal the exploration rate."""
+
+     learning_start: int = 80_000
+     """The number of timesteps before starting to learn."""
+     train_frequency: int = 4
+     """The frequency (in timesteps) to update the Q-network."""
+     batch_size: int = 32
+     """The batch size for sampling from the replay buffer."""
+     gamma: float = 0.99
+     """The discount factor (gamma) for future rewards."""
+     learning_rate: float = 1e-4
+     """The learning rate for the optimizer."""
+     target_network_update_frequency: int = 1_000
+     """The frequency (in timesteps) to update the target network."""
+     tau: float = 1.0
+     """The rate at which to update the target network towards the Q-network."""
+
+     log_interval: int = 100
+     """The interval (in timesteps) to log training statistics."""
+     save_times: int = 10
+     """The number of times to save the model during training."""
+
+     evaluate: bool = True
+     """Whether to evaluate the agent after training."""
+     eval_episodes: int = 10
+     """The number of episodes to run for evaluation."""
+
+     push_model: bool = True
+     hf_entity: str = "alperenunlu"
+
+
+ class ReplayBuffer:
+     def __init__(
+         self,
+         buffer_size: int,
+         observation_space: gym.Space,
+         action_space: gym.Space,
+         device: torch.device | str = "auto",
+         n_envs: int = 1,
+         optimize_memory_usage: bool = True,
+     ) -> None:
+         self.buffer_size = max(buffer_size // n_envs, 1)
+         self.n_envs = n_envs
+         self.optimize_memory_usage = optimize_memory_usage
+
+         self.obs_shape = self.get_obs_shape(observation_space)
+         self.action_dim = self.get_action_dim(action_space)
+
+         self.device = self.get_device(device)
+
+         self.observations = np.zeros(
+             (self.buffer_size, self.n_envs, *self.obs_shape),
+             dtype=observation_space.dtype,
+         )
+         if not self.optimize_memory_usage:
+             self.next_observations = np.zeros(
+                 (self.buffer_size, self.n_envs, *self.obs_shape),
+                 dtype=observation_space.dtype,
+             )
+         else:
+             self.next_observations = None
+
+         self.actions = np.zeros(
+             (self.buffer_size, self.n_envs, self.action_dim),
+             dtype=action_space.dtype,
+         )
+         self.rewards = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32)
+         self.dones = np.zeros((self.buffer_size, self.n_envs), dtype=np.float32)
+
+         self.index = 0
+         self.size = 0
+
+     def add(
+         self,
+         obs: np.ndarray,
+         next_obs: np.ndarray,
+         actions: np.ndarray,
+         rewards: np.ndarray,
+         dones: np.ndarray,
+     ) -> None:
+         """Add a new transition to the replay buffer."""
+         self.observations[self.index] = np.asarray(obs)
+
+         if self.optimize_memory_usage:
+             # Store next_obs at the *next* slot to save memory
+             self.observations[(self.index + 1) % self.buffer_size] = np.asarray(
+                 next_obs
+             )
+         else:
+             self.next_observations[self.index] = np.asarray(next_obs)
+
+         self.actions[self.index] = np.asarray(actions).reshape(
+             self.n_envs, self.action_dim
+         )
+         self.rewards[self.index] = np.asarray(rewards)
+         self.dones[self.index] = np.asarray(dones)
+
+         self.index = (self.index + 1) % self.buffer_size
+         self.size = min(self.size + 1, self.buffer_size)
+
+     def sample(self, batch_size: int) -> dict[str, torch.Tensor]:
+         """Sample a batch of data from the replay buffer."""
+         if not self.optimize_memory_usage:
+             batch_idx = np.random.randint(0, self.size, size=batch_size)
+         else:
+             # Do not sample the write index because its (obs,next_obs) pair is invalid
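+             # (With the memory optimization, observations[self.index] was just
+             # overwritten with the newest next_obs, while actions/rewards/dones at
+             # that slot still belong to a transition from one full cycle ago.)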
+             if self.size == self.buffer_size:
+                 batch_idx = (
+                     np.random.randint(1, self.buffer_size, size=batch_size) + self.index
+                 ) % self.buffer_size
+             else:
+                 batch_idx = np.random.randint(0, self.index, size=batch_size)
+
+         env_idx = np.random.randint(0, self.n_envs, size=batch_size)
+
+         if self.optimize_memory_usage:
+             next_obs = self.observations[
+                 (batch_idx + 1) % self.buffer_size, env_idx, ...
+             ]
+         else:
+             next_obs = self.next_observations[batch_idx, env_idx, ...]
+
+         data = dict(
+             obs=self.observations[batch_idx, env_idx, ...],
+             next_obs=next_obs,
+             actions=self.actions[batch_idx, env_idx, ...],
+             rewards=self.rewards[batch_idx, env_idx, ...],
+             dones=self.dones[batch_idx, env_idx, ...],
+         )
+         return self.to_torch(data)
+
+     def to_torch(self, data: dict[str, np.ndarray]) -> dict[str, torch.Tensor]:
+         """Convert numpy arrays to torch tensors and move them to the specified device."""
+         tensor_data = dict()
+         for k, v in data.items():
+             tensor_data[k] = torch.from_numpy(v).to(device=self.device)
+         return tensor_data
+
+     @staticmethod
+     def get_device(device: torch.device | str = "auto") -> torch.device:
+         """Get the device to use for computations."""
+         if device == "auto":
+             return torch.device(
+                 "cuda"
+                 if torch.cuda.is_available()
+                 else "mps"
+                 if torch.backends.mps.is_available()
+                 else "cpu"
+             )
+         else:
+             return torch.device(device)
+
+     @staticmethod
+     def get_obs_shape(
+         observation_space: gym.Space,
+     ) -> tuple[int, ...]:
+         """Get the shape of the observation space."""
+         if isinstance(observation_space, gym.spaces.Box):
+             return observation_space.shape
+         elif isinstance(observation_space, gym.spaces.Discrete):
+             return (1,)
+         elif isinstance(observation_space, gym.spaces.MultiDiscrete):
+             return (int(len(observation_space.nvec)),)
+         elif isinstance(observation_space, gym.spaces.MultiBinary):
+             return observation_space.shape
+         else:
+             raise NotImplementedError(
+                 f"{observation_space} observation space is not supported"
+             )
+
+     @staticmethod
+     def get_action_dim(action_space: gym.spaces.Space) -> int:
+         """Get the dimension of the action space."""
+         if isinstance(action_space, gym.spaces.Box):
+             return int(np.prod(action_space.shape))
+         elif isinstance(action_space, gym.spaces.Discrete):
+             return 1
+         elif isinstance(action_space, gym.spaces.MultiDiscrete):
+             return int(len(action_space.nvec))
+         else:
+             raise NotImplementedError(f"{action_space} action space is not supported")
+
+
+ class FireResetEnv(gym.Wrapper[np.ndarray, int, np.ndarray, int]):
+     """
+     Take action on reset for environments that are fixed until firing.
+
+     :param env: Environment to wrap
+     """
+
+     def __init__(self, env: gym.Env) -> None:
+         super().__init__(env)
+         assert env.unwrapped.get_action_meanings()[1] == "FIRE"  # type: ignore[attr-defined]
+         assert len(env.unwrapped.get_action_meanings()) >= 3  # type: ignore[attr-defined]
+
+     def reset(self, **kwargs) -> tuple[np.ndarray, dict[str, Any]]:
+         self.env.reset(**kwargs)
+         obs, _, terminated, truncated, info = self.env.step(1)
+         if terminated or truncated:
+             self.env.reset(**kwargs)
+         obs, _, terminated, truncated, info = self.env.step(2)
+         if terminated or truncated:
+             self.env.reset(**kwargs)
+         return obs, info
+
+
+ class EpisodicLifeEnv(gym.Wrapper[np.ndarray, int, np.ndarray, int]):
+     """
+     Make end-of-life == end-of-episode, but only reset on true game over.
+     Done by DeepMind for the DQN and co. since it helps value estimation.
+
+     :param env: Environment to wrap
+     """
+
+     def __init__(self, env: gym.Env) -> None:
+         super().__init__(env)
+         self.lives = 0
+         self.was_real_done = True
+
+     def step(
+         self, action: int
+     ) -> tuple[np.ndarray, SupportsFloat, bool, bool, dict[str, Any]]:
+         obs, reward, terminated, truncated, info = self.env.step(action)
+         self.was_real_done = terminated or truncated
+         lives = self.env.unwrapped.ale.lives()  # type: ignore[attr-defined]
+         if 0 < lives < self.lives:
+             terminated = True
+         self.lives = lives
+         return obs, reward, terminated, truncated, info
+
+     def reset(self, **kwargs) -> tuple[np.ndarray, dict[str, Any]]:
+         """
+         Calls the Gym environment reset, only when lives are exhausted.
+         This way all states are still reachable even though lives are episodic,
+         and the learner need not know about any of this behind-the-scenes.
+
+         :param kwargs: Extra keywords passed to env.reset() call
+         :return: the first observation of the environment
+         """
+         if self.was_real_done:
+             obs, info = self.env.reset(**kwargs)
+         else:
+             obs, _, terminated, truncated, info = self.env.step(0)
+
+             if terminated or truncated:
+                 obs, info = self.env.reset(**kwargs)
+         self.lives = self.env.unwrapped.ale.lives()  # type: ignore[attr-defined]
+         return obs, info
+
+
+ def make_env(
+     env_id: str, seed: int, idx: int, video_capture_frequency: int, run_name: str
+ ) -> Callable[[], gym.Env]:
+     """Create a gym environment with specific configurations.
+
+     Args:
+         env_id (str): The ID of the environment to create.
+         seed (int): The seed for random number generation.
+         idx (int): The index of the environment (for vectorized environments).
+         video_capture_frequency (int): Frequency of recording videos (0 to disable).
+         run_name (str): The name of the run for saving videos.
+
+     Returns:
+         Callable[[], gym.Env]: A function that returns the configured environment.
+     """
+
+     def _thunk() -> gym.Env:
+         if video_capture_frequency > 0 and idx == 0:
+             env = gym.make(env_id, render_mode="rgb_array")
+             env = gym.wrappers.RecordVideo(
+                 env,
+                 video_folder=f"videos/{run_name}",
+                 episode_trigger=lambda x: x % video_capture_frequency == 0,
+                 name_prefix=env_id,
+             )
+         else:
+             env = gym.make(env_id)
+         env = gym.wrappers.RecordEpisodeStatistics(env)
+         env = gym.wrappers.AtariPreprocessing(
+             env,
+             noop_max=30,
+             frame_skip=4,
+             screen_size=(84, 84),
+             terminal_on_life_loss=False,
+             grayscale_obs=True,
+             grayscale_newaxis=False,
+             scale_obs=False,
+         )
+         env = EpisodicLifeEnv(env)
+         if "FIRE" in env.unwrapped.get_action_meanings():  # type: ignore[attr-defined]
+             env = FireResetEnv(env)
+         env = gym.wrappers.ClipReward(env, -1, +1)
+         env = gym.wrappers.FrameStackObservation(env, stack_size=4)
+         env.action_space.seed(seed)
+         return env
+
+     return _thunk
+
+
+ class QNetwork(nn.Module):
+     def __init__(self, env) -> None:
+         super().__init__()
+         self.network = nn.Sequential(
+             nn.Conv2d(4, 32, 8, stride=4),
+             nn.ReLU(inplace=True),
+             nn.Conv2d(32, 64, 4, stride=2),
+             nn.ReLU(inplace=True),
+             nn.Conv2d(64, 64, 3, stride=1),
+             nn.ReLU(inplace=True),
+             nn.Flatten(),
+             nn.Linear(7 * 7 * 64, 512),
+             nn.ReLU(inplace=True),
+             nn.Linear(512, env.single_action_space.n),
+         )
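+         # 84x84 input -> 20x20 after conv1 (kernel 8, stride 4) -> 9x9 after conv2
+         # (kernel 4, stride 2) -> 7x7 after conv3 (kernel 3, stride 1), hence 7 * 7 * 64.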
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         return self.network(x / 255.0)
+
+
+ def linear_schedule(t: int, start_e: float, end_e: float, duration: int) -> float:
+     """Linear annealing from start_e to end_e over duration steps.
+
+     Args:
+         t (int): Current step.
+         start_e (float): Initial exploration rate.
+         end_e (float): Final exploration rate.
+         duration (int): Duration of the annealing in steps.
+
+     Returns:
+         float: The exploration rate at step t.
+     """
+     slope = (end_e - start_e) / duration
+     return max(slope * t + start_e, end_e)
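+     # With this file's defaults (start_e=1.0, end_e=0.01, duration=0.1 * 10_000_000),
+     # epsilon is 0.505 at t=500_000 and stays clamped at 0.01 once t >= 1_000_000.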
+
+
+ def main() -> None:
+     args = tyro.cli(HyperParams)
+     run_name = f"{args.env_id}_{args.exp_name.replace('/', '_')}_{args.seed}_{datetime.now().strftime('%y%m%d_%H%M%S')}"
+     print(run_name)
+     writer = SummaryWriter(f"runs/{run_name}")
+     keyval_str = "\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])
+     writer.add_text(
+         "hyperparameters",
+         f"|param|value|\n|-|-|\n{keyval_str}",
+     )
+
+     random.seed(args.seed)
+     np.random.seed(args.seed)
+     torch.manual_seed(args.seed)
+
+     envs = gym.vector.AsyncVectorEnv(
+         [
+             make_env(
+                 args.env_id, args.seed + i, i, args.video_capture_frequency, run_name
+             )
+             for i in range(int(args.n_envs))
+         ],
+         autoreset_mode=gym.vector.AutoresetMode.DISABLED,
+     )
+     assert isinstance(envs.single_action_space, gym.spaces.Discrete)
+     envs.action_space.seed(args.seed)
+
+     q_network = QNetwork(envs).to(device)
+     optimizer = optim.Adam(q_network.parameters(), lr=args.learning_rate)
+     target_network = QNetwork(envs).to(device)
+     target_network.load_state_dict(q_network.state_dict())
+     target_network.eval()
+
+     rb = ReplayBuffer(
+         args.buffer_size,
+         envs.single_observation_space,
+         envs.single_action_space,
+         device=device,
+         n_envs=envs.num_envs,
+     )
+
+     start_time = time.time()
+
+     obs, _ = envs.reset(seed=args.seed)
+     step_pbar = tqdm(total=args.total_timesteps)
+     postfix_dict = dict()
+     for step in range(0, args.total_timesteps, envs.num_envs):
+         epsilon = linear_schedule(
+             step,
+             args.initial_exploration,
+             args.final_exploration,
+             int(args.total_timesteps * args.exploration_fraction),
+         )
+         writer.add_scalar("charts/epsilon", epsilon, step)
+         postfix_dict.update(e=epsilon)
+
+         with torch.no_grad():
+             q_values = q_network(torch.from_numpy(obs).to(device))
+             greedy_actions = torch.argmax(q_values, dim=1).cpu().numpy()
+
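+         # Vectorized epsilon-greedy: each env independently takes a random action
+         # with probability epsilon, otherwise the greedy (argmax-Q) action.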
+         randn_actions = envs.action_space.sample()
+         mask_actions = np.random.rand(envs.num_envs) < epsilon
+         actions = np.where(mask_actions, randn_actions, greedy_actions)
+
+         next_obs, rewards, terminations, truncations, infos = envs.step(actions)
+
+         if "episode" in infos:
+             mask = infos["_episode"]
+             r_mean = infos["episode"]["r"][mask].mean()
+             l_mean = infos["episode"]["l"][mask].mean()
+
+             writer.add_scalar("charts/episodic_return", r_mean, step)
+             writer.add_scalar("charts/episodic_length", l_mean, step)
+             postfix_dict.update(
+                 r=r_mean,
+                 l=l_mean,
+             )
+
+         rb.add(obs, next_obs, actions, rewards, terminations)
+
+         dones = np.logical_or(terminations, truncations)
+         if dones.any():
+             obs, _ = envs.reset(options={"reset_mask": dones})
+         else:
+             obs = next_obs
+
+         for parallel_step in range(
+             step, min(step + envs.num_envs, args.total_timesteps)
+         ):
+             if parallel_step > args.learning_start:
+                 if parallel_step % args.train_frequency == 0:
+                     data = rb.sample(args.batch_size)
+                     with torch.no_grad():
+                         target_max, _ = target_network(data["next_obs"]).max(dim=1)
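+                         # Bellman target: y = r + gamma * max_a' Q_target(s', a') * (1 - done)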
+                         td_target = data[
+                             "rewards"
+                         ].flatten() + args.gamma * target_max * (
+                             1 - data["dones"].flatten()
+                         )
+                     q_val = q_network(data["obs"]).gather(1, data["actions"]).squeeze()
+                     loss = F.smooth_l1_loss(q_val, td_target)
+
+                     if step % args.log_interval < envs.num_envs:
+                         writer.add_scalar("losses/td_loss", loss, parallel_step)
+                         writer.add_scalar(
+                             "losses/q_values", q_val.mean().item(), parallel_step
+                         )
+                         writer.add_scalar(
+                             "charts/SPS",
+                             step // (time.time() - start_time),
+                             parallel_step,
+                         )
+                         postfix_dict.update(
+                             td_loss=loss.item(),
+                             q_val_mean=q_val.mean().item(),
+                             sps=step // (time.time() - start_time),
+                         )
+
+                     optimizer.zero_grad()
+                     loss.backward()
+                     optimizer.step()
+
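+                 # With tau=1.0 (the default here), lerp_ copies the online weights
+                 # outright, i.e. a hard target-network update.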
+                 if parallel_step % args.target_network_update_frequency == 0:
+                     for target_network_param, q_network_param in zip(
+                         target_network.parameters(), q_network.parameters()
+                     ):
+                         target_network_param.data.lerp_(q_network_param.data, args.tau)
+
+                 if parallel_step % (args.total_timesteps // args.save_times) == 0:
+                     save_model(
+                         model=q_network,
+                         filename=f"runs/{run_name}/{args.exp_name}_{step}.safetensors",
+                     )
+         step_pbar.set_postfix(postfix_dict)
+         step_pbar.update(envs.num_envs)
+     envs.close()
+     step_pbar.close()
+
+     if args.evaluate:
+         run_name_eval = f"{run_name}_eval"
+         final_model_path = f"runs/{run_name}/{args.exp_name}_final.safetensors"
+         save_model(model=q_network, filename=final_model_path)
+         from evals.dqn_eval import evaluate
+
+         episode_rewards = evaluate(
+             final_model_path=final_model_path,
+             make_env=make_env,
+             env_id=args.env_id,
+             run_name_eval=run_name_eval,
+             QNetwork=QNetwork,
+             device=device,
+             eval_episodes=args.eval_episodes,
+             epsilon=args.final_exploration,
+         )
+         for i, r in enumerate(episode_rewards):
+             writer.add_scalar("eval/episodic_return_eval", r, i)
+
+         if args.push_model:
+             from utils import push_model
+
+             push_model(
+                 args=args,
+                 episode_rewards=episode_rewards,
+                 algo_name="DQN",
+                 run_path=f"runs/{run_name}",
+                 video_folder_path=f"videos/{run_name_eval}",
+             )
+
+     writer.close()
+
+
+ if __name__ == "__main__":
+     main()
dqn_atari_1000000.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7dd068380a83b737c2f6ba15ce7b54742a97e328733448bb6052bcc76d1abac
+ size 6749632
dqn_atari_2000000.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4ed5df14bf26a840e1090978e037112bbb02a0f56d3ef229d1c8885082abc1e
+ size 6749632
dqn_atari_3000000.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7f51ada39ff7512a9180606dc367d7e5fe7a7c02692f645e3f5f91d1e937b3e4
+ size 6749632
dqn_atari_4000000.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1bfc97d59160b805d1dd56a0372d8634dbcb7fb17b7ba1ba7239c533a5f7a001
+ size 6749632
dqn_atari_5000000.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:852b6388c16dec1cece86432c9cb1b159263b6a7d33ee84a212de7685cf3148c
+ size 6749632
dqn_atari_6000000.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:33978695e05d872451f0262f3a3381da1e36fa721de54e98bd3850740612e936
+ size 6749632
dqn_atari_7000000.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6175016bb27fa7fd4b65268a1e9dd3791590ef11525f8805c01a608ba1e1a389
+ size 6749632
dqn_atari_8000000.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e74417b7e7fc1025fb6ab60bafda4b26eacb59f5207893f9930f07a8488f1977
+ size 6749632
dqn_atari_9000000.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a416598792b7b6968310f07ad932efdb8e12ba5eee63ce13e0df5f40d06703f
+ size 6749632
dqn_atari_final.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6caadb9e62f0806a480f84d4c90ec8f3e7b96dd1f1fb51b7c75223cb936222a2
+ size 6749632
events.out.tfevents.1755876604.dc81b9906985.2178.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9b4ac76ec60c8469e0d0183cefc32741464287ced7fec20e2daca8bbd816b6f8
+ size 101547150
hyperparameters.json ADDED
@@ -0,0 +1,25 @@
+ {
+     "env_id": "SpaceInvadersNoFrameskip-v4",
+     "exp_name": "dqn_atari",
+     "n_envs": 8,
+     "seed": 0,
+     "total_timesteps": 10000000,
+     "buffer_size": 1000000,
+     "video_capture_frequency": 5,
+     "initial_exploration": 1,
+     "final_exploration": 0.01,
+     "exploration_fraction": 0.1,
+     "learning_start": 80000,
+     "train_frequency": 4,
+     "batch_size": 32,
+     "gamma": 0.99,
+     "learning_rate": 0.0001,
+     "target_network_update_frequency": 1000,
+     "tau": 1.0,
+     "log_interval": 100,
+     "save_times": 10,
+     "evaluate": true,
+     "eval_episodes": 10,
+     "push_model": true,
+     "hf_entity": "alperenunlu"
+ }
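Because dqn_atari.py reads `HyperParams` through `tyro.cli`, each of these values should be overridable from the command line, presumably with tyro's default dashed flag names, e.g. `uv run dqn_atari.py --seed 1 --total-timesteps 1000000`; `uv run dqn_atari.py --help` prints the full list.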
pyproject.toml ADDED
@@ -0,0 +1,16 @@
+ [project]
+ name = "rlhell"
+ version = "0.1.0"
+ description = "RL Implementations Without Dependency Hell Cutting Edge Versions and Vectorized Training"
+ readme = "README.md"
+ requires-python = ">=3.12"
+ dependencies = [
+     "gymnasium[all]>=1.2.0",
+     "huggingface-hub>=0.34.4",
+     "safetensors>=0.6.2",
+     "swig>=4.3.1",
+     "tensorboard>=2.20.0",
+     "torch>=2.8.0",
+     "tqdm>=4.67.1",
+     "tyro>=0.9.28",
+ ]
replay.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0aa46fb3860c1865495e256b64def56e973ca1029ae0aa5b29d0eb1b3cc9fcf4
+ size 1390433
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-0.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0aa46fb3860c1865495e256b64def56e973ca1029ae0aa5b29d0eb1b3cc9fcf4
+ size 1390433
videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-1.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:81058e6ce5baa8ba1699e95f7f88c12356292f47f8744e5f248ccd425712ab56
+ size 622516
videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-2.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1dd804e02102171a0ebf380b45108fee1fc2c631d2e97cf7e58c7a0ab6420059
+ size 670602
videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-3.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b3332e807ee00821010ebfca57343a8bba4f04f85449d1f0bd672212b130f5d
+ size 1259808
videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-4.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b5accd7fb013d92d5ec6ffb0d200693cc316f21f54a1ca7bae593087303f6baa
+ size 1182288
videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-5.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f7822ac3dc85aff825689f7022cd6c64ebe64101a4a7e7f5be702c3ebf68731a
+ size 728035
videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-6.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a2c826830c9e0bea8736b213fb57792676fe16191580d50727f65e40ec1871f
+ size 1118185
videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-7.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0660e438dfacf48485940b7388833d7261cfe9b261bf33333aa1db39f8bfae08
+ size 805407
videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-8.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1c041e5c76f0bc0c8ed6fd433557af20613544e4455e8ae9e5c1ef9a00675b6
+ size 593165
videos/SpaceInvadersNoFrameskip-v4_dqn_atari_0_250822_153004_eval/SpaceInvadersNoFrameskip-v4-episode-9.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:99016ba1754bbf677baf13297f08aa533f18072271769d5aa8112da36e7d78e1
+ size 903444