wyldecat committed
Commit 99e7c0c · 1 Parent(s): 3261444

fix(muon): add update_p stage and dealloc tensors properly
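In short, the commit splits the distributed Muon path into an explicit scatter stage followed by a new update_p stage, frees the gathered gradient and the computed update as soon as they are consumed, and factors the final parameter update into a shared Muon._update_p helper used by both the single-device and the parallel code path. A rough standalone sketch of what that shared helper does, assuming adjusted_lr has already been scaled via adjust_lr_for_muon (the real version in the diff is a @staticmethod on Muon):

import torch

def update_p(p: torch.Tensor, u: torch.Tensor, lr: float,
             adjusted_lr: float, weight_decay: float) -> None:
    # Decoupled weight decay, scaled by the base learning rate.
    p.data.mul_(1 - lr * weight_decay)
    # Apply the orthogonalized update with the shape-adjusted learning rate.
    p.data.add_(u, alpha=-adjusted_lr)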

Files changed (47)
  1. .gitignore +0 -1
  2. build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
  3. build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
  4. build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  5. build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_2dc97a1_dirty.abi3.so → _optimizer_0c12ced_dirty.abi3.so} +1 -1
  6. build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +100 -51
  7. build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
  8. build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
  9. build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  10. build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_2dc97a1_dirty.abi3.so → _optimizer_0c12ced_dirty.abi3.so} +1 -1
  11. build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +100 -51
  12. build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
  13. build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
  14. build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  15. build/{torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_2dc97a1_dirty.abi3.so → torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_0c12ced_dirty.abi3.so} +1 -1
  16. build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py +100 -51
  17. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
  18. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
  19. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
  20. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_2dc97a1_dirty.abi3.so → _optimizer_0c12ced_dirty.abi3.so} +1 -1
  21. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py +100 -51
  22. build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
  23. build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
  24. build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  25. build/torch28-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_2dc97a1_dirty.abi3.so → _optimizer_0c12ced_dirty.abi3.so} +1 -1
  26. build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py +100 -51
  27. build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
  28. build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
  29. build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  30. build/{torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_2dc97a1_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_0c12ced_dirty.abi3.so} +1 -1
  31. build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py +100 -51
  32. build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
  33. build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
  34. build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py +3 -3
  35. build/torch28-cxx11-cu129-x86_64-linux/optimizer/{_optimizer_2dc97a1_dirty.abi3.so → _optimizer_0c12ced_dirty.abi3.so} +1 -1
  36. build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py +100 -51
  37. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
  38. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
  39. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
  40. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_2dc97a1_dirty.abi3.so → _optimizer_0c12ced_dirty.abi3.so} +1 -1
  41. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py +100 -51
  42. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc +0 -0
  43. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc +0 -0
  44. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py +3 -3
  45. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/{_optimizer_2dc97a1_dirty.abi3.so → _optimizer_0c12ced_dirty.abi3.so} +1 -1
  46. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py +100 -51
  47. torch-ext/optimizer/muon.py +65 -28
.gitignore CHANGED
@@ -2,7 +2,6 @@ __pycache__
 .idea
 .DS_Store
 *.egg-info
-build
 outputs
 dist/*
 .vscode
build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (307 Bytes)
 
build/torch27-cxx11-cu118-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc DELETED
Binary file (23.4 kB)
 
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_2dc97a1_dirty
-ops = torch.ops._optimizer_2dc97a1_dirty
+from . import _optimizer_0c12ced_dirty
+ops = torch.ops._optimizer_0c12ced_dirty

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_2dc97a1_dirty::{op_name}"
+    return f"_optimizer_0c12ced_dirty::{op_name}"
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_2dc97a1_dirty.abi3.so → _optimizer_0c12ced_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9112c8dde01baefa0e3130e143288cd3073ccbab47369a6dc925ce0d35400c6d
+oid sha256:1c2963bea474b130d3b22e507692b42c1926d0b93c20495789602da2caff5ef3
 size 1787368
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED
@@ -47,19 +47,27 @@ class _muon_state:
     # TODO: use Optional
     worker_rank: int | None = None
     gathered_grad: torch.Tensor | None = None
+    scattered_u: DTensor | None = None
     computed_u: torch.Tensor | None = None
     gather_event: torch.cuda.Event | None = None
     compute_event: torch.cuda.Event | None = None
+    scatter_event: torch.cuda.Event | None = None
     process_group = None


 @torch.no_grad()
 def _gather(p, state, rank, comm_stream, none_grad):
+    """
+    Gather the gradients to worker_rank.
+    If none_grad is True, free p.grad after the gather.
+    """
     g = p.grad

     if rank == state.worker_rank:
         num_ranks = dist.get_world_size(group=state.process_group)
-        gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
+        gather_list = [
+            torch.empty_like(g.to_local()) for _ in range(num_ranks)
+        ]
     else:
         gather_list = None

@@ -73,8 +81,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
         if rank == state.worker_rank:
             if state.gathered_grad is not None:
                 raise RuntimeError(
-                    "Gather event already exists, which should not happen."
-                )
+                    "Gather event already exists, which should not happen.")
             state.gathered_grad = torch.cat(gather_list, dim=0)
             state.gather_event = torch.cuda.Event()
             state.gather_event.record()
@@ -82,11 +89,21 @@ def _gather(p, state, rank, comm_stream, none_grad):
             state.gathered_grad = None
             state.gather_event = None
         if none_grad:
+            # We can safely free p.grad without calling record_stream:
+            #   p.grad.to_local().record_stream(comm_stream)
+            # Explanation:
+            #   1. p.grad is created on the default stream, but the default stream
+            #      is synchronized with the comm stream later.
+            #   2. There is no further activity on the default stream before the optimizer finishes.
+            # Therefore, it is safe to free p.grad directly on the comm stream.
             p.grad = None


 @torch.no_grad()
 def _compute_u(state, steps, rank, compute_stream):
+    """
+    On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
+    """
     with torch.cuda.stream(compute_stream):
         if rank == state.worker_rank:
             if state.gather_event is None:
@@ -96,16 +113,16 @@ def _compute_u(state, steps, rank, compute_stream):
             state.computed_u = u
             state.compute_event = torch.cuda.Event()
             state.compute_event.record()
-            # Clear the gathered gradient to free memory
-            state.gathered_grad = None
         else:
             state.computed_u = None
             state.compute_event = None


 @torch.no_grad()
-def _scatter(p, state, lr, weight_decay, rank, comm_stream):
-    u = state.computed_u
+def _scatter(p, state, rank, comm_stream):
+    """
+    Scatter the computed_u from worker_rank to all ranks.
+    """

     with torch.cuda.stream(comm_stream):
         if rank == state.worker_rank:
@@ -113,27 +130,49 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
             if state.compute_event is None:
                 raise RuntimeError("Compute event must be set before scatter.")
             comm_stream.wait_event(state.compute_event)
+
+            # Clear the gathered gradient to free memory
+            state.gathered_grad = None
+
+            u = state.computed_u
             scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
+            scatter_list = [s.contiguous() for s in scatter_list]
         else:
             scatter_list = None

-        u = torch.empty_like(p.to_local())
+        u_received = torch.empty_like(p.to_local())
         torch.distributed.scatter(
-            u,
+            u_received,
             scatter_list=scatter_list,
             src=state.worker_rank,
             group=state.process_group,
         )
-        if rank == state.worker_rank:
-            # Clear u to free memory
-            state.computed_u = None
-        u = DTensor.from_local(
-            u,
+        u_dtensor = DTensor.from_local(
+            u_received,
             placements=p.placements,
             device_mesh=p.device_mesh,
         )
-        p.data.mul_(1 - lr * weight_decay)
-        p.data.add_(u, alpha=-lr)
+
+        state.scattered_u = u_dtensor
+        state.scatter_event = torch.cuda.Event()
+        state.scatter_event.record()
+
+
+def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
+                  compute_stream):
+    """
+    Update sharded parameter p with the scattered_u.
+    Only worker_rank frees computed_u.
+    """
+    with torch.cuda.stream(compute_stream):
+        if state.scatter_event is None:
+            raise RuntimeError("Scatter event must be set before update")
+        compute_stream.wait_event(state.scatter_event)
+        if rank == state.worker_rank:
+            # Free computed_u
+            state.computed_u = None
+
+        Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)


 def default_is_muon(x, name):
@@ -154,17 +193,19 @@ class Muon(torch.optim.Optimizer):
     - We believe it may not work well for finetuning pretrained models, but we haven't tested this.

     Arguments:
-        muon_params: The parameters to be optimized by Muon.
+        model: The model to be optimized by Muon.
+        is_muon_func: A function that takes a parameter and its name, and returns whether the parameter should be optimized by Muon.
         lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
         momentum: The momentum used by the internal SGD. (0.95 is a good default)
         nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
         ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
-        adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are
+        weight_decay: The weight decay for Muon and AdamW.
         {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
         adamw_lr: The learning rate for the internal AdamW.
         adamw_betas: The betas for the internal AdamW.
         adamw_eps: The epsilon for the internal AdamW.
-        adamw_weight_decay: The weight decay for the internal AdamW.
+        none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
+        debug: Whether to print debug information.
     """

     def __init__(
@@ -240,9 +281,10 @@ class Muon(torch.optim.Optimizer):
         """
         Get the shard mesh for a parameter p on the given rank.
         """
-        assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
+        assert isinstance(
+            p, DTensor), "Parallel Muon only supports DTensor parameters."

-        if p.placements == (Shard(dim=0),):
+        if p.placements == (Shard(dim=0), ):
             # Case for FSDP
             return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
         elif p.placements == (Replicate(), Shard(dim=0)):
@@ -269,11 +311,12 @@ class Muon(torch.optim.Optimizer):
             total_flops += flops

         if self.debug:
-            print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
+            print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
+                  flush=True)

-        ordered_params = sorted(
-            params, key=lambda p: param_to_flops[id(p)], reverse=True
-        )
+        ordered_params = sorted(params,
+                                key=lambda p: param_to_flops[id(p)],
+                                reverse=True)

         round_robin = 0
         mesh = None
@@ -317,14 +360,8 @@ class Muon(torch.optim.Optimizer):

         u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])

-        # scale update
         adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
-
-        # apply weight decay
-        p.data.mul_(1 - lr * weight_decay)
-
-        # apply update
-        p.data.add_(u, alpha=-adjusted_lr)
+        Muon._update_p(p, u, lr, adjusted_lr, weight_decay)

     def _update_g(self, p, g, group, momentum):
         # calc update
@@ -339,9 +376,8 @@ class Muon(torch.optim.Optimizer):
             g = buf
         return g

-    def _update_p(self, p, u, lr, weight_decay):
-        # scale update
-        adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+    @staticmethod
+    def _update_p(p, u, lr, adjusted_lr, weight_decay):
         # apply weight decay
         p.data.mul_(1 - lr * weight_decay)
         # apply update
@@ -369,28 +405,34 @@ class Muon(torch.optim.Optimizer):
             p.grad = g

         param_to_state, ordered_params = self.init_state_and_assign_params(
-            params, group
-        )
+            params, group)

         def enqueue_gathers(start_idx, chunk_size):
-            for p in ordered_params[start_idx : start_idx + chunk_size]:
+            for p in ordered_params[start_idx:start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _gather(p, state, self.rank, self.comm_stream, group["none_grad"])
+                _gather(p, state, self.rank, self.comm_stream,
+                        group["none_grad"])

         def enqueue_computes(start_idx, chunk_size):
-            for p in ordered_params[start_idx : start_idx + chunk_size]:
+            for p in ordered_params[start_idx:start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
+                _compute_u(state, group["ns_steps"], self.rank,
+                           self.compute_stream)

         def enqueue_scatters(start_idx, chunk_size):
-            for p in ordered_params[start_idx : start_idx + chunk_size]:
+            for p in ordered_params[start_idx:start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _scatter(p, state, self.rank, self.comm_stream)
+
+        def enqueue_update_param(start_idx, chunk_size):
+            for p in ordered_params[start_idx:start_idx + chunk_size]:
                 state = param_to_state[id(p)]
                 adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
-                _scatter(
-                    p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
-                )
+                _update_param(p, state, lr, adjusted_lr, weight_decay,
+                              self.rank, self.compute_stream)

-        chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
+        chunk_size = dist.get_world_size(param_to_state[id(
+            params[0])].process_group)

         # Wait grad update
         self.comm_stream.wait_stream(torch.cuda.current_stream())
@@ -398,10 +440,14 @@ class Muon(torch.optim.Optimizer):
         enqueue_gathers(0, chunk_size)
         for i in range(0, len(params) + chunk_size - 1, chunk_size):
             enqueue_computes(i, chunk_size)
+            if i > 0:
+                enqueue_update_param(i - chunk_size, chunk_size)
             enqueue_gathers(i + chunk_size, chunk_size)
             enqueue_scatters(i, chunk_size)
+        enqueue_update_param(i, chunk_size)

-        torch.cuda.current_stream().wait_stream(self.comm_stream)
+        # Wait the last update_param to finish
+        torch.cuda.current_stream().wait_stream(self.compute_stream)

     def step(self, closure=None):
         """Perform a single optimization step.
@@ -436,15 +482,16 @@ class Muon(torch.optim.Optimizer):
                     continue
                 if isinstance(p.data, DTensor):
                     if all(
-                        isinstance(placement, Replicate) for placement in p.placements
-                    ):
+                            isinstance(placement, Replicate)
+                            for placement in p.placements):
                         param_tensors.append(p)
                     else:
                         param_dtensors.append(p)
                 elif isinstance(p.data, torch.Tensor):
                     param_tensors.append(p)
                 else:
-                    raise TypeError(f"Unsupported parameter type: {type(p.data)}")
+                    raise TypeError(
+                        f"Unsupported parameter type: {type(p.data)}")

                 if self.debug:
                     print(
@@ -479,7 +526,9 @@ class Muon(torch.optim.Optimizer):
             # AdamW backup #
             ############################

-            params = [p for p in group["params"] if not self.state[p]["use_muon"]]
+            params = [
+                p for p in group["params"] if not self.state[p]["use_muon"]
+            ]
             lr = group["lr"]
             beta1, beta2 = group["adamw_betas"]
             eps = group["adamw_eps"]
build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (307 Bytes)
 
build/torch27-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc DELETED
Binary file (23.4 kB)
 
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_2dc97a1_dirty
-ops = torch.ops._optimizer_2dc97a1_dirty
+from . import _optimizer_0c12ced_dirty
+ops = torch.ops._optimizer_0c12ced_dirty

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_2dc97a1_dirty::{op_name}"
+    return f"_optimizer_0c12ced_dirty::{op_name}"
build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_2dc97a1_dirty.abi3.so → _optimizer_0c12ced_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0449cd352f44c3e848d1f9c847b00bf576673b4fef2a954ec8bd8d2524b8353a
+oid sha256:a55c3d0aba4548dc74a08d66987307bd381c2d93b149702fbdc60da19e03e5fc
 size 1824256
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
Identical changes to build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py above (+100 -51).
build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (307 Bytes)
 
build/torch27-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc DELETED
Binary file (23.4 kB)
 
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_2dc97a1_dirty
-ops = torch.ops._optimizer_2dc97a1_dirty
+from . import _optimizer_0c12ced_dirty
+ops = torch.ops._optimizer_0c12ced_dirty

 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_2dc97a1_dirty::{op_name}"
+    return f"_optimizer_0c12ced_dirty::{op_name}"
build/{torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_2dc97a1_dirty.abi3.so → torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_0c12ced_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:86d98863cc7ef0b271808b0ef7b1082603cfb5a76986481df37431527aaaf27b
+oid sha256:c319d0fb497363746229fbabed6d14b82090a660de602125fb67135117c53f5a
 size 1883352
build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED
@@ -47,19 +47,27 @@ class _muon_state:
47
  # TODO: use Optional
48
  worker_rank: int | None = None
49
  gathered_grad: torch.Tensor | None = None
 
50
  computed_u: torch.Tensor | None = None
51
  gather_event: torch.cuda.Event | None = None
52
  compute_event: torch.cuda.Event | None = None
 
53
  process_group = None
54
 
55
 
56
  @torch.no_grad()
57
  def _gather(p, state, rank, comm_stream, none_grad):
 
 
 
 
58
  g = p.grad
59
 
60
  if rank == state.worker_rank:
61
  num_ranks = dist.get_world_size(group=state.process_group)
62
- gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
 
 
63
  else:
64
  gather_list = None
65
 
@@ -73,8 +81,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
73
  if rank == state.worker_rank:
74
  if state.gathered_grad is not None:
75
  raise RuntimeError(
76
- "Gather event already exists, which should not happen."
77
- )
78
  state.gathered_grad = torch.cat(gather_list, dim=0)
79
  state.gather_event = torch.cuda.Event()
80
  state.gather_event.record()
@@ -82,11 +89,21 @@ def _gather(p, state, rank, comm_stream, none_grad):
82
  state.gathered_grad = None
83
  state.gather_event = None
84
  if none_grad:
 
 
 
 
 
 
 
85
  p.grad = None
86
 
87
 
88
  @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
 
 
 
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
92
  if state.gather_event is None:
@@ -96,16 +113,16 @@ def _compute_u(state, steps, rank, compute_stream):
96
  state.computed_u = u
97
  state.compute_event = torch.cuda.Event()
98
  state.compute_event.record()
99
- # Clear the gathered gradient to free memory
100
- state.gathered_grad = None
101
  else:
102
  state.computed_u = None
103
  state.compute_event = None
104
 
105
 
106
  @torch.no_grad()
107
- def _scatter(p, state, lr, weight_decay, rank, comm_stream):
108
- u = state.computed_u
 
 
109
 
110
  with torch.cuda.stream(comm_stream):
111
  if rank == state.worker_rank:
@@ -113,27 +130,49 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
113
  if state.compute_event is None:
114
  raise RuntimeError("Compute event must be set before scatter.")
115
  comm_stream.wait_event(state.compute_event)
 
 
 
 
 
116
  scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
 
117
  else:
118
  scatter_list = None
119
 
120
- u = torch.empty_like(p.to_local())
121
  torch.distributed.scatter(
122
- u,
123
  scatter_list=scatter_list,
124
  src=state.worker_rank,
125
  group=state.process_group,
126
  )
127
- if rank == state.worker_rank:
128
- # Clear u to free memory
129
- state.computed_u = None
130
- u = DTensor.from_local(
131
- u,
132
  placements=p.placements,
133
  device_mesh=p.device_mesh,
134
  )
135
- p.data.mul_(1 - lr * weight_decay)
136
- p.data.add_(u, alpha=-lr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
 
139
  def default_is_muon(x, name):
@@ -154,17 +193,19 @@ class Muon(torch.optim.Optimizer):
154
  - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
155
 
156
  Arguments:
157
- muon_params: The parameters to be optimized by Muon.
 
158
  lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
159
  momentum: The momentum used by the internal SGD. (0.95 is a good default)
160
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
161
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
162
- adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are
163
  {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
164
  adamw_lr: The learning rate for the internal AdamW.
165
  adamw_betas: The betas for the internal AdamW.
166
  adamw_eps: The epsilon for the internal AdamW.
167
- adamw_weight_decay: The weight decay for the internal AdamW.
 
168
  """
169
 
170
  def __init__(
@@ -240,9 +281,10 @@ class Muon(torch.optim.Optimizer):
240
  """
241
  Get the shard mesh for a parameter p on the given rank.
242
  """
243
- assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
 
244
 
245
- if p.placements == (Shard(dim=0),):
246
  # Case for FSDP
247
  return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
248
  elif p.placements == (Replicate(), Shard(dim=0)):
@@ -269,11 +311,12 @@ class Muon(torch.optim.Optimizer):
269
  total_flops += flops
270
 
271
  if self.debug:
272
- print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
 
273
 
274
- ordered_params = sorted(
275
- params, key=lambda p: param_to_flops[id(p)], reverse=True
276
- )
277
 
278
  round_robin = 0
279
  mesh = None
@@ -317,14 +360,8 @@ class Muon(torch.optim.Optimizer):
317
 
318
  u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
319
 
320
- # scale update
321
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
322
-
323
- # apply weight decay
324
- p.data.mul_(1 - lr * weight_decay)
325
-
326
- # apply update
327
- p.data.add_(u, alpha=-adjusted_lr)
328
 
329
  def _update_g(self, p, g, group, momentum):
330
  # calc update
@@ -339,9 +376,8 @@ class Muon(torch.optim.Optimizer):
339
  g = buf
340
  return g
341
 
342
- def _update_p(self, p, u, lr, weight_decay):
343
- # scale update
344
- adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
345
  # apply weight decay
346
  p.data.mul_(1 - lr * weight_decay)
347
  # apply update
@@ -369,28 +405,34 @@ class Muon(torch.optim.Optimizer):
369
  p.grad = g
370
 
371
  param_to_state, ordered_params = self.init_state_and_assign_params(
372
- params, group
373
- )
374
 
375
  def enqueue_gathers(start_idx, chunk_size):
376
- for p in ordered_params[start_idx : start_idx + chunk_size]:
377
  state = param_to_state[id(p)]
378
- _gather(p, state, self.rank, self.comm_stream, group["none_grad"])
 
379
 
380
  def enqueue_computes(start_idx, chunk_size):
381
- for p in ordered_params[start_idx : start_idx + chunk_size]:
382
  state = param_to_state[id(p)]
383
- _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
 
384
 
385
  def enqueue_scatters(start_idx, chunk_size):
386
- for p in ordered_params[start_idx : start_idx + chunk_size]:
 
 
 
 
 
387
  state = param_to_state[id(p)]
388
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
389
- _scatter(
390
- p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
391
- )
392
 
393
- chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
 
394
 
395
  # Wait grad update
396
  self.comm_stream.wait_stream(torch.cuda.current_stream())
@@ -398,10 +440,14 @@ class Muon(torch.optim.Optimizer):
398
  enqueue_gathers(0, chunk_size)
399
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
400
  enqueue_computes(i, chunk_size)
 
 
401
  enqueue_gathers(i + chunk_size, chunk_size)
402
  enqueue_scatters(i, chunk_size)
 
403
 
404
- torch.cuda.current_stream().wait_stream(self.comm_stream)
 
405
 
406
  def step(self, closure=None):
407
  """Perform a single optimization step.
@@ -436,15 +482,16 @@ class Muon(torch.optim.Optimizer):
436
  continue
437
  if isinstance(p.data, DTensor):
438
  if all(
439
- isinstance(placement, Replicate) for placement in p.placements
440
- ):
441
  param_tensors.append(p)
442
  else:
443
  param_dtensors.append(p)
444
  elif isinstance(p.data, torch.Tensor):
445
  param_tensors.append(p)
446
  else:
447
- raise TypeError(f"Unsupported parameter type: {type(p.data)}")
 
448
 
449
  if self.debug:
450
  print(
@@ -479,7 +526,9 @@ class Muon(torch.optim.Optimizer):
479
  # AdamW backup #
480
  ############################
481
 
482
- params = [p for p in group["params"] if not self.state[p]["use_muon"]]
 
 
483
  lr = group["lr"]
484
  beta1, beta2 = group["adamw_betas"]
485
  eps = group["adamw_eps"]
 
47
  # TODO: use Optional
48
  worker_rank: int | None = None
49
  gathered_grad: torch.Tensor | None = None
50
+ scattered_u: DTensor | None = None
51
  computed_u: torch.Tensor | None = None
52
  gather_event: torch.cuda.Event | None = None
53
  compute_event: torch.cuda.Event | None = None
54
+ scatter_event: torch.cuda.Event | None = None
55
  process_group = None
56
 
57
 
58
  @torch.no_grad()
59
  def _gather(p, state, rank, comm_stream, none_grad):
60
+ """
61
+ Gather the gradients to worker_rank.
62
+ If none_grad is True, free p.grad after the gather.
63
+ """
64
  g = p.grad
65
 
66
  if rank == state.worker_rank:
67
  num_ranks = dist.get_world_size(group=state.process_group)
68
+ gather_list = [
69
+ torch.empty_like(g.to_local()) for _ in range(num_ranks)
70
+ ]
71
  else:
72
  gather_list = None
73
 
 
81
  if rank == state.worker_rank:
82
  if state.gathered_grad is not None:
83
  raise RuntimeError(
84
+ "Gather event already exists, which should not happen.")
 
85
  state.gathered_grad = torch.cat(gather_list, dim=0)
86
  state.gather_event = torch.cuda.Event()
87
  state.gather_event.record()
 
89
  state.gathered_grad = None
90
  state.gather_event = None
91
  if none_grad:
92
+ # We can safely free p.grad without calling record_stream:
93
+ # p.grad.to_local().record_stream(comm_stream)
94
+ # Explanation:
95
+ # 1. p.grad is created on the default stream, but the default stream
96
+ # is synchronized with the comm stream later.
97
+ # 2. There is no further activity on the default stream before the optimizer finishes.
98
+ # Therefore, it is safe to free p.grad directly on the comm stream.
99
  p.grad = None
100
 
101
 
102
  @torch.no_grad()
103
  def _compute_u(state, steps, rank, compute_stream):
104
+ """
105
+ On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
106
+ """
107
  with torch.cuda.stream(compute_stream):
108
  if rank == state.worker_rank:
109
  if state.gather_event is None:
 
113
  state.computed_u = u
114
  state.compute_event = torch.cuda.Event()
115
  state.compute_event.record()
 
 
116
  else:
117
  state.computed_u = None
118
  state.compute_event = None
119
 
120
 
121
  @torch.no_grad()
122
+ def _scatter(p, state, rank, comm_stream):
123
+ """
124
+ Scatter the computed_u from worker_rank to all ranks.
125
+ """
126
 
127
  with torch.cuda.stream(comm_stream):
128
  if rank == state.worker_rank:
 
130
  if state.compute_event is None:
131
  raise RuntimeError("Compute event must be set before scatter.")
132
  comm_stream.wait_event(state.compute_event)
133
+
134
+ # Clear the gathered gradient to free memory
135
+ state.gathered_grad = None
136
+
137
+ u = state.computed_u
138
  scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
139
+ scatter_list = [s.contiguous() for s in scatter_list]
140
  else:
141
  scatter_list = None
142
 
143
+ u_received = torch.empty_like(p.to_local())
144
  torch.distributed.scatter(
145
+ u_received,
146
  scatter_list=scatter_list,
147
  src=state.worker_rank,
148
  group=state.process_group,
149
  )
150
+ u_dtensor = DTensor.from_local(
151
+ u_received,
 
 
 
152
  placements=p.placements,
153
  device_mesh=p.device_mesh,
154
  )
155
+
156
+ state.scattered_u = u_dtensor
157
+ state.scatter_event = torch.cuda.Event()
158
+ state.scatter_event.record()
159
+
160
+
161
+ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
162
+ compute_stream):
163
+ """
164
+ Update sharded parameter p with the scattered_u.
165
+ Only worker_rank frees computed_u.
166
+ """
167
+ with torch.cuda.stream(compute_stream):
168
+ if state.scatter_event is None:
169
+ raise RuntimeError("Scatter event must be set before update")
170
+ compute_stream.wait_event(state.scatter_event)
171
+ if rank == state.worker_rank:
172
+ # Free computed_u
173
+ state.computed_u = None
174
+
175
+ Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
176
 
177
 
178
  def default_is_muon(x, name):
 
193
  - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
194
 
195
  Arguments:
196
+ model: The model to be optimized by Muon.
197
+ is_muon_func: A function that takes a parameter and its name, and returns whether the parameter should be optimized by Muon.
198
  lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
199
  momentum: The momentum used by the internal SGD. (0.95 is a good default)
200
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
201
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
202
+ weight_decay: The weight decay for Muon and AdamW.
203
  {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
204
  adamw_lr: The learning rate for the internal AdamW.
205
  adamw_betas: The betas for the internal AdamW.
206
  adamw_eps: The epsilon for the internal AdamW.
207
+ none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
208
+ debug: Whether to print debug information.
209
  """
210
 
211
  def __init__(
 
281
  """
282
  Get the shard mesh for a parameter p on the given rank.
283
  """
284
+ assert isinstance(
285
+ p, DTensor), "Parallel Muon only supports DTensor parameters."
286
 
287
+ if p.placements == (Shard(dim=0), ):
288
  # Case for FSDP
289
  return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
290
  elif p.placements == (Replicate(), Shard(dim=0)):
 
311
  total_flops += flops
312
 
313
  if self.debug:
314
+ print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
315
+ flush=True)
316
 
317
+ ordered_params = sorted(params,
318
+ key=lambda p: param_to_flops[id(p)],
319
+ reverse=True)
320
 
321
  round_robin = 0
322
  mesh = None
 
360
 
361
  u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
362
 
 
363
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
364
+ Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
 
 
 
 
 
365
 
366
  def _update_g(self, p, g, group, momentum):
367
  # calc update
 
376
  g = buf
377
  return g
378
 
379
+ @staticmethod
380
+ def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
381
  # apply weight decay
382
  p.data.mul_(1 - lr * weight_decay)
383
  # apply update
 
405
  p.grad = g
406
 
407
  param_to_state, ordered_params = self.init_state_and_assign_params(
408
+ params, group)
 
409
 
410
  def enqueue_gathers(start_idx, chunk_size):
411
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
412
  state = param_to_state[id(p)]
413
+ _gather(p, state, self.rank, self.comm_stream,
414
+ group["none_grad"])
415
 
416
  def enqueue_computes(start_idx, chunk_size):
417
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
418
  state = param_to_state[id(p)]
419
+ _compute_u(state, group["ns_steps"], self.rank,
420
+ self.compute_stream)
421
 
422
  def enqueue_scatters(start_idx, chunk_size):
423
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
424
+ state = param_to_state[id(p)]
425
+ _scatter(p, state, self.rank, self.comm_stream)
426
+
427
+ def enqueue_update_param(start_idx, chunk_size):
428
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
429
  state = param_to_state[id(p)]
430
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
431
+ _update_param(p, state, lr, adjusted_lr, weight_decay,
432
+ self.rank, self.compute_stream)
 
433
 
434
+ chunk_size = dist.get_world_size(param_to_state[id(
435
+ params[0])].process_group)
436
 
437
  # Wait grad update
438
  self.comm_stream.wait_stream(torch.cuda.current_stream())
 
440
  enqueue_gathers(0, chunk_size)
441
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
442
  enqueue_computes(i, chunk_size)
443
+ if i > 0:
444
+ enqueue_update_param(i - chunk_size, chunk_size)
445
  enqueue_gathers(i + chunk_size, chunk_size)
446
  enqueue_scatters(i, chunk_size)
447
+ enqueue_update_param(i, chunk_size)
448
 
449
+ # Wait for the last update_param to finish
450
+ torch.cuda.current_stream().wait_stream(self.compute_stream)
451
 
452
  def step(self, closure=None):
453
  """Perform a single optimization step.
 
482
  continue
483
  if isinstance(p.data, DTensor):
484
  if all(
485
+ isinstance(placement, Replicate)
486
+ for placement in p.placements):
487
  param_tensors.append(p)
488
  else:
489
  param_dtensors.append(p)
490
  elif isinstance(p.data, torch.Tensor):
491
  param_tensors.append(p)
492
  else:
493
+ raise TypeError(
494
+ f"Unsupported parameter type: {type(p.data)}")
495
 
496
  if self.debug:
497
  print(
 
526
  # AdamW backup #
527
  ############################
528
 
529
+ params = [
530
+ p for p in group["params"] if not self.state[p]["use_muon"]
531
+ ]
532
  lr = group["lr"]
533
  beta1, beta2 = group["adamw_betas"]
534
  eps = group["adamw_eps"]
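
A minimal usage sketch of the constructor documented in the diff above may help orient readers. Everything in it is an assumption rather than a tested API: the import path (`optimizer`), the keyword names, the single-process fallback behaviour, and the `is_muon_param` helper are inferred from the docstring and the package layout in this commit, not verified against the built wheel.

import torch
import torch.nn as nn

from optimizer import Muon  # hypothetical import path based on the build layout

def is_muon_param(param, name):
    # Assumption: route 2-D weights to Muon, embeddings/heads and 1-D params to AdamW.
    return param.ndim == 2 and "embed" not in name and "lm_head" not in name

model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 10))
opt = Muon(
    model,
    is_muon_func=is_muon_param,
    lr=0.02,            # spectral-norm learning rate suggested in the docstring
    momentum=0.95,
    weight_decay=0.01,
    none_grad=True,     # free p.grad after the gather to save memory
)

x, y = torch.randn(8, 64), torch.randint(0, 10, (8,))
loss = nn.functional.cross_entropy(model(x), y)
loss.backward()
opt.step()
opt.zero_grad()
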
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (308 Bytes)
 
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc DELETED
Binary file (23.4 kB)
 
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_2dc97a1_dirty
3
- ops = torch.ops._optimizer_2dc97a1_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_2dc97a1_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_0c12ced_dirty
3
+ ops = torch.ops._optimizer_0c12ced_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_0c12ced_dirty::{op_name}"
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_2dc97a1_dirty.abi3.so → _optimizer_0c12ced_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bdcf9e3d8bf13aa01bf1ae7a94a12dd05c50702a24b57e4cfcc2e54ca5ed21c3
3
  size 1749840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8bda6399291a15b5bcba88214ffd3d0291b10d1cdfb0ab668436d176a9396ec
3
  size 1749840
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED
@@ -47,19 +47,27 @@ class _muon_state:
47
  # TODO: use Optional
48
  worker_rank: int | None = None
49
  gathered_grad: torch.Tensor | None = None
 
50
  computed_u: torch.Tensor | None = None
51
  gather_event: torch.cuda.Event | None = None
52
  compute_event: torch.cuda.Event | None = None
 
53
  process_group = None
54
 
55
 
56
  @torch.no_grad()
57
  def _gather(p, state, rank, comm_stream, none_grad):
 
 
 
 
58
  g = p.grad
59
 
60
  if rank == state.worker_rank:
61
  num_ranks = dist.get_world_size(group=state.process_group)
62
- gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
 
 
63
  else:
64
  gather_list = None
65
 
@@ -73,8 +81,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
73
  if rank == state.worker_rank:
74
  if state.gathered_grad is not None:
75
  raise RuntimeError(
76
- "Gather event already exists, which should not happen."
77
- )
78
  state.gathered_grad = torch.cat(gather_list, dim=0)
79
  state.gather_event = torch.cuda.Event()
80
  state.gather_event.record()
@@ -82,11 +89,21 @@ def _gather(p, state, rank, comm_stream, none_grad):
82
  state.gathered_grad = None
83
  state.gather_event = None
84
  if none_grad:
 
 
 
 
 
 
 
85
  p.grad = None
86
 
87
 
88
  @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
 
 
 
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
92
  if state.gather_event is None:
@@ -96,16 +113,16 @@ def _compute_u(state, steps, rank, compute_stream):
96
  state.computed_u = u
97
  state.compute_event = torch.cuda.Event()
98
  state.compute_event.record()
99
- # Clear the gathered gradient to free memory
100
- state.gathered_grad = None
101
  else:
102
  state.computed_u = None
103
  state.compute_event = None
104
 
105
 
106
  @torch.no_grad()
107
- def _scatter(p, state, lr, weight_decay, rank, comm_stream):
108
- u = state.computed_u
 
 
109
 
110
  with torch.cuda.stream(comm_stream):
111
  if rank == state.worker_rank:
@@ -113,27 +130,49 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
113
  if state.compute_event is None:
114
  raise RuntimeError("Compute event must be set before scatter.")
115
  comm_stream.wait_event(state.compute_event)
 
 
 
 
 
116
  scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
 
117
  else:
118
  scatter_list = None
119
 
120
- u = torch.empty_like(p.to_local())
121
  torch.distributed.scatter(
122
- u,
123
  scatter_list=scatter_list,
124
  src=state.worker_rank,
125
  group=state.process_group,
126
  )
127
- if rank == state.worker_rank:
128
- # Clear u to free memory
129
- state.computed_u = None
130
- u = DTensor.from_local(
131
- u,
132
  placements=p.placements,
133
  device_mesh=p.device_mesh,
134
  )
135
- p.data.mul_(1 - lr * weight_decay)
136
- p.data.add_(u, alpha=-lr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
 
139
  def default_is_muon(x, name):
@@ -154,17 +193,19 @@ class Muon(torch.optim.Optimizer):
154
  - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
155
 
156
  Arguments:
157
- muon_params: The parameters to be optimized by Muon.
 
158
  lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
159
  momentum: The momentum used by the internal SGD. (0.95 is a good default)
160
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
161
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
162
- adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are
163
  {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
164
  adamw_lr: The learning rate for the internal AdamW.
165
  adamw_betas: The betas for the internal AdamW.
166
  adamw_eps: The epsilon for the internal AdamW.
167
- adamw_weight_decay: The weight decay for the internal AdamW.
 
168
  """
169
 
170
  def __init__(
@@ -240,9 +281,10 @@ class Muon(torch.optim.Optimizer):
240
  """
241
  Get the shard mesh for a parameter p on the given rank.
242
  """
243
- assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
 
244
 
245
- if p.placements == (Shard(dim=0),):
246
  # Case for FSDP
247
  return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
248
  elif p.placements == (Replicate(), Shard(dim=0)):
@@ -269,11 +311,12 @@ class Muon(torch.optim.Optimizer):
269
  total_flops += flops
270
 
271
  if self.debug:
272
- print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
 
273
 
274
- ordered_params = sorted(
275
- params, key=lambda p: param_to_flops[id(p)], reverse=True
276
- )
277
 
278
  round_robin = 0
279
  mesh = None
@@ -317,14 +360,8 @@ class Muon(torch.optim.Optimizer):
317
 
318
  u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
319
 
320
- # scale update
321
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
322
-
323
- # apply weight decay
324
- p.data.mul_(1 - lr * weight_decay)
325
-
326
- # apply update
327
- p.data.add_(u, alpha=-adjusted_lr)
328
 
329
  def _update_g(self, p, g, group, momentum):
330
  # calc update
@@ -339,9 +376,8 @@ class Muon(torch.optim.Optimizer):
339
  g = buf
340
  return g
341
 
342
- def _update_p(self, p, u, lr, weight_decay):
343
- # scale update
344
- adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
345
  # apply weight decay
346
  p.data.mul_(1 - lr * weight_decay)
347
  # apply update
@@ -369,28 +405,34 @@ class Muon(torch.optim.Optimizer):
369
  p.grad = g
370
 
371
  param_to_state, ordered_params = self.init_state_and_assign_params(
372
- params, group
373
- )
374
 
375
  def enqueue_gathers(start_idx, chunk_size):
376
- for p in ordered_params[start_idx : start_idx + chunk_size]:
377
  state = param_to_state[id(p)]
378
- _gather(p, state, self.rank, self.comm_stream, group["none_grad"])
 
379
 
380
  def enqueue_computes(start_idx, chunk_size):
381
- for p in ordered_params[start_idx : start_idx + chunk_size]:
382
  state = param_to_state[id(p)]
383
- _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
 
384
 
385
  def enqueue_scatters(start_idx, chunk_size):
386
- for p in ordered_params[start_idx : start_idx + chunk_size]:
 
 
 
 
 
387
  state = param_to_state[id(p)]
388
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
389
- _scatter(
390
- p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
391
- )
392
 
393
- chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
 
394
 
395
  # Wait grad update
396
  self.comm_stream.wait_stream(torch.cuda.current_stream())
@@ -398,10 +440,14 @@ class Muon(torch.optim.Optimizer):
398
  enqueue_gathers(0, chunk_size)
399
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
400
  enqueue_computes(i, chunk_size)
 
 
401
  enqueue_gathers(i + chunk_size, chunk_size)
402
  enqueue_scatters(i, chunk_size)
 
403
 
404
- torch.cuda.current_stream().wait_stream(self.comm_stream)
 
405
 
406
  def step(self, closure=None):
407
  """Perform a single optimization step.
@@ -436,15 +482,16 @@ class Muon(torch.optim.Optimizer):
436
  continue
437
  if isinstance(p.data, DTensor):
438
  if all(
439
- isinstance(placement, Replicate) for placement in p.placements
440
- ):
441
  param_tensors.append(p)
442
  else:
443
  param_dtensors.append(p)
444
  elif isinstance(p.data, torch.Tensor):
445
  param_tensors.append(p)
446
  else:
447
- raise TypeError(f"Unsupported parameter type: {type(p.data)}")
 
448
 
449
  if self.debug:
450
  print(
@@ -479,7 +526,9 @@ class Muon(torch.optim.Optimizer):
479
  # AdamW backup #
480
  ############################
481
 
482
- params = [p for p in group["params"] if not self.state[p]["use_muon"]]
 
 
483
  lr = group["lr"]
484
  beta1, beta2 = group["adamw_betas"]
485
  eps = group["adamw_eps"]
 
47
  # TODO: use Optional
48
  worker_rank: int | None = None
49
  gathered_grad: torch.Tensor | None = None
50
+ scattered_u: DTensor | None = None
51
  computed_u: torch.Tensor | None = None
52
  gather_event: torch.cuda.Event | None = None
53
  compute_event: torch.cuda.Event | None = None
54
+ scatter_event: torch.cuda.Event | None = None
55
  process_group = None
56
 
57
 
58
  @torch.no_grad()
59
  def _gather(p, state, rank, comm_stream, none_grad):
60
+ """
61
+ Gather the gradients to worker_rank.
62
+ If none_grad is True, free p.grad after the gather.
63
+ """
64
  g = p.grad
65
 
66
  if rank == state.worker_rank:
67
  num_ranks = dist.get_world_size(group=state.process_group)
68
+ gather_list = [
69
+ torch.empty_like(g.to_local()) for _ in range(num_ranks)
70
+ ]
71
  else:
72
  gather_list = None
73
 
 
81
  if rank == state.worker_rank:
82
  if state.gathered_grad is not None:
83
  raise RuntimeError(
84
+ "Gather event already exists, which should not happen.")
 
85
  state.gathered_grad = torch.cat(gather_list, dim=0)
86
  state.gather_event = torch.cuda.Event()
87
  state.gather_event.record()
 
89
  state.gathered_grad = None
90
  state.gather_event = None
91
  if none_grad:
92
+ # We can safely free p.grad without calling record_stream:
93
+ # p.grad.to_local().record_stream(comm_stream)
94
+ # Explanation:
95
+ # 1. p.grad is created on the default stream, but the default stream
96
+ # is synchronized with the comm stream later.
97
+ # 2. There is no further activity on the default stream before the optimizer finishes.
98
+ # Therefore, it is safe to free p.grad directly on the comm stream.
99
  p.grad = None
100
 
101
 
102
  @torch.no_grad()
103
  def _compute_u(state, steps, rank, compute_stream):
104
+ """
105
+ On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
106
+ """
107
  with torch.cuda.stream(compute_stream):
108
  if rank == state.worker_rank:
109
  if state.gather_event is None:
 
113
  state.computed_u = u
114
  state.compute_event = torch.cuda.Event()
115
  state.compute_event.record()
 
 
116
  else:
117
  state.computed_u = None
118
  state.compute_event = None
119
 
120
 
121
  @torch.no_grad()
122
+ def _scatter(p, state, rank, comm_stream):
123
+ """
124
+ Scatter the computed_u from worker_rank to all ranks.
125
+ """
126
 
127
  with torch.cuda.stream(comm_stream):
128
  if rank == state.worker_rank:
 
130
  if state.compute_event is None:
131
  raise RuntimeError("Compute event must be set before scatter.")
132
  comm_stream.wait_event(state.compute_event)
133
+
134
+ # Clear the gathered gradient to free memory
135
+ state.gathered_grad = None
136
+
137
+ u = state.computed_u
138
  scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
139
+ scatter_list = [s.contiguous() for s in scatter_list]
140
  else:
141
  scatter_list = None
142
 
143
+ u_received = torch.empty_like(p.to_local())
144
  torch.distributed.scatter(
145
+ u_received,
146
  scatter_list=scatter_list,
147
  src=state.worker_rank,
148
  group=state.process_group,
149
  )
150
+ u_dtensor = DTensor.from_local(
151
+ u_received,
 
 
 
152
  placements=p.placements,
153
  device_mesh=p.device_mesh,
154
  )
155
+
156
+ state.scattered_u = u_dtensor
157
+ state.scatter_event = torch.cuda.Event()
158
+ state.scatter_event.record()
159
+
160
+
161
+ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
162
+ compute_stream):
163
+ """
164
+ Update sharded parameter p with the scattered_u.
165
+ Only worker_rank frees computed_u.
166
+ """
167
+ with torch.cuda.stream(compute_stream):
168
+ if state.scatter_event is None:
169
+ raise RuntimeError("Scatter event must be set before update")
170
+ compute_stream.wait_event(state.scatter_event)
171
+ if rank == state.worker_rank:
172
+ # Free computed_u
173
+ state.computed_u = None
174
+
175
+ Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
176
 
177
 
178
  def default_is_muon(x, name):
 
193
  - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
194
 
195
  Arguments:
196
+ model: The model to be optimized by Muon.
197
+ is_muon_func: A function that takes a parameter and its name, and returns whether the parameter should be optimized by Muon.
198
  lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
199
  momentum: The momentum used by the internal SGD. (0.95 is a good default)
200
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
201
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
202
+ weight_decay: The weight decay for Muon and AdamW.
203
  Parameters that are {0, 1}-D, or detected as being the embed or lm_head, will be optimized by AdamW as well.
204
  adamw_lr: The learning rate for the internal AdamW.
205
  adamw_betas: The betas for the internal AdamW.
206
  adamw_eps: The epsilon for the internal AdamW.
207
+ none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
208
+ debug: Whether to print debug information.
209
  """
210
 
211
  def __init__(
 
281
  """
282
  Get the shard mesh for a parameter p on the given rank.
283
  """
284
+ assert isinstance(
285
+ p, DTensor), "Parallel Muon only supports DTensor parameters."
286
 
287
+ if p.placements == (Shard(dim=0), ):
288
  # Case for FSDP
289
  return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
290
  elif p.placements == (Replicate(), Shard(dim=0)):
 
311
  total_flops += flops
312
 
313
  if self.debug:
314
+ print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
315
+ flush=True)
316
 
317
+ ordered_params = sorted(params,
318
+ key=lambda p: param_to_flops[id(p)],
319
+ reverse=True)
320
 
321
  round_robin = 0
322
  mesh = None
 
360
 
361
  u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
362
 
 
363
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
364
+ Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
 
 
 
 
 
365
 
366
  def _update_g(self, p, g, group, momentum):
367
  # calc update
 
376
  g = buf
377
  return g
378
 
379
+ @staticmethod
380
+ def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
381
  # apply weight decay
382
  p.data.mul_(1 - lr * weight_decay)
383
  # apply update
 
405
  p.grad = g
406
 
407
  param_to_state, ordered_params = self.init_state_and_assign_params(
408
+ params, group)
 
409
 
410
  def enqueue_gathers(start_idx, chunk_size):
411
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
412
  state = param_to_state[id(p)]
413
+ _gather(p, state, self.rank, self.comm_stream,
414
+ group["none_grad"])
415
 
416
  def enqueue_computes(start_idx, chunk_size):
417
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
418
  state = param_to_state[id(p)]
419
+ _compute_u(state, group["ns_steps"], self.rank,
420
+ self.compute_stream)
421
 
422
  def enqueue_scatters(start_idx, chunk_size):
423
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
424
+ state = param_to_state[id(p)]
425
+ _scatter(p, state, self.rank, self.comm_stream)
426
+
427
+ def enqueue_update_param(start_idx, chunk_size):
428
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
429
  state = param_to_state[id(p)]
430
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
431
+ _update_param(p, state, lr, adjusted_lr, weight_decay,
432
+ self.rank, self.compute_stream)
 
433
 
434
+ chunk_size = dist.get_world_size(param_to_state[id(
435
+ params[0])].process_group)
436
 
437
  # Wait grad update
438
  self.comm_stream.wait_stream(torch.cuda.current_stream())
 
440
  enqueue_gathers(0, chunk_size)
441
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
442
  enqueue_computes(i, chunk_size)
443
+ if i > 0:
444
+ enqueue_update_param(i - chunk_size, chunk_size)
445
  enqueue_gathers(i + chunk_size, chunk_size)
446
  enqueue_scatters(i, chunk_size)
447
+ enqueue_update_param(i, chunk_size)
448
 
449
+ # Wait for the last update_param to finish
450
+ torch.cuda.current_stream().wait_stream(self.compute_stream)
451
 
452
  def step(self, closure=None):
453
  """Perform a single optimization step.
 
482
  continue
483
  if isinstance(p.data, DTensor):
484
  if all(
485
+ isinstance(placement, Replicate)
486
+ for placement in p.placements):
487
  param_tensors.append(p)
488
  else:
489
  param_dtensors.append(p)
490
  elif isinstance(p.data, torch.Tensor):
491
  param_tensors.append(p)
492
  else:
493
+ raise TypeError(
494
+ f"Unsupported parameter type: {type(p.data)}")
495
 
496
  if self.debug:
497
  print(
 
526
  # AdamW backup #
527
  ############################
528
 
529
+ params = [
530
+ p for p in group["params"] if not self.state[p]["use_muon"]
531
+ ]
532
  lr = group["lr"]
533
  beta1, beta2 = group["adamw_betas"]
534
  eps = group["adamw_eps"]
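
The new update_p stage turns the per-parameter pipeline into a four-step, event-ordered chain: gather on the comm stream, Newton-Schulz compute on the compute stream after the gather event, scatter back on the comm stream after the compute event, and now the parameter update on the compute stream after the scatter event. The sketch below reproduces only that ordering pattern with plain tensors and the real torch.cuda stream/event API; the stage bodies are stand-ins, not the functions from muon.py, and it assumes a CUDA device is available.

import torch

def ordered_stages():
    comm = torch.cuda.Stream()
    compute = torch.cuda.Stream()
    x = torch.randn(1024, 1024, device="cuda")

    # Stage 1: "gather" on the comm stream, then record an event.
    with torch.cuda.stream(comm):
        gathered = x * 2
        gather_event = torch.cuda.Event()
        gather_event.record()

    # Stage 2: "compute" waits for the gather before running the heavy math (here: a matmul).
    with torch.cuda.stream(compute):
        compute.wait_event(gather_event)
        u = gathered @ gathered.t()
        compute_event = torch.cuda.Event()
        compute_event.record()

    # Stage 3: "scatter" on the comm stream waits for the compute.
    with torch.cuda.stream(comm):
        comm.wait_event(compute_event)
        scattered = u[:512]
        scatter_event = torch.cuda.Event()
        scatter_event.record()

    # Stage 4: the new update step waits for the scatter before touching the parameter.
    with torch.cuda.stream(compute):
        compute.wait_event(scatter_event)
        scattered.mul_(0.999)

    # Finally the default stream waits for the compute stream, as parallel() now does.
    torch.cuda.current_stream().wait_stream(compute)

if torch.cuda.is_available():
    ordered_stages()
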
build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (307 Bytes)
 
build/torch28-cxx11-cu126-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc DELETED
Binary file (23.4 kB)
 
build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_2dc97a1_dirty
3
- ops = torch.ops._optimizer_2dc97a1_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_2dc97a1_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_0c12ced_dirty
3
+ ops = torch.ops._optimizer_0c12ced_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_0c12ced_dirty::{op_name}"
build/torch28-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_2dc97a1_dirty.abi3.so → _optimizer_0c12ced_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a423eb4ab3a31c53a3326c71e34fa59fc661f8d432701e41a7de900a9c23e37c
3
  size 1824256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df5044ffb45124dfe7088ed991123724405b00285e4d8d1ba2961802f521aa0f
3
  size 1824256
build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -47,19 +47,27 @@ class _muon_state:
47
  # TODO: use Optional
48
  worker_rank: int | None = None
49
  gathered_grad: torch.Tensor | None = None
 
50
  computed_u: torch.Tensor | None = None
51
  gather_event: torch.cuda.Event | None = None
52
  compute_event: torch.cuda.Event | None = None
 
53
  process_group = None
54
 
55
 
56
  @torch.no_grad()
57
  def _gather(p, state, rank, comm_stream, none_grad):
 
 
 
 
58
  g = p.grad
59
 
60
  if rank == state.worker_rank:
61
  num_ranks = dist.get_world_size(group=state.process_group)
62
- gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
 
 
63
  else:
64
  gather_list = None
65
 
@@ -73,8 +81,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
73
  if rank == state.worker_rank:
74
  if state.gathered_grad is not None:
75
  raise RuntimeError(
76
- "Gather event already exists, which should not happen."
77
- )
78
  state.gathered_grad = torch.cat(gather_list, dim=0)
79
  state.gather_event = torch.cuda.Event()
80
  state.gather_event.record()
@@ -82,11 +89,21 @@ def _gather(p, state, rank, comm_stream, none_grad):
82
  state.gathered_grad = None
83
  state.gather_event = None
84
  if none_grad:
 
 
 
 
 
 
 
85
  p.grad = None
86
 
87
 
88
  @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
 
 
 
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
92
  if state.gather_event is None:
@@ -96,16 +113,16 @@ def _compute_u(state, steps, rank, compute_stream):
96
  state.computed_u = u
97
  state.compute_event = torch.cuda.Event()
98
  state.compute_event.record()
99
- # Clear the gathered gradient to free memory
100
- state.gathered_grad = None
101
  else:
102
  state.computed_u = None
103
  state.compute_event = None
104
 
105
 
106
  @torch.no_grad()
107
- def _scatter(p, state, lr, weight_decay, rank, comm_stream):
108
- u = state.computed_u
 
 
109
 
110
  with torch.cuda.stream(comm_stream):
111
  if rank == state.worker_rank:
@@ -113,27 +130,49 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
113
  if state.compute_event is None:
114
  raise RuntimeError("Compute event must be set before scatter.")
115
  comm_stream.wait_event(state.compute_event)
 
 
 
 
 
116
  scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
 
117
  else:
118
  scatter_list = None
119
 
120
- u = torch.empty_like(p.to_local())
121
  torch.distributed.scatter(
122
- u,
123
  scatter_list=scatter_list,
124
  src=state.worker_rank,
125
  group=state.process_group,
126
  )
127
- if rank == state.worker_rank:
128
- # Clear u to free memory
129
- state.computed_u = None
130
- u = DTensor.from_local(
131
- u,
132
  placements=p.placements,
133
  device_mesh=p.device_mesh,
134
  )
135
- p.data.mul_(1 - lr * weight_decay)
136
- p.data.add_(u, alpha=-lr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
 
139
  def default_is_muon(x, name):
@@ -154,17 +193,19 @@ class Muon(torch.optim.Optimizer):
154
  - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
155
 
156
  Arguments:
157
- muon_params: The parameters to be optimized by Muon.
 
158
  lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
159
  momentum: The momentum used by the internal SGD. (0.95 is a good default)
160
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
161
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
162
- adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are
163
  {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
164
  adamw_lr: The learning rate for the internal AdamW.
165
  adamw_betas: The betas for the internal AdamW.
166
  adamw_eps: The epsilon for the internal AdamW.
167
- adamw_weight_decay: The weight decay for the internal AdamW.
 
168
  """
169
 
170
  def __init__(
@@ -240,9 +281,10 @@ class Muon(torch.optim.Optimizer):
240
  """
241
  Get the shard mesh for a parameter p on the given rank.
242
  """
243
- assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
 
244
 
245
- if p.placements == (Shard(dim=0),):
246
  # Case for FSDP
247
  return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
248
  elif p.placements == (Replicate(), Shard(dim=0)):
@@ -269,11 +311,12 @@ class Muon(torch.optim.Optimizer):
269
  total_flops += flops
270
 
271
  if self.debug:
272
- print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
 
273
 
274
- ordered_params = sorted(
275
- params, key=lambda p: param_to_flops[id(p)], reverse=True
276
- )
277
 
278
  round_robin = 0
279
  mesh = None
@@ -317,14 +360,8 @@ class Muon(torch.optim.Optimizer):
317
 
318
  u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
319
 
320
- # scale update
321
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
322
-
323
- # apply weight decay
324
- p.data.mul_(1 - lr * weight_decay)
325
-
326
- # apply update
327
- p.data.add_(u, alpha=-adjusted_lr)
328
 
329
  def _update_g(self, p, g, group, momentum):
330
  # calc update
@@ -339,9 +376,8 @@ class Muon(torch.optim.Optimizer):
339
  g = buf
340
  return g
341
 
342
- def _update_p(self, p, u, lr, weight_decay):
343
- # scale update
344
- adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
345
  # apply weight decay
346
  p.data.mul_(1 - lr * weight_decay)
347
  # apply update
@@ -369,28 +405,34 @@ class Muon(torch.optim.Optimizer):
369
  p.grad = g
370
 
371
  param_to_state, ordered_params = self.init_state_and_assign_params(
372
- params, group
373
- )
374
 
375
  def enqueue_gathers(start_idx, chunk_size):
376
- for p in ordered_params[start_idx : start_idx + chunk_size]:
377
  state = param_to_state[id(p)]
378
- _gather(p, state, self.rank, self.comm_stream, group["none_grad"])
 
379
 
380
  def enqueue_computes(start_idx, chunk_size):
381
- for p in ordered_params[start_idx : start_idx + chunk_size]:
382
  state = param_to_state[id(p)]
383
- _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
 
384
 
385
  def enqueue_scatters(start_idx, chunk_size):
386
- for p in ordered_params[start_idx : start_idx + chunk_size]:
 
 
 
 
 
387
  state = param_to_state[id(p)]
388
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
389
- _scatter(
390
- p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
391
- )
392
 
393
- chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
 
394
 
395
  # Wait grad update
396
  self.comm_stream.wait_stream(torch.cuda.current_stream())
@@ -398,10 +440,14 @@ class Muon(torch.optim.Optimizer):
398
  enqueue_gathers(0, chunk_size)
399
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
400
  enqueue_computes(i, chunk_size)
 
 
401
  enqueue_gathers(i + chunk_size, chunk_size)
402
  enqueue_scatters(i, chunk_size)
 
403
 
404
- torch.cuda.current_stream().wait_stream(self.comm_stream)
 
405
 
406
  def step(self, closure=None):
407
  """Perform a single optimization step.
@@ -436,15 +482,16 @@ class Muon(torch.optim.Optimizer):
436
  continue
437
  if isinstance(p.data, DTensor):
438
  if all(
439
- isinstance(placement, Replicate) for placement in p.placements
440
- ):
441
  param_tensors.append(p)
442
  else:
443
  param_dtensors.append(p)
444
  elif isinstance(p.data, torch.Tensor):
445
  param_tensors.append(p)
446
  else:
447
- raise TypeError(f"Unsupported parameter type: {type(p.data)}")
 
448
 
449
  if self.debug:
450
  print(
@@ -479,7 +526,9 @@ class Muon(torch.optim.Optimizer):
479
  # AdamW backup #
480
  ############################
481
 
482
- params = [p for p in group["params"] if not self.state[p]["use_muon"]]
 
 
483
  lr = group["lr"]
484
  beta1, beta2 = group["adamw_betas"]
485
  eps = group["adamw_eps"]
 
47
  # TODO: use Optional
48
  worker_rank: int | None = None
49
  gathered_grad: torch.Tensor | None = None
50
+ scattered_u: DTensor | None = None
51
  computed_u: torch.Tensor | None = None
52
  gather_event: torch.cuda.Event | None = None
53
  compute_event: torch.cuda.Event | None = None
54
+ scatter_event: torch.cuda.Event | None = None
55
  process_group = None
56
 
57
 
58
  @torch.no_grad()
59
  def _gather(p, state, rank, comm_stream, none_grad):
60
+ """
61
+ Gather the gradients to worker_rank.
62
+ If none_grad is True, free p.grad after the gather.
63
+ """
64
  g = p.grad
65
 
66
  if rank == state.worker_rank:
67
  num_ranks = dist.get_world_size(group=state.process_group)
68
+ gather_list = [
69
+ torch.empty_like(g.to_local()) for _ in range(num_ranks)
70
+ ]
71
  else:
72
  gather_list = None
73
 
 
81
  if rank == state.worker_rank:
82
  if state.gathered_grad is not None:
83
  raise RuntimeError(
84
+ "Gather event already exists, which should not happen.")
 
85
  state.gathered_grad = torch.cat(gather_list, dim=0)
86
  state.gather_event = torch.cuda.Event()
87
  state.gather_event.record()
 
89
  state.gathered_grad = None
90
  state.gather_event = None
91
  if none_grad:
92
+ # We can safely free p.grad without calling record_stream:
93
+ # p.grad.to_local().record_stream(comm_stream)
94
+ # Explanation:
95
+ # 1. p.grad is created on the default stream, but the default stream
96
+ # is synchronized with the comm stream later.
97
+ # 2. There is no further activity on the default stream before the optimizer finishes.
98
+ # Therefore, it is safe to free p.grad directly on the comm stream.
99
  p.grad = None
100
 
101
 
102
  @torch.no_grad()
103
  def _compute_u(state, steps, rank, compute_stream):
104
+ """
105
+ On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
106
+ """
107
  with torch.cuda.stream(compute_stream):
108
  if rank == state.worker_rank:
109
  if state.gather_event is None:
 
113
  state.computed_u = u
114
  state.compute_event = torch.cuda.Event()
115
  state.compute_event.record()
 
 
116
  else:
117
  state.computed_u = None
118
  state.compute_event = None
119
 
120
 
121
  @torch.no_grad()
122
+ def _scatter(p, state, rank, comm_stream):
123
+ """
124
+ Scatter the computed_u from worker_rank to all ranks.
125
+ """
126
 
127
  with torch.cuda.stream(comm_stream):
128
  if rank == state.worker_rank:
 
130
  if state.compute_event is None:
131
  raise RuntimeError("Compute event must be set before scatter.")
132
  comm_stream.wait_event(state.compute_event)
133
+
134
+ # Clear the gathered gradient to free memory
135
+ state.gathered_grad = None
136
+
137
+ u = state.computed_u
138
  scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
139
+ scatter_list = [s.contiguous() for s in scatter_list]
140
  else:
141
  scatter_list = None
142
 
143
+ u_received = torch.empty_like(p.to_local())
144
  torch.distributed.scatter(
145
+ u_received,
146
  scatter_list=scatter_list,
147
  src=state.worker_rank,
148
  group=state.process_group,
149
  )
150
+ u_dtensor = DTensor.from_local(
151
+ u_received,
 
 
 
152
  placements=p.placements,
153
  device_mesh=p.device_mesh,
154
  )
155
+
156
+ state.scattered_u = u_dtensor
157
+ state.scatter_event = torch.cuda.Event()
158
+ state.scatter_event.record()
159
+
160
+
161
+ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
162
+ compute_stream):
163
+ """
164
+ Update sharded parameter p with the scattered_u.
165
+ Only worker_rank frees computed_u.
166
+ """
167
+ with torch.cuda.stream(compute_stream):
168
+ if state.scatter_event is None:
169
+ raise RuntimeError("Scatter event must be set before update")
170
+ compute_stream.wait_event(state.scatter_event)
171
+ if rank == state.worker_rank:
172
+ # Free computed_u
173
+ state.computed_u = None
174
+
175
+ Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
176
 
177
 
178
  def default_is_muon(x, name):
 
193
  - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
194
 
195
  Arguments:
196
+ model: The model to be optimized by Muon.
197
+ is_muon_func: A function that takes a parameter and its name, and returns whether the parameter should be optimized by Muon.
198
  lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
199
  momentum: The momentum used by the internal SGD. (0.95 is a good default)
200
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
201
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
202
+ weight_decay: The weight decay for Muon and AdamW.
203
  Parameters that are {0, 1}-D, or detected as being the embed or lm_head, will be optimized by AdamW as well.
204
  adamw_lr: The learning rate for the internal AdamW.
205
  adamw_betas: The betas for the internal AdamW.
206
  adamw_eps: The epsilon for the internal AdamW.
207
+ none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
208
+ debug: Whether to print debug information.
209
  """
210
 
211
  def __init__(
 
281
  """
282
  Get the shard mesh for a parameter p on the given rank.
283
  """
284
+ assert isinstance(
285
+ p, DTensor), "Parallel Muon only supports DTensor parameters."
286
 
287
+ if p.placements == (Shard(dim=0), ):
288
  # Case for FSDP
289
  return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
290
  elif p.placements == (Replicate(), Shard(dim=0)):
 
311
  total_flops += flops
312
 
313
  if self.debug:
314
+ print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
315
+ flush=True)
316
 
317
+ ordered_params = sorted(params,
318
+ key=lambda p: param_to_flops[id(p)],
319
+ reverse=True)
320
 
321
  round_robin = 0
322
  mesh = None
 
360
 
361
  u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
362
 
 
363
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
364
+ Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
 
 
 
 
 
365
 
366
  def _update_g(self, p, g, group, momentum):
367
  # calc update
 
376
  g = buf
377
  return g
378
 
379
+ @staticmethod
380
+ def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
381
  # apply weight decay
382
  p.data.mul_(1 - lr * weight_decay)
383
  # apply update
 
405
  p.grad = g
406
 
407
  param_to_state, ordered_params = self.init_state_and_assign_params(
408
+ params, group)
 
409
 
410
  def enqueue_gathers(start_idx, chunk_size):
411
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
412
  state = param_to_state[id(p)]
413
+ _gather(p, state, self.rank, self.comm_stream,
414
+ group["none_grad"])
415
 
416
  def enqueue_computes(start_idx, chunk_size):
417
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
418
  state = param_to_state[id(p)]
419
+ _compute_u(state, group["ns_steps"], self.rank,
420
+ self.compute_stream)
421
 
422
  def enqueue_scatters(start_idx, chunk_size):
423
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
424
+ state = param_to_state[id(p)]
425
+ _scatter(p, state, self.rank, self.comm_stream)
426
+
427
+ def enqueue_update_param(start_idx, chunk_size):
428
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
429
  state = param_to_state[id(p)]
430
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
431
+ _update_param(p, state, lr, adjusted_lr, weight_decay,
432
+ self.rank, self.compute_stream)
 
433
 
434
+ chunk_size = dist.get_world_size(param_to_state[id(
435
+ params[0])].process_group)
436
 
437
  # Wait grad update
438
  self.comm_stream.wait_stream(torch.cuda.current_stream())
 
440
  enqueue_gathers(0, chunk_size)
441
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
442
  enqueue_computes(i, chunk_size)
443
+ if i > 0:
444
+ enqueue_update_param(i - chunk_size, chunk_size)
445
  enqueue_gathers(i + chunk_size, chunk_size)
446
  enqueue_scatters(i, chunk_size)
447
+ enqueue_update_param(i, chunk_size)
448
 
449
+ # Wait for the last update_param to finish
450
+ torch.cuda.current_stream().wait_stream(self.compute_stream)
451
 
452
  def step(self, closure=None):
453
  """Perform a single optimization step.
 
482
  continue
483
  if isinstance(p.data, DTensor):
484
  if all(
485
+ isinstance(placement, Replicate)
486
+ for placement in p.placements):
487
  param_tensors.append(p)
488
  else:
489
  param_dtensors.append(p)
490
  elif isinstance(p.data, torch.Tensor):
491
  param_tensors.append(p)
492
  else:
493
+ raise TypeError(
494
+ f"Unsupported parameter type: {type(p.data)}")
495
 
496
  if self.debug:
497
  print(
 
526
  # AdamW backup #
527
  ############################
528
 
529
+ params = [
530
+ p for p in group["params"] if not self.state[p]["use_muon"]
531
+ ]
532
  lr = group["lr"]
533
  beta1, beta2 = group["adamw_betas"]
534
  eps = group["adamw_eps"]
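
The refactored Muon._update_p shown above boils down to decoupled weight decay followed by the scaled orthogonalized update, p ← p·(1 − lr·λ) − adjusted_lr·u. Here is a tiny CPU-only check of that arithmetic, with made-up values for lr, adjusted_lr and weight_decay (the real adjusted_lr comes from adjust_lr_for_muon, whose formula is not part of this diff):

import torch

def update_p(p, u, lr, adjusted_lr, weight_decay):
    # Mirrors the two in-place ops in Muon._update_p.
    p.mul_(1 - lr * weight_decay)   # decoupled weight decay
    p.add_(u, alpha=-adjusted_lr)   # apply the orthogonalized update

p = torch.ones(2, 3)
u = torch.full((2, 3), 0.5)
update_p(p, u, lr=0.02, adjusted_lr=0.01, weight_decay=0.1)

# 1 * (1 - 0.02 * 0.1) - 0.01 * 0.5 = 0.998 - 0.005 = 0.993
print(torch.allclose(p, torch.full((2, 3), 0.993)))  # True
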
build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (307 Bytes)
 
build/torch28-cxx11-cu128-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc DELETED
Binary file (23.4 kB)
 
build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_2dc97a1_dirty
3
- ops = torch.ops._optimizer_2dc97a1_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_2dc97a1_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_0c12ced_dirty
3
+ ops = torch.ops._optimizer_0c12ced_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_0c12ced_dirty::{op_name}"
build/{torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_2dc97a1_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_0c12ced_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e6bab72b965f42d466cd74bbda49851549f2810278e642cef8738e40de4fdc5
3
  size 1883352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80cb3ac21d3afafe368f31318c31a4c6356b53bbc2186ae81b79e1eb3ff441f5
3
  size 1883352
build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED
@@ -47,19 +47,27 @@ class _muon_state:
47
  # TODO: use Optional
48
  worker_rank: int | None = None
49
  gathered_grad: torch.Tensor | None = None
 
50
  computed_u: torch.Tensor | None = None
51
  gather_event: torch.cuda.Event | None = None
52
  compute_event: torch.cuda.Event | None = None
 
53
  process_group = None
54
 
55
 
56
  @torch.no_grad()
57
  def _gather(p, state, rank, comm_stream, none_grad):
 
 
 
 
58
  g = p.grad
59
 
60
  if rank == state.worker_rank:
61
  num_ranks = dist.get_world_size(group=state.process_group)
62
- gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
 
 
63
  else:
64
  gather_list = None
65
 
@@ -73,8 +81,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
73
  if rank == state.worker_rank:
74
  if state.gathered_grad is not None:
75
  raise RuntimeError(
76
- "Gather event already exists, which should not happen."
77
- )
78
  state.gathered_grad = torch.cat(gather_list, dim=0)
79
  state.gather_event = torch.cuda.Event()
80
  state.gather_event.record()
@@ -82,11 +89,21 @@ def _gather(p, state, rank, comm_stream, none_grad):
82
  state.gathered_grad = None
83
  state.gather_event = None
84
  if none_grad:
 
 
 
 
 
 
 
85
  p.grad = None
86
 
87
 
88
  @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
 
 
 
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
92
  if state.gather_event is None:
@@ -96,16 +113,16 @@ def _compute_u(state, steps, rank, compute_stream):
96
  state.computed_u = u
97
  state.compute_event = torch.cuda.Event()
98
  state.compute_event.record()
99
- # Clear the gathered gradient to free memory
100
- state.gathered_grad = None
101
  else:
102
  state.computed_u = None
103
  state.compute_event = None
104
 
105
 
106
  @torch.no_grad()
107
- def _scatter(p, state, lr, weight_decay, rank, comm_stream):
108
- u = state.computed_u
 
 
109
 
110
  with torch.cuda.stream(comm_stream):
111
  if rank == state.worker_rank:
@@ -113,27 +130,49 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
113
  if state.compute_event is None:
114
  raise RuntimeError("Compute event must be set before scatter.")
115
  comm_stream.wait_event(state.compute_event)
 
 
 
 
 
116
  scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
 
117
  else:
118
  scatter_list = None
119
 
120
- u = torch.empty_like(p.to_local())
121
  torch.distributed.scatter(
122
- u,
123
  scatter_list=scatter_list,
124
  src=state.worker_rank,
125
  group=state.process_group,
126
  )
127
- if rank == state.worker_rank:
128
- # Clear u to free memory
129
- state.computed_u = None
130
- u = DTensor.from_local(
131
- u,
132
  placements=p.placements,
133
  device_mesh=p.device_mesh,
134
  )
135
- p.data.mul_(1 - lr * weight_decay)
136
- p.data.add_(u, alpha=-lr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
 
139
  def default_is_muon(x, name):
@@ -154,17 +193,19 @@ class Muon(torch.optim.Optimizer):
154
  - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
155
 
156
  Arguments:
157
- muon_params: The parameters to be optimized by Muon.
 
158
  lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
159
  momentum: The momentum used by the internal SGD. (0.95 is a good default)
160
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
161
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
162
- adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are
163
  {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
164
  adamw_lr: The learning rate for the internal AdamW.
165
  adamw_betas: The betas for the internal AdamW.
166
  adamw_eps: The epsilon for the internal AdamW.
167
- adamw_weight_decay: The weight decay for the internal AdamW.
 
168
  """
169
 
170
  def __init__(
@@ -240,9 +281,10 @@ class Muon(torch.optim.Optimizer):
240
  """
241
  Get the shard mesh for a parameter p on the given rank.
242
  """
243
- assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
 
244
 
245
- if p.placements == (Shard(dim=0),):
246
  # Case for FSDP
247
  return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
248
  elif p.placements == (Replicate(), Shard(dim=0)):
@@ -269,11 +311,12 @@ class Muon(torch.optim.Optimizer):
269
  total_flops += flops
270
 
271
  if self.debug:
272
- print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
 
273
 
274
- ordered_params = sorted(
275
- params, key=lambda p: param_to_flops[id(p)], reverse=True
276
- )
277
 
278
  round_robin = 0
279
  mesh = None
@@ -317,14 +360,8 @@ class Muon(torch.optim.Optimizer):
317
 
318
  u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
319
 
320
- # scale update
321
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
322
-
323
- # apply weight decay
324
- p.data.mul_(1 - lr * weight_decay)
325
-
326
- # apply update
327
- p.data.add_(u, alpha=-adjusted_lr)
328
 
329
  def _update_g(self, p, g, group, momentum):
330
  # calc update
@@ -339,9 +376,8 @@ class Muon(torch.optim.Optimizer):
339
  g = buf
340
  return g
341
 
342
- def _update_p(self, p, u, lr, weight_decay):
343
- # scale update
344
- adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
345
  # apply weight decay
346
  p.data.mul_(1 - lr * weight_decay)
347
  # apply update
@@ -369,28 +405,34 @@ class Muon(torch.optim.Optimizer):
369
  p.grad = g
370
 
371
  param_to_state, ordered_params = self.init_state_and_assign_params(
372
- params, group
373
- )
374
 
375
  def enqueue_gathers(start_idx, chunk_size):
376
- for p in ordered_params[start_idx : start_idx + chunk_size]:
377
  state = param_to_state[id(p)]
378
- _gather(p, state, self.rank, self.comm_stream, group["none_grad"])
 
379
 
380
  def enqueue_computes(start_idx, chunk_size):
381
- for p in ordered_params[start_idx : start_idx + chunk_size]:
382
  state = param_to_state[id(p)]
383
- _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
 
384
 
385
  def enqueue_scatters(start_idx, chunk_size):
386
- for p in ordered_params[start_idx : start_idx + chunk_size]:
 
 
 
 
 
387
  state = param_to_state[id(p)]
388
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
389
- _scatter(
390
- p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
391
- )
392
 
393
- chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
 
394
 
395
  # Wait grad update
396
  self.comm_stream.wait_stream(torch.cuda.current_stream())
@@ -398,10 +440,14 @@ class Muon(torch.optim.Optimizer):
398
  enqueue_gathers(0, chunk_size)
399
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
400
  enqueue_computes(i, chunk_size)
 
 
401
  enqueue_gathers(i + chunk_size, chunk_size)
402
  enqueue_scatters(i, chunk_size)
 
403
 
404
- torch.cuda.current_stream().wait_stream(self.comm_stream)
 
405
 
406
  def step(self, closure=None):
407
  """Perform a single optimization step.
@@ -436,15 +482,16 @@ class Muon(torch.optim.Optimizer):
436
  continue
437
  if isinstance(p.data, DTensor):
438
  if all(
439
- isinstance(placement, Replicate) for placement in p.placements
440
- ):
441
  param_tensors.append(p)
442
  else:
443
  param_dtensors.append(p)
444
  elif isinstance(p.data, torch.Tensor):
445
  param_tensors.append(p)
446
  else:
447
- raise TypeError(f"Unsupported parameter type: {type(p.data)}")
 
448
 
449
  if self.debug:
450
  print(
@@ -479,7 +526,9 @@ class Muon(torch.optim.Optimizer):
479
  # AdamW backup #
480
  ############################
481
 
482
- params = [p for p in group["params"] if not self.state[p]["use_muon"]]
 
 
483
  lr = group["lr"]
484
  beta1, beta2 = group["adamw_betas"]
485
  eps = group["adamw_eps"]
 
47
  # TODO: use Optional
48
  worker_rank: int | None = None
49
  gathered_grad: torch.Tensor | None = None
50
+ scattered_u: DTensor | None = None
51
  computed_u: torch.Tensor | None = None
52
  gather_event: torch.cuda.Event | None = None
53
  compute_event: torch.cuda.Event | None = None
54
+ scatter_event: torch.cuda.Event | None = None
55
  process_group = None
56
 
57
 
58
  @torch.no_grad()
59
  def _gather(p, state, rank, comm_stream, none_grad):
60
+ """
61
+ Gather the gradients to worker_rank.
62
+ If none_grad is True, free p.grad after the gather.
63
+ """
64
  g = p.grad
65
 
66
  if rank == state.worker_rank:
67
  num_ranks = dist.get_world_size(group=state.process_group)
68
+ gather_list = [
69
+ torch.empty_like(g.to_local()) for _ in range(num_ranks)
70
+ ]
71
  else:
72
  gather_list = None
73
 
 
81
  if rank == state.worker_rank:
82
  if state.gathered_grad is not None:
83
  raise RuntimeError(
84
+ "Gather event already exists, which should not happen.")
 
85
  state.gathered_grad = torch.cat(gather_list, dim=0)
86
  state.gather_event = torch.cuda.Event()
87
  state.gather_event.record()
 
89
  state.gathered_grad = None
90
  state.gather_event = None
91
  if none_grad:
92
+ # We can safely free p.grad without calling record_stream:
93
+ # p.grad.to_local().record_stream(comm_stream)
94
+ # Explanation:
95
+ # 1. p.grad is created on the default stream, but the default stream
96
+ # is synchronized with the comm stream later.
97
+ # 2. There is no further activity on the default stream before the optimizer finishes.
98
+ # Therefore, it is safe to free p.grad directly on the comm stream.
99
  p.grad = None
100
 
101
 
102
  @torch.no_grad()
103
  def _compute_u(state, steps, rank, compute_stream):
104
+ """
105
+ On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
106
+ """
107
  with torch.cuda.stream(compute_stream):
108
  if rank == state.worker_rank:
109
  if state.gather_event is None:
 
113
  state.computed_u = u
114
  state.compute_event = torch.cuda.Event()
115
  state.compute_event.record()
 
 
116
  else:
117
  state.computed_u = None
118
  state.compute_event = None
119
 
120
 
121
  @torch.no_grad()
122
+ def _scatter(p, state, rank, comm_stream):
123
+ """
124
+ Scatter the computed_u from worker_rank to all ranks.
125
+ """
126
 
127
  with torch.cuda.stream(comm_stream):
128
  if rank == state.worker_rank:
 
130
  if state.compute_event is None:
131
  raise RuntimeError("Compute event must be set before scatter.")
132
  comm_stream.wait_event(state.compute_event)
133
+
134
+ # Clear the gathered gradient to free memory
135
+ state.gathered_grad = None
136
+
137
+ u = state.computed_u
138
  scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
139
+ scatter_list = [s.contiguous() for s in scatter_list]
140
  else:
141
  scatter_list = None
142
 
143
+ u_received = torch.empty_like(p.to_local())
144
  torch.distributed.scatter(
145
+ u_received,
146
  scatter_list=scatter_list,
147
  src=state.worker_rank,
148
  group=state.process_group,
149
  )
150
+ u_dtensor = DTensor.from_local(
151
+ u_received,
 
 
 
152
  placements=p.placements,
153
  device_mesh=p.device_mesh,
154
  )
155
+
156
+ state.scattered_u = u_dtensor
157
+ state.scatter_event = torch.cuda.Event()
158
+ state.scatter_event.record()
159
+
160
+
161
+ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
162
+ compute_stream):
163
+ """
164
+ Update sharded parameter p with the scattered_u.
165
+ Only worker_rank frees computed_u.
166
+ """
167
+ with torch.cuda.stream(compute_stream):
168
+ if state.scatter_event is None:
169
+ raise RuntimeError("Scatter event must be set before update")
170
+ compute_stream.wait_event(state.scatter_event)
171
+ if rank == state.worker_rank:
172
+ # Free computed_u
173
+ state.computed_u = None
174
+
175
+ Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
176
 
177
 
178
  def default_is_muon(x, name):
 
193
  - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
194
 
195
  Arguments:
196
+ model: The model to be optimized by Muon.
197
+ is_muon_func: A function that takes a parameter and its name, and returns whether the parameter should be optimized by Muon.
198
  lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
199
  momentum: The momentum used by the internal SGD. (0.95 is a good default)
200
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
201
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
202
+ weight_decay: The weight decay for Muon and AdamW.
203
  Parameters that are {0, 1}-D, or detected as being the embed or lm_head, will be optimized by AdamW as well.
204
  adamw_lr: The learning rate for the internal AdamW.
205
  adamw_betas: The betas for the internal AdamW.
206
  adamw_eps: The epsilon for the internal AdamW.
207
+ none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
208
+ debug: Whether to print debug information.
209
  """
210
 
211
  def __init__(
 
281
  """
282
  Get the shard mesh for a parameter p on the given rank.
283
  """
284
+ assert isinstance(
285
+ p, DTensor), "Parallel Muon only supports DTensor parameters."
286
 
287
+ if p.placements == (Shard(dim=0), ):
288
  # Case for FSDP
289
  return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
290
  elif p.placements == (Replicate(), Shard(dim=0)):
 
311
  total_flops += flops
312
 
313
  if self.debug:
314
+ print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
315
+ flush=True)
316
 
317
+ ordered_params = sorted(params,
318
+ key=lambda p: param_to_flops[id(p)],
319
+ reverse=True)
320
 
321
  round_robin = 0
322
  mesh = None
 
360
 
361
  u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
362
 
 
363
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
364
+ Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
 
 
 
 
 
365
 
366
  def _update_g(self, p, g, group, momentum):
367
  # calc update
 
376
  g = buf
377
  return g
378
 
379
+ @staticmethod
380
+ def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
381
  # apply weight decay
382
  p.data.mul_(1 - lr * weight_decay)
383
  # apply update
 
405
  p.grad = g
406
 
407
  param_to_state, ordered_params = self.init_state_and_assign_params(
408
+ params, group)
 
409
 
410
  def enqueue_gathers(start_idx, chunk_size):
411
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
412
  state = param_to_state[id(p)]
413
+ _gather(p, state, self.rank, self.comm_stream,
414
+ group["none_grad"])
415
 
416
  def enqueue_computes(start_idx, chunk_size):
417
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
418
  state = param_to_state[id(p)]
419
+ _compute_u(state, group["ns_steps"], self.rank,
420
+ self.compute_stream)
421
 
422
  def enqueue_scatters(start_idx, chunk_size):
423
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
424
+ state = param_to_state[id(p)]
425
+ _scatter(p, state, self.rank, self.comm_stream)
426
+
427
+ def enqueue_update_param(start_idx, chunk_size):
428
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
429
  state = param_to_state[id(p)]
430
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
431
+ _update_param(p, state, lr, adjusted_lr, weight_decay,
432
+ self.rank, self.compute_stream)
 
433
 
434
+ chunk_size = dist.get_world_size(param_to_state[id(
435
+ params[0])].process_group)
436
 
437
  # Wait grad update
438
  self.comm_stream.wait_stream(torch.cuda.current_stream())
 
440
  enqueue_gathers(0, chunk_size)
441
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
442
  enqueue_computes(i, chunk_size)
443
+ if i > 0:
444
+ enqueue_update_param(i - chunk_size, chunk_size)
445
  enqueue_gathers(i + chunk_size, chunk_size)
446
  enqueue_scatters(i, chunk_size)
447
+ enqueue_update_param(i, chunk_size)
448
 
449
+ # Wait for the last update_param to finish
450
+ torch.cuda.current_stream().wait_stream(self.compute_stream)
451
 
452
  def step(self, closure=None):
453
  """Perform a single optimization step.
 
482
  continue
483
  if isinstance(p.data, DTensor):
484
  if all(
485
+ isinstance(placement, Replicate)
486
+ for placement in p.placements):
487
  param_tensors.append(p)
488
  else:
489
  param_dtensors.append(p)
490
  elif isinstance(p.data, torch.Tensor):
491
  param_tensors.append(p)
492
  else:
493
+ raise TypeError(
494
+ f"Unsupported parameter type: {type(p.data)}")
495
 
496
  if self.debug:
497
  print(
 
526
  # AdamW backup #
527
  ############################
528
 
529
+ params = [
530
+ p for p in group["params"] if not self.state[p]["use_muon"]
531
+ ]
532
  lr = group["lr"]
533
  beta1, beta2 = group["adamw_betas"]
534
  eps = group["adamw_eps"]
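
Reading the new driver loop in parallel() (indentation is collapsed in this view, so this is an interpretation): gathers for the next chunk are enqueued while the current chunk is scattered, the update for the previous chunk overlaps the current compute, and a final update after the loop handles the last chunk. Below is a CPU-only dry run of that schedule, with prints standing in for the real stage functions and an invented parameter count and chunk size:

def dry_run(num_params=7, chunk_size=2):
    params = [f"p{i}" for i in range(num_params)]

    def enqueue(stage, start):
        for p in params[start:start + chunk_size]:
            print(f"{stage:>8} {p}")

    # Same loop structure as Muon.parallel() after this commit (as read from the diff).
    enqueue("gather", 0)
    for i in range(0, len(params) + chunk_size - 1, chunk_size):
        enqueue("compute", i)
        if i > 0:
            enqueue("update", i - chunk_size)   # update the previous chunk
        enqueue("gather", i + chunk_size)       # prefetch the next chunk
        enqueue("scatter", i)
    enqueue("update", i)                        # update the final chunk

dry_run()
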
build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (307 Bytes)
 
build/torch28-cxx11-cu129-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc DELETED
Binary file (23.4 kB)
 
build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_2dc97a1_dirty
3
- ops = torch.ops._optimizer_2dc97a1_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_2dc97a1_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_0c12ced_dirty
3
+ ops = torch.ops._optimizer_0c12ced_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_0c12ced_dirty::{op_name}"
build/torch28-cxx11-cu129-x86_64-linux/optimizer/{_optimizer_2dc97a1_dirty.abi3.so → _optimizer_0c12ced_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8daaad69e6958850f848fab60c9acb938c3a5e54e3ec34a1bec03a3d32653cb
3
  size 1883352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32af855517484e2695b6d83c29a03d85fcbaaea559d95cbb62fd9fa67cc3ccac
3
  size 1883352
build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py CHANGED
@@ -47,19 +47,27 @@ class _muon_state:
47
  # TODO: use Optional
48
  worker_rank: int | None = None
49
  gathered_grad: torch.Tensor | None = None
 
50
  computed_u: torch.Tensor | None = None
51
  gather_event: torch.cuda.Event | None = None
52
  compute_event: torch.cuda.Event | None = None
 
53
  process_group = None
54
 
55
 
56
  @torch.no_grad()
57
  def _gather(p, state, rank, comm_stream, none_grad):
 
 
 
 
58
  g = p.grad
59
 
60
  if rank == state.worker_rank:
61
  num_ranks = dist.get_world_size(group=state.process_group)
62
- gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
 
 
63
  else:
64
  gather_list = None
65
 
@@ -73,8 +81,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
73
  if rank == state.worker_rank:
74
  if state.gathered_grad is not None:
75
  raise RuntimeError(
76
- "Gather event already exists, which should not happen."
77
- )
78
  state.gathered_grad = torch.cat(gather_list, dim=0)
79
  state.gather_event = torch.cuda.Event()
80
  state.gather_event.record()
@@ -82,11 +89,21 @@ def _gather(p, state, rank, comm_stream, none_grad):
82
  state.gathered_grad = None
83
  state.gather_event = None
84
  if none_grad:
 
 
 
 
 
 
 
85
  p.grad = None
86
 
87
 
88
  @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
 
 
 
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
92
  if state.gather_event is None:
@@ -96,16 +113,16 @@ def _compute_u(state, steps, rank, compute_stream):
96
  state.computed_u = u
97
  state.compute_event = torch.cuda.Event()
98
  state.compute_event.record()
99
- # Clear the gathered gradient to free memory
100
- state.gathered_grad = None
101
  else:
102
  state.computed_u = None
103
  state.compute_event = None
104
 
105
 
106
  @torch.no_grad()
107
- def _scatter(p, state, lr, weight_decay, rank, comm_stream):
108
- u = state.computed_u
 
 
109
 
110
  with torch.cuda.stream(comm_stream):
111
  if rank == state.worker_rank:
@@ -113,27 +130,49 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
113
  if state.compute_event is None:
114
  raise RuntimeError("Compute event must be set before scatter.")
115
  comm_stream.wait_event(state.compute_event)
 
 
 
 
 
116
  scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
 
117
  else:
118
  scatter_list = None
119
 
120
- u = torch.empty_like(p.to_local())
121
  torch.distributed.scatter(
122
- u,
123
  scatter_list=scatter_list,
124
  src=state.worker_rank,
125
  group=state.process_group,
126
  )
127
- if rank == state.worker_rank:
128
- # Clear u to free memory
129
- state.computed_u = None
130
- u = DTensor.from_local(
131
- u,
132
  placements=p.placements,
133
  device_mesh=p.device_mesh,
134
  )
135
- p.data.mul_(1 - lr * weight_decay)
136
- p.data.add_(u, alpha=-lr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
 
139
  def default_is_muon(x, name):
@@ -154,17 +193,19 @@ class Muon(torch.optim.Optimizer):
154
  - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
155
 
156
  Arguments:
157
- muon_params: The parameters to be optimized by Muon.
 
158
  lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
159
  momentum: The momentum used by the internal SGD. (0.95 is a good default)
160
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
161
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
162
- adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are
163
  {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
164
  adamw_lr: The learning rate for the internal AdamW.
165
  adamw_betas: The betas for the internal AdamW.
166
  adamw_eps: The epsilon for the internal AdamW.
167
- adamw_weight_decay: The weight decay for the internal AdamW.
 
168
  """
169
 
170
  def __init__(
@@ -240,9 +281,10 @@ class Muon(torch.optim.Optimizer):
240
  """
241
  Get the shard mesh for a parameter p on the given rank.
242
  """
243
- assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
 
244
 
245
- if p.placements == (Shard(dim=0),):
246
  # Case for FSDP
247
  return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
248
  elif p.placements == (Replicate(), Shard(dim=0)):
@@ -269,11 +311,12 @@ class Muon(torch.optim.Optimizer):
269
  total_flops += flops
270
 
271
  if self.debug:
272
- print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
 
273
 
274
- ordered_params = sorted(
275
- params, key=lambda p: param_to_flops[id(p)], reverse=True
276
- )
277
 
278
  round_robin = 0
279
  mesh = None
@@ -317,14 +360,8 @@ class Muon(torch.optim.Optimizer):
317
 
318
  u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
319
 
320
- # scale update
321
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
322
-
323
- # apply weight decay
324
- p.data.mul_(1 - lr * weight_decay)
325
-
326
- # apply update
327
- p.data.add_(u, alpha=-adjusted_lr)
328
 
329
  def _update_g(self, p, g, group, momentum):
330
  # calc update
@@ -339,9 +376,8 @@ class Muon(torch.optim.Optimizer):
339
  g = buf
340
  return g
341
 
342
- def _update_p(self, p, u, lr, weight_decay):
343
- # scale update
344
- adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
345
  # apply weight decay
346
  p.data.mul_(1 - lr * weight_decay)
347
  # apply update
@@ -369,28 +405,34 @@ class Muon(torch.optim.Optimizer):
369
  p.grad = g
370
 
371
  param_to_state, ordered_params = self.init_state_and_assign_params(
372
- params, group
373
- )
374
 
375
  def enqueue_gathers(start_idx, chunk_size):
376
- for p in ordered_params[start_idx : start_idx + chunk_size]:
377
  state = param_to_state[id(p)]
378
- _gather(p, state, self.rank, self.comm_stream, group["none_grad"])
 
379
 
380
  def enqueue_computes(start_idx, chunk_size):
381
- for p in ordered_params[start_idx : start_idx + chunk_size]:
382
  state = param_to_state[id(p)]
383
- _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
 
384
 
385
  def enqueue_scatters(start_idx, chunk_size):
386
- for p in ordered_params[start_idx : start_idx + chunk_size]:
 
 
 
 
 
387
  state = param_to_state[id(p)]
388
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
389
- _scatter(
390
- p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
391
- )
392
 
393
- chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
 
394
 
395
  # Wait grad update
396
  self.comm_stream.wait_stream(torch.cuda.current_stream())
@@ -398,10 +440,14 @@ class Muon(torch.optim.Optimizer):
398
  enqueue_gathers(0, chunk_size)
399
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
400
  enqueue_computes(i, chunk_size)
 
 
401
  enqueue_gathers(i + chunk_size, chunk_size)
402
  enqueue_scatters(i, chunk_size)
 
403
 
404
- torch.cuda.current_stream().wait_stream(self.comm_stream)
 
405
 
406
  def step(self, closure=None):
407
  """Perform a single optimization step.
@@ -436,15 +482,16 @@ class Muon(torch.optim.Optimizer):
436
  continue
437
  if isinstance(p.data, DTensor):
438
  if all(
439
- isinstance(placement, Replicate) for placement in p.placements
440
- ):
441
  param_tensors.append(p)
442
  else:
443
  param_dtensors.append(p)
444
  elif isinstance(p.data, torch.Tensor):
445
  param_tensors.append(p)
446
  else:
447
- raise TypeError(f"Unsupported parameter type: {type(p.data)}")
 
448
 
449
  if self.debug:
450
  print(
@@ -479,7 +526,9 @@ class Muon(torch.optim.Optimizer):
479
  # AdamW backup #
480
  ############################
481
 
482
- params = [p for p in group["params"] if not self.state[p]["use_muon"]]
 
 
483
  lr = group["lr"]
484
  beta1, beta2 = group["adamw_betas"]
485
  eps = group["adamw_eps"]
 
47
  # TODO: use Optional
48
  worker_rank: int | None = None
49
  gathered_grad: torch.Tensor | None = None
50
+ scattered_u: DTensor | None = None
51
  computed_u: torch.Tensor | None = None
52
  gather_event: torch.cuda.Event | None = None
53
  compute_event: torch.cuda.Event | None = None
54
+ scatter_event: torch.cuda.Event | None = None
55
  process_group = None
56
 
57
 
58
  @torch.no_grad()
59
  def _gather(p, state, rank, comm_stream, none_grad):
60
+ """
61
+ Gather the gradients to worker_rank.
62
+ If none_grad is True, free p.grad after the gather.
63
+ """
64
  g = p.grad
65
 
66
  if rank == state.worker_rank:
67
  num_ranks = dist.get_world_size(group=state.process_group)
68
+ gather_list = [
69
+ torch.empty_like(g.to_local()) for _ in range(num_ranks)
70
+ ]
71
  else:
72
  gather_list = None
73
 
 
81
  if rank == state.worker_rank:
82
  if state.gathered_grad is not None:
83
  raise RuntimeError(
84
+ "Gather event already exists, which should not happen.")
 
85
  state.gathered_grad = torch.cat(gather_list, dim=0)
86
  state.gather_event = torch.cuda.Event()
87
  state.gather_event.record()
 
89
  state.gathered_grad = None
90
  state.gather_event = None
91
  if none_grad:
92
+ # We can safely free p.grad without calling record_stream:
93
+ # p.grad.to_local().record_stream(comm_stream)
94
+ # Explanation:
95
+ # 1. p.grad is created on the default stream, but the default stream
96
+ # is synchronized with the comm stream later.
97
+ # 2. There is no further activity on the default stream before the optimizer finishes.
98
+ # Therefore, it is safe to free p.grad directly on the comm stream.
99
  p.grad = None
100
 
101
 
102
  @torch.no_grad()
103
  def _compute_u(state, steps, rank, compute_stream):
104
+ """
105
+ On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
106
+ """
107
  with torch.cuda.stream(compute_stream):
108
  if rank == state.worker_rank:
109
  if state.gather_event is None:
 
113
  state.computed_u = u
114
  state.compute_event = torch.cuda.Event()
115
  state.compute_event.record()
 
 
116
  else:
117
  state.computed_u = None
118
  state.compute_event = None
119
 
120
 
121
  @torch.no_grad()
122
+ def _scatter(p, state, rank, comm_stream):
123
+ """
124
+ Scatter the computed_u from worker_rank to all ranks.
125
+ """
126
 
127
  with torch.cuda.stream(comm_stream):
128
  if rank == state.worker_rank:
 
130
  if state.compute_event is None:
131
  raise RuntimeError("Compute event must be set before scatter.")
132
  comm_stream.wait_event(state.compute_event)
133
+
134
+ # Clear the gathered gradient to free memory
135
+ state.gathered_grad = None
136
+
137
+ u = state.computed_u
138
  scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
139
+ scatter_list = [s.contiguous() for s in scatter_list]
140
  else:
141
  scatter_list = None
142
 
143
+ u_received = torch.empty_like(p.to_local())
144
  torch.distributed.scatter(
145
+ u_received,
146
  scatter_list=scatter_list,
147
  src=state.worker_rank,
148
  group=state.process_group,
149
  )
150
+ u_dtensor = DTensor.from_local(
151
+ u_received,
 
 
 
152
  placements=p.placements,
153
  device_mesh=p.device_mesh,
154
  )
155
+
156
+ state.scattered_u = u_dtensor
157
+ state.scatter_event = torch.cuda.Event()
158
+ state.scatter_event.record()
159
+
160
+
161
+ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
162
+ compute_stream):
163
+ """
164
+ Update sharded parameter p with the scattered_u.
165
+ Only worker_rank frees computed_u.
166
+ """
167
+ with torch.cuda.stream(compute_stream):
168
+ if state.scatter_event is None:
169
+ raise RuntimeError("Scatter event must be set before update")
170
+ compute_stream.wait_event(state.scatter_event)
171
+ if rank == state.worker_rank:
172
+ # Free computed_u
173
+ state.computed_u = None
174
+
175
+ Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
176
 
177
 
178
  def default_is_muon(x, name):
 
193
  - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
194
 
195
  Arguments:
196
+ model: The model to be optimized by Muon.
197
+ is_muon_func: A function that takes a parameter and its name, and returns whether the parameter should be optimized by Muon.
198
  lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
199
  momentum: The momentum used by the internal SGD. (0.95 is a good default)
200
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
201
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
202
+ weight_decay: The weight decay for Muon and AdamW.
203
  {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
204
  adamw_lr: The learning rate for the internal AdamW.
205
  adamw_betas: The betas for the internal AdamW.
206
  adamw_eps: The epsilon for the internal AdamW.
207
+ none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
208
+ debug: Whether to print debug information.
209
  """
210
 
211
  def __init__(
 
281
  """
282
  Get the shard mesh for a parameter p on the given rank.
283
  """
284
+ assert isinstance(
285
+ p, DTensor), "Parallel Muon only supports DTensor parameters."
286
 
287
+ if p.placements == (Shard(dim=0), ):
288
  # Case for FSDP
289
  return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
290
  elif p.placements == (Replicate(), Shard(dim=0)):
 
311
  total_flops += flops
312
 
313
  if self.debug:
314
+ print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
315
+ flush=True)
316
 
317
+ ordered_params = sorted(params,
318
+ key=lambda p: param_to_flops[id(p)],
319
+ reverse=True)
320
 
321
  round_robin = 0
322
  mesh = None
 
360
 
361
  u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
362
 
 
363
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
364
+ Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
 
 
 
 
 
365
 
366
  def _update_g(self, p, g, group, momentum):
367
  # calc update
 
376
  g = buf
377
  return g
378
 
379
+ @staticmethod
380
+ def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
381
  # apply weight decay
382
  p.data.mul_(1 - lr * weight_decay)
383
  # apply update
 
405
  p.grad = g
406
 
407
  param_to_state, ordered_params = self.init_state_and_assign_params(
408
+ params, group)
 
409
 
410
  def enqueue_gathers(start_idx, chunk_size):
411
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
412
  state = param_to_state[id(p)]
413
+ _gather(p, state, self.rank, self.comm_stream,
414
+ group["none_grad"])
415
 
416
  def enqueue_computes(start_idx, chunk_size):
417
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
418
  state = param_to_state[id(p)]
419
+ _compute_u(state, group["ns_steps"], self.rank,
420
+ self.compute_stream)
421
 
422
  def enqueue_scatters(start_idx, chunk_size):
423
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
424
+ state = param_to_state[id(p)]
425
+ _scatter(p, state, self.rank, self.comm_stream)
426
+
427
+ def enqueue_update_param(start_idx, chunk_size):
428
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
429
  state = param_to_state[id(p)]
430
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
431
+ _update_param(p, state, lr, adjusted_lr, weight_decay,
432
+ self.rank, self.compute_stream)
 
433
 
434
+ chunk_size = dist.get_world_size(param_to_state[id(
435
+ params[0])].process_group)
436
 
437
  # Wait grad update
438
  self.comm_stream.wait_stream(torch.cuda.current_stream())
 
440
  enqueue_gathers(0, chunk_size)
441
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
442
  enqueue_computes(i, chunk_size)
443
+ if i > 0:
444
+ enqueue_update_param(i - chunk_size, chunk_size)
445
  enqueue_gathers(i + chunk_size, chunk_size)
446
  enqueue_scatters(i, chunk_size)
447
+ enqueue_update_param(i, chunk_size)
448
 
449
+ # Wait for the last update_param to finish
450
+ torch.cuda.current_stream().wait_stream(self.compute_stream)
451
 
452
  def step(self, closure=None):
453
  """Perform a single optimization step.
 
482
  continue
483
  if isinstance(p.data, DTensor):
484
  if all(
485
+ isinstance(placement, Replicate)
486
+ for placement in p.placements):
487
  param_tensors.append(p)
488
  else:
489
  param_dtensors.append(p)
490
  elif isinstance(p.data, torch.Tensor):
491
  param_tensors.append(p)
492
  else:
493
+ raise TypeError(
494
+ f"Unsupported parameter type: {type(p.data)}")
495
 
496
  if self.debug:
497
  print(
 
526
  # AdamW backup #
527
  ############################
528
 
529
+ params = [
530
+ p for p in group["params"] if not self.state[p]["use_muon"]
531
+ ]
532
  lr = group["lr"]
533
  beta1, beta2 = group["adamw_betas"]
534
  eps = group["adamw_eps"]
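Note on the stream handoff between _scatter and _update_param shown above: the comm stream records a scatter_event once the received shard is wrapped, and the compute stream waits on that event before applying the update, so only the producer/consumer dependency is synchronized instead of the whole stream. A minimal sketch of the pattern, assuming a CUDA device; tensor and variable names are illustrative, not from this repo.

import torch

assert torch.cuda.is_available()

comm_stream = torch.cuda.Stream()
compute_stream = torch.cuda.Stream()

p = torch.randn(1024, 1024, device="cuda")
scatter_done = torch.cuda.Event()

with torch.cuda.stream(comm_stream):
    # Stand-in for the scatter: produce the update shard on the comm stream
    # and record an event so consumers can order against it.
    u = torch.randn_like(p)
    scatter_done.record()

with torch.cuda.stream(compute_stream):
    # Stand-in for the parameter update: wait on the event (not a full
    # stream sync) before touching the tensor produced on the comm stream.
    compute_stream.wait_event(scatter_done)
    p.mul_(1 - 0.02 * 0.1).add_(u, alpha=-0.01)

# Finally order the default stream after the compute stream, mirroring the
# wait at the end of the parallel path.
torch.cuda.current_stream().wait_stream(compute_stream)
torch.cuda.synchronize()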
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (308 Bytes)
 
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc DELETED
Binary file (23.4 kB)
 
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_2dc97a1_dirty
3
- ops = torch.ops._optimizer_2dc97a1_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_2dc97a1_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_0c12ced_dirty
3
+ ops = torch.ops._optimizer_0c12ced_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_0c12ced_dirty::{op_name}"
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_2dc97a1_dirty.abi3.so → _optimizer_0c12ced_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76910ba81e2c95c83207118725c4379db636346c4ccf05010e2ee00c41dff1ce
3
  size 1750000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dd72f3b9f513dc8bd0724fede9b668761b1d701dfdf3a294979706d803b0800
3
  size 1750000
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED
@@ -47,19 +47,27 @@ class _muon_state:
47
  # TODO: use Optional
48
  worker_rank: int | None = None
49
  gathered_grad: torch.Tensor | None = None
 
50
  computed_u: torch.Tensor | None = None
51
  gather_event: torch.cuda.Event | None = None
52
  compute_event: torch.cuda.Event | None = None
 
53
  process_group = None
54
 
55
 
56
  @torch.no_grad()
57
  def _gather(p, state, rank, comm_stream, none_grad):
 
 
 
 
58
  g = p.grad
59
 
60
  if rank == state.worker_rank:
61
  num_ranks = dist.get_world_size(group=state.process_group)
62
- gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
 
 
63
  else:
64
  gather_list = None
65
 
@@ -73,8 +81,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
73
  if rank == state.worker_rank:
74
  if state.gathered_grad is not None:
75
  raise RuntimeError(
76
- "Gather event already exists, which should not happen."
77
- )
78
  state.gathered_grad = torch.cat(gather_list, dim=0)
79
  state.gather_event = torch.cuda.Event()
80
  state.gather_event.record()
@@ -82,11 +89,21 @@ def _gather(p, state, rank, comm_stream, none_grad):
82
  state.gathered_grad = None
83
  state.gather_event = None
84
  if none_grad:
 
 
 
 
 
 
 
85
  p.grad = None
86
 
87
 
88
  @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
 
 
 
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
92
  if state.gather_event is None:
@@ -96,16 +113,16 @@ def _compute_u(state, steps, rank, compute_stream):
96
  state.computed_u = u
97
  state.compute_event = torch.cuda.Event()
98
  state.compute_event.record()
99
- # Clear the gathered gradient to free memory
100
- state.gathered_grad = None
101
  else:
102
  state.computed_u = None
103
  state.compute_event = None
104
 
105
 
106
  @torch.no_grad()
107
- def _scatter(p, state, lr, weight_decay, rank, comm_stream):
108
- u = state.computed_u
 
 
109
 
110
  with torch.cuda.stream(comm_stream):
111
  if rank == state.worker_rank:
@@ -113,27 +130,49 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
113
  if state.compute_event is None:
114
  raise RuntimeError("Compute event must be set before scatter.")
115
  comm_stream.wait_event(state.compute_event)
 
 
 
 
 
116
  scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
 
117
  else:
118
  scatter_list = None
119
 
120
- u = torch.empty_like(p.to_local())
121
  torch.distributed.scatter(
122
- u,
123
  scatter_list=scatter_list,
124
  src=state.worker_rank,
125
  group=state.process_group,
126
  )
127
- if rank == state.worker_rank:
128
- # Clear u to free memory
129
- state.computed_u = None
130
- u = DTensor.from_local(
131
- u,
132
  placements=p.placements,
133
  device_mesh=p.device_mesh,
134
  )
135
- p.data.mul_(1 - lr * weight_decay)
136
- p.data.add_(u, alpha=-lr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
 
139
  def default_is_muon(x, name):
@@ -154,17 +193,19 @@ class Muon(torch.optim.Optimizer):
154
  - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
155
 
156
  Arguments:
157
- muon_params: The parameters to be optimized by Muon.
 
158
  lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
159
  momentum: The momentum used by the internal SGD. (0.95 is a good default)
160
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
161
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
162
- adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are
163
  {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
164
  adamw_lr: The learning rate for the internal AdamW.
165
  adamw_betas: The betas for the internal AdamW.
166
  adamw_eps: The epsilon for the internal AdamW.
167
- adamw_weight_decay: The weight decay for the internal AdamW.
 
168
  """
169
 
170
  def __init__(
@@ -240,9 +281,10 @@ class Muon(torch.optim.Optimizer):
240
  """
241
  Get the shard mesh for a parameter p on the given rank.
242
  """
243
- assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
 
244
 
245
- if p.placements == (Shard(dim=0),):
246
  # Case for FSDP
247
  return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
248
  elif p.placements == (Replicate(), Shard(dim=0)):
@@ -269,11 +311,12 @@ class Muon(torch.optim.Optimizer):
269
  total_flops += flops
270
 
271
  if self.debug:
272
- print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
 
273
 
274
- ordered_params = sorted(
275
- params, key=lambda p: param_to_flops[id(p)], reverse=True
276
- )
277
 
278
  round_robin = 0
279
  mesh = None
@@ -317,14 +360,8 @@ class Muon(torch.optim.Optimizer):
317
 
318
  u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
319
 
320
- # scale update
321
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
322
-
323
- # apply weight decay
324
- p.data.mul_(1 - lr * weight_decay)
325
-
326
- # apply update
327
- p.data.add_(u, alpha=-adjusted_lr)
328
 
329
  def _update_g(self, p, g, group, momentum):
330
  # calc update
@@ -339,9 +376,8 @@ class Muon(torch.optim.Optimizer):
339
  g = buf
340
  return g
341
 
342
- def _update_p(self, p, u, lr, weight_decay):
343
- # scale update
344
- adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
345
  # apply weight decay
346
  p.data.mul_(1 - lr * weight_decay)
347
  # apply update
@@ -369,28 +405,34 @@ class Muon(torch.optim.Optimizer):
369
  p.grad = g
370
 
371
  param_to_state, ordered_params = self.init_state_and_assign_params(
372
- params, group
373
- )
374
 
375
  def enqueue_gathers(start_idx, chunk_size):
376
- for p in ordered_params[start_idx : start_idx + chunk_size]:
377
  state = param_to_state[id(p)]
378
- _gather(p, state, self.rank, self.comm_stream, group["none_grad"])
 
379
 
380
  def enqueue_computes(start_idx, chunk_size):
381
- for p in ordered_params[start_idx : start_idx + chunk_size]:
382
  state = param_to_state[id(p)]
383
- _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
 
384
 
385
  def enqueue_scatters(start_idx, chunk_size):
386
- for p in ordered_params[start_idx : start_idx + chunk_size]:
 
 
 
 
 
387
  state = param_to_state[id(p)]
388
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
389
- _scatter(
390
- p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
391
- )
392
 
393
- chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
 
394
 
395
  # Wait grad update
396
  self.comm_stream.wait_stream(torch.cuda.current_stream())
@@ -398,10 +440,14 @@ class Muon(torch.optim.Optimizer):
398
  enqueue_gathers(0, chunk_size)
399
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
400
  enqueue_computes(i, chunk_size)
 
 
401
  enqueue_gathers(i + chunk_size, chunk_size)
402
  enqueue_scatters(i, chunk_size)
 
403
 
404
- torch.cuda.current_stream().wait_stream(self.comm_stream)
 
405
 
406
  def step(self, closure=None):
407
  """Perform a single optimization step.
@@ -436,15 +482,16 @@ class Muon(torch.optim.Optimizer):
436
  continue
437
  if isinstance(p.data, DTensor):
438
  if all(
439
- isinstance(placement, Replicate) for placement in p.placements
440
- ):
441
  param_tensors.append(p)
442
  else:
443
  param_dtensors.append(p)
444
  elif isinstance(p.data, torch.Tensor):
445
  param_tensors.append(p)
446
  else:
447
- raise TypeError(f"Unsupported parameter type: {type(p.data)}")
 
448
 
449
  if self.debug:
450
  print(
@@ -479,7 +526,9 @@ class Muon(torch.optim.Optimizer):
479
  # AdamW backup #
480
  ############################
481
 
482
- params = [p for p in group["params"] if not self.state[p]["use_muon"]]
 
 
483
  lr = group["lr"]
484
  beta1, beta2 = group["adamw_betas"]
485
  eps = group["adamw_eps"]
 
47
  # TODO: use Optional
48
  worker_rank: int | None = None
49
  gathered_grad: torch.Tensor | None = None
50
+ scattered_u: DTensor | None = None
51
  computed_u: torch.Tensor | None = None
52
  gather_event: torch.cuda.Event | None = None
53
  compute_event: torch.cuda.Event | None = None
54
+ scatter_event: torch.cuda.Event | None = None
55
  process_group = None
56
 
57
 
58
  @torch.no_grad()
59
  def _gather(p, state, rank, comm_stream, none_grad):
60
+ """
61
+ Gather the gradients to worker_rank.
62
+ If none_grad is True, free p.grad after the gather.
63
+ """
64
  g = p.grad
65
 
66
  if rank == state.worker_rank:
67
  num_ranks = dist.get_world_size(group=state.process_group)
68
+ gather_list = [
69
+ torch.empty_like(g.to_local()) for _ in range(num_ranks)
70
+ ]
71
  else:
72
  gather_list = None
73
 
 
81
  if rank == state.worker_rank:
82
  if state.gathered_grad is not None:
83
  raise RuntimeError(
84
+ "Gather event already exists, which should not happen.")
 
85
  state.gathered_grad = torch.cat(gather_list, dim=0)
86
  state.gather_event = torch.cuda.Event()
87
  state.gather_event.record()
 
89
  state.gathered_grad = None
90
  state.gather_event = None
91
  if none_grad:
92
+ # We can safely free p.grad without calling record_stream:
93
+ # p.grad.to_local().record_stream(comm_stream)
94
+ # Explanation:
95
+ # 1. p.grad is created on the default stream, but the default stream
96
+ # is synchronized with the comm stream later.
97
+ # 2. There is no further activity on the default stream before the optimizer finishes.
98
+ # Therefore, it is safe to free p.grad directly on the comm stream.
99
  p.grad = None
100
 
101
 
102
  @torch.no_grad()
103
  def _compute_u(state, steps, rank, compute_stream):
104
+ """
105
+ On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
106
+ """
107
  with torch.cuda.stream(compute_stream):
108
  if rank == state.worker_rank:
109
  if state.gather_event is None:
 
113
  state.computed_u = u
114
  state.compute_event = torch.cuda.Event()
115
  state.compute_event.record()
 
 
116
  else:
117
  state.computed_u = None
118
  state.compute_event = None
119
 
120
 
121
  @torch.no_grad()
122
+ def _scatter(p, state, rank, comm_stream):
123
+ """
124
+ Scatter the computed_u from worker_rank to all ranks.
125
+ """
126
 
127
  with torch.cuda.stream(comm_stream):
128
  if rank == state.worker_rank:
 
130
  if state.compute_event is None:
131
  raise RuntimeError("Compute event must be set before scatter.")
132
  comm_stream.wait_event(state.compute_event)
133
+
134
+ # Clear the gathered gradient to free memory
135
+ state.gathered_grad = None
136
+
137
+ u = state.computed_u
138
  scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
139
+ scatter_list = [s.contiguous() for s in scatter_list]
140
  else:
141
  scatter_list = None
142
 
143
+ u_received = torch.empty_like(p.to_local())
144
  torch.distributed.scatter(
145
+ u_received,
146
  scatter_list=scatter_list,
147
  src=state.worker_rank,
148
  group=state.process_group,
149
  )
150
+ u_dtensor = DTensor.from_local(
151
+ u_received,
 
 
 
152
  placements=p.placements,
153
  device_mesh=p.device_mesh,
154
  )
155
+
156
+ state.scattered_u = u_dtensor
157
+ state.scatter_event = torch.cuda.Event()
158
+ state.scatter_event.record()
159
+
160
+
161
+ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
162
+ compute_stream):
163
+ """
164
+ Update sharded parameter p with the scattered_u.
165
+ Only worker_rank frees computed_u.
166
+ """
167
+ with torch.cuda.stream(compute_stream):
168
+ if state.scatter_event is None:
169
+ raise RuntimeError("Scatter event must be set before update")
170
+ compute_stream.wait_event(state.scatter_event)
171
+ if rank == state.worker_rank:
172
+ # Free computed_u
173
+ state.computed_u = None
174
+
175
+ Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
176
 
177
 
178
  def default_is_muon(x, name):
 
193
  - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
194
 
195
  Arguments:
196
+ model: The model to be optimized by Muon.
197
+ is_muon_func: A function that takes a parameter and its name, and returns whether the parameter should be optimized by Muon.
198
  lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
199
  momentum: The momentum used by the internal SGD. (0.95 is a good default)
200
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
201
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
202
+ weight_decay: The weight decay for Muon and AdamW.
203
  {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
204
  adamw_lr: The learning rate for the internal AdamW.
205
  adamw_betas: The betas for the internal AdamW.
206
  adamw_eps: The epsilon for the internal AdamW.
207
+ none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
208
+ debug: Whether to print debug information.
209
  """
210
 
211
  def __init__(
 
281
  """
282
  Get the shard mesh for a parameter p on the given rank.
283
  """
284
+ assert isinstance(
285
+ p, DTensor), "Parallel Muon only supports DTensor parameters."
286
 
287
+ if p.placements == (Shard(dim=0), ):
288
  # Case for FSDP
289
  return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
290
  elif p.placements == (Replicate(), Shard(dim=0)):
 
311
  total_flops += flops
312
 
313
  if self.debug:
314
+ print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
315
+ flush=True)
316
 
317
+ ordered_params = sorted(params,
318
+ key=lambda p: param_to_flops[id(p)],
319
+ reverse=True)
320
 
321
  round_robin = 0
322
  mesh = None
 
360
 
361
  u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
362
 
 
363
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
364
+ Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
 
 
 
 
 
365
 
366
  def _update_g(self, p, g, group, momentum):
367
  # calc update
 
376
  g = buf
377
  return g
378
 
379
+ @staticmethod
380
+ def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
381
  # apply weight decay
382
  p.data.mul_(1 - lr * weight_decay)
383
  # apply update
 
405
  p.grad = g
406
 
407
  param_to_state, ordered_params = self.init_state_and_assign_params(
408
+ params, group)
 
409
 
410
  def enqueue_gathers(start_idx, chunk_size):
411
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
412
  state = param_to_state[id(p)]
413
+ _gather(p, state, self.rank, self.comm_stream,
414
+ group["none_grad"])
415
 
416
  def enqueue_computes(start_idx, chunk_size):
417
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
418
  state = param_to_state[id(p)]
419
+ _compute_u(state, group["ns_steps"], self.rank,
420
+ self.compute_stream)
421
 
422
  def enqueue_scatters(start_idx, chunk_size):
423
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
424
+ state = param_to_state[id(p)]
425
+ _scatter(p, state, self.rank, self.comm_stream)
426
+
427
+ def enqueue_update_param(start_idx, chunk_size):
428
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
429
  state = param_to_state[id(p)]
430
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
431
+ _update_param(p, state, lr, adjusted_lr, weight_decay,
432
+ self.rank, self.compute_stream)
 
433
 
434
+ chunk_size = dist.get_world_size(param_to_state[id(
435
+ params[0])].process_group)
436
 
437
  # Wait grad update
438
  self.comm_stream.wait_stream(torch.cuda.current_stream())
 
440
  enqueue_gathers(0, chunk_size)
441
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
442
  enqueue_computes(i, chunk_size)
443
+ if i > 0:
444
+ enqueue_update_param(i - chunk_size, chunk_size)
445
  enqueue_gathers(i + chunk_size, chunk_size)
446
  enqueue_scatters(i, chunk_size)
447
+ enqueue_update_param(i, chunk_size)
448
 
449
+ # Wait for the last update_param to finish
450
+ torch.cuda.current_stream().wait_stream(self.compute_stream)
451
 
452
  def step(self, closure=None):
453
  """Perform a single optimization step.
 
482
  continue
483
  if isinstance(p.data, DTensor):
484
  if all(
485
+ isinstance(placement, Replicate)
486
+ for placement in p.placements):
487
  param_tensors.append(p)
488
  else:
489
  param_dtensors.append(p)
490
  elif isinstance(p.data, torch.Tensor):
491
  param_tensors.append(p)
492
  else:
493
+ raise TypeError(
494
+ f"Unsupported parameter type: {type(p.data)}")
495
 
496
  if self.debug:
497
  print(
 
526
  # AdamW backup #
527
  ############################
528
 
529
+ params = [
530
+ p for p in group["params"] if not self.state[p]["use_muon"]
531
+ ]
532
  lr = group["lr"]
533
  beta1, beta2 = group["adamw_betas"]
534
  eps = group["adamw_eps"]
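Note on the scatter_list construction above: on the worker rank the orthogonalized update is split row-wise into one slice per rank and each slice is made contiguous before torch.distributed.scatter. A minimal single-process sketch of that slicing; the shapes and rank count are illustrative.

import torch

num_ranks = 4
# Stand-in for the gathered, orthogonalized update on the worker rank.
full_u = torch.randn(16, 8)

# One row slice per rank, matching a dim-0 sharding of the parameter.
scatter_list = list(torch.split(full_u, full_u.size(0) // num_ranks, dim=0))
# torch.split returns views; collectives expect contiguous buffers,
# hence the explicit .contiguous() pass before the scatter call.
scatter_list = [s.contiguous() for s in scatter_list]

assert len(scatter_list) == num_ranks
assert all(s.shape == (4, 8) for s in scatter_list)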
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/__init__.cpython-313.pyc DELETED
Binary file (308 Bytes)
 
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/__pycache__/muon.cpython-313.pyc DELETED
Binary file (23.4 kB)
 
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_2dc97a1_dirty
3
- ops = torch.ops._optimizer_2dc97a1_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_2dc97a1_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_0c12ced_dirty
3
+ ops = torch.ops._optimizer_0c12ced_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_0c12ced_dirty::{op_name}"
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/{_optimizer_2dc97a1_dirty.abi3.so → _optimizer_0c12ced_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd0a35a6f846a075a8f4561cfc66ef17c6358dd4a0062e63057b02625d9d6af7
3
  size 1750088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a49b0225ecf27b33bbbe55936811ecf443ce97be97ccb7237b3b66eb46c0ad8
3
  size 1750088
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py CHANGED
@@ -47,19 +47,27 @@ class _muon_state:
47
  # TODO: use Optional
48
  worker_rank: int | None = None
49
  gathered_grad: torch.Tensor | None = None
 
50
  computed_u: torch.Tensor | None = None
51
  gather_event: torch.cuda.Event | None = None
52
  compute_event: torch.cuda.Event | None = None
 
53
  process_group = None
54
 
55
 
56
  @torch.no_grad()
57
  def _gather(p, state, rank, comm_stream, none_grad):
 
 
 
 
58
  g = p.grad
59
 
60
  if rank == state.worker_rank:
61
  num_ranks = dist.get_world_size(group=state.process_group)
62
- gather_list = [torch.empty_like(g.to_local()) for _ in range(num_ranks)]
 
 
63
  else:
64
  gather_list = None
65
 
@@ -73,8 +81,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
73
  if rank == state.worker_rank:
74
  if state.gathered_grad is not None:
75
  raise RuntimeError(
76
- "Gather event already exists, which should not happen."
77
- )
78
  state.gathered_grad = torch.cat(gather_list, dim=0)
79
  state.gather_event = torch.cuda.Event()
80
  state.gather_event.record()
@@ -82,11 +89,21 @@ def _gather(p, state, rank, comm_stream, none_grad):
82
  state.gathered_grad = None
83
  state.gather_event = None
84
  if none_grad:
 
 
 
 
 
 
 
85
  p.grad = None
86
 
87
 
88
  @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
 
 
 
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
92
  if state.gather_event is None:
@@ -96,16 +113,16 @@ def _compute_u(state, steps, rank, compute_stream):
96
  state.computed_u = u
97
  state.compute_event = torch.cuda.Event()
98
  state.compute_event.record()
99
- # Clear the gathered gradient to free memory
100
- state.gathered_grad = None
101
  else:
102
  state.computed_u = None
103
  state.compute_event = None
104
 
105
 
106
  @torch.no_grad()
107
- def _scatter(p, state, lr, weight_decay, rank, comm_stream):
108
- u = state.computed_u
 
 
109
 
110
  with torch.cuda.stream(comm_stream):
111
  if rank == state.worker_rank:
@@ -113,27 +130,49 @@ def _scatter(p, state, lr, weight_decay, rank, comm_stream):
113
  if state.compute_event is None:
114
  raise RuntimeError("Compute event must be set before scatter.")
115
  comm_stream.wait_event(state.compute_event)
 
 
 
 
 
116
  scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
 
117
  else:
118
  scatter_list = None
119
 
120
- u = torch.empty_like(p.to_local())
121
  torch.distributed.scatter(
122
- u,
123
  scatter_list=scatter_list,
124
  src=state.worker_rank,
125
  group=state.process_group,
126
  )
127
- if rank == state.worker_rank:
128
- # Clear u to free memory
129
- state.computed_u = None
130
- u = DTensor.from_local(
131
- u,
132
  placements=p.placements,
133
  device_mesh=p.device_mesh,
134
  )
135
- p.data.mul_(1 - lr * weight_decay)
136
- p.data.add_(u, alpha=-lr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
 
139
  def default_is_muon(x, name):
@@ -154,17 +193,19 @@ class Muon(torch.optim.Optimizer):
154
  - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
155
 
156
  Arguments:
157
- muon_params: The parameters to be optimized by Muon.
 
158
  lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
159
  momentum: The momentum used by the internal SGD. (0.95 is a good default)
160
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
161
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
162
- adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are
163
  {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
164
  adamw_lr: The learning rate for the internal AdamW.
165
  adamw_betas: The betas for the internal AdamW.
166
  adamw_eps: The epsilon for the internal AdamW.
167
- adamw_weight_decay: The weight decay for the internal AdamW.
 
168
  """
169
 
170
  def __init__(
@@ -240,9 +281,10 @@ class Muon(torch.optim.Optimizer):
240
  """
241
  Get the shard mesh for a parameter p on the given rank.
242
  """
243
- assert isinstance(p, DTensor), "Parallel Muon only supports DTensor parameters."
 
244
 
245
- if p.placements == (Shard(dim=0),):
246
  # Case for FSDP
247
  return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
248
  elif p.placements == (Replicate(), Shard(dim=0)):
@@ -269,11 +311,12 @@ class Muon(torch.optim.Optimizer):
269
  total_flops += flops
270
 
271
  if self.debug:
272
- print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs", flush=True)
 
273
 
274
- ordered_params = sorted(
275
- params, key=lambda p: param_to_flops[id(p)], reverse=True
276
- )
277
 
278
  round_robin = 0
279
  mesh = None
@@ -317,14 +360,8 @@ class Muon(torch.optim.Optimizer):
317
 
318
  u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
319
 
320
- # scale update
321
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
322
-
323
- # apply weight decay
324
- p.data.mul_(1 - lr * weight_decay)
325
-
326
- # apply update
327
- p.data.add_(u, alpha=-adjusted_lr)
328
 
329
  def _update_g(self, p, g, group, momentum):
330
  # calc update
@@ -339,9 +376,8 @@ class Muon(torch.optim.Optimizer):
339
  g = buf
340
  return g
341
 
342
- def _update_p(self, p, u, lr, weight_decay):
343
- # scale update
344
- adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
345
  # apply weight decay
346
  p.data.mul_(1 - lr * weight_decay)
347
  # apply update
@@ -369,28 +405,34 @@ class Muon(torch.optim.Optimizer):
369
  p.grad = g
370
 
371
  param_to_state, ordered_params = self.init_state_and_assign_params(
372
- params, group
373
- )
374
 
375
  def enqueue_gathers(start_idx, chunk_size):
376
- for p in ordered_params[start_idx : start_idx + chunk_size]:
377
  state = param_to_state[id(p)]
378
- _gather(p, state, self.rank, self.comm_stream, group["none_grad"])
 
379
 
380
  def enqueue_computes(start_idx, chunk_size):
381
- for p in ordered_params[start_idx : start_idx + chunk_size]:
382
  state = param_to_state[id(p)]
383
- _compute_u(state, group["ns_steps"], self.rank, self.compute_stream)
 
384
 
385
  def enqueue_scatters(start_idx, chunk_size):
386
- for p in ordered_params[start_idx : start_idx + chunk_size]:
 
 
 
 
 
387
  state = param_to_state[id(p)]
388
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
389
- _scatter(
390
- p, state, adjusted_lr, weight_decay, self.rank, self.comm_stream
391
- )
392
 
393
- chunk_size = dist.get_world_size(param_to_state[id(params[0])].process_group)
 
394
 
395
  # Wait grad update
396
  self.comm_stream.wait_stream(torch.cuda.current_stream())
@@ -398,10 +440,14 @@ class Muon(torch.optim.Optimizer):
398
  enqueue_gathers(0, chunk_size)
399
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
400
  enqueue_computes(i, chunk_size)
 
 
401
  enqueue_gathers(i + chunk_size, chunk_size)
402
  enqueue_scatters(i, chunk_size)
 
403
 
404
- torch.cuda.current_stream().wait_stream(self.comm_stream)
 
405
 
406
  def step(self, closure=None):
407
  """Perform a single optimization step.
@@ -436,15 +482,16 @@ class Muon(torch.optim.Optimizer):
436
  continue
437
  if isinstance(p.data, DTensor):
438
  if all(
439
- isinstance(placement, Replicate) for placement in p.placements
440
- ):
441
  param_tensors.append(p)
442
  else:
443
  param_dtensors.append(p)
444
  elif isinstance(p.data, torch.Tensor):
445
  param_tensors.append(p)
446
  else:
447
- raise TypeError(f"Unsupported parameter type: {type(p.data)}")
 
448
 
449
  if self.debug:
450
  print(
@@ -479,7 +526,9 @@ class Muon(torch.optim.Optimizer):
479
  # AdamW backup #
480
  ############################
481
 
482
- params = [p for p in group["params"] if not self.state[p]["use_muon"]]
 
 
483
  lr = group["lr"]
484
  beta1, beta2 = group["adamw_betas"]
485
  eps = group["adamw_eps"]
 
47
  # TODO: use Optional
48
  worker_rank: int | None = None
49
  gathered_grad: torch.Tensor | None = None
50
+ scattered_u: DTensor | None = None
51
  computed_u: torch.Tensor | None = None
52
  gather_event: torch.cuda.Event | None = None
53
  compute_event: torch.cuda.Event | None = None
54
+ scatter_event: torch.cuda.Event | None = None
55
  process_group = None
56
 
57
 
58
  @torch.no_grad()
59
  def _gather(p, state, rank, comm_stream, none_grad):
60
+ """
61
+ Gather the gradients to worker_rank.
62
+ If none_grad is True, free p.grad after the gather.
63
+ """
64
  g = p.grad
65
 
66
  if rank == state.worker_rank:
67
  num_ranks = dist.get_world_size(group=state.process_group)
68
+ gather_list = [
69
+ torch.empty_like(g.to_local()) for _ in range(num_ranks)
70
+ ]
71
  else:
72
  gather_list = None
73
 
 
81
  if rank == state.worker_rank:
82
  if state.gathered_grad is not None:
83
  raise RuntimeError(
84
+ "Gather event already exists, which should not happen.")
 
85
  state.gathered_grad = torch.cat(gather_list, dim=0)
86
  state.gather_event = torch.cuda.Event()
87
  state.gather_event.record()
 
89
  state.gathered_grad = None
90
  state.gather_event = None
91
  if none_grad:
92
+ # We can safely free p.grad without calling record_stream:
93
+ # p.grad.to_local().record_stream(comm_stream)
94
+ # Explanation:
95
+ # 1. p.grad is created on the default stream, but the default stream
96
+ # is synchronized with the comm stream later.
97
+ # 2. There is no further activity on the default stream before the optimizer finishes.
98
+ # Therefore, it is safe to free p.grad directly on the comm stream.
99
  p.grad = None
100
 
101
 
102
  @torch.no_grad()
103
  def _compute_u(state, steps, rank, compute_stream):
104
+ """
105
+ On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
106
+ """
107
  with torch.cuda.stream(compute_stream):
108
  if rank == state.worker_rank:
109
  if state.gather_event is None:
 
113
  state.computed_u = u
114
  state.compute_event = torch.cuda.Event()
115
  state.compute_event.record()
 
 
116
  else:
117
  state.computed_u = None
118
  state.compute_event = None
119
 
120
 
121
  @torch.no_grad()
122
+ def _scatter(p, state, rank, comm_stream):
123
+ """
124
+ Scatter the computed_u from worker_rank to all ranks.
125
+ """
126
 
127
  with torch.cuda.stream(comm_stream):
128
  if rank == state.worker_rank:
 
130
  if state.compute_event is None:
131
  raise RuntimeError("Compute event must be set before scatter.")
132
  comm_stream.wait_event(state.compute_event)
133
+
134
+ # Clear the gathered gradient to free memory
135
+ state.gathered_grad = None
136
+
137
+ u = state.computed_u
138
  scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
139
+ scatter_list = [s.contiguous() for s in scatter_list]
140
  else:
141
  scatter_list = None
142
 
143
+ u_received = torch.empty_like(p.to_local())
144
  torch.distributed.scatter(
145
+ u_received,
146
  scatter_list=scatter_list,
147
  src=state.worker_rank,
148
  group=state.process_group,
149
  )
150
+ u_dtensor = DTensor.from_local(
151
+ u_received,
 
 
 
152
  placements=p.placements,
153
  device_mesh=p.device_mesh,
154
  )
155
+
156
+ state.scattered_u = u_dtensor
157
+ state.scatter_event = torch.cuda.Event()
158
+ state.scatter_event.record()
159
+
160
+
161
+ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
162
+ compute_stream):
163
+ """
164
+ Update sharded parameter p with the scattered_u.
165
+ Only worker_rank frees computed_u.
166
+ """
167
+ with torch.cuda.stream(compute_stream):
168
+ if state.scatter_event is None:
169
+ raise RuntimeError("Scatter event must be set before update")
170
+ compute_stream.wait_event(state.scatter_event)
171
+ if rank == state.worker_rank:
172
+ # Free computed_u
173
+ state.computed_u = None
174
+
175
+ Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
176
 
177
 
178
  def default_is_muon(x, name):
 
193
  - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
194
 
195
  Arguments:
196
+ model: The model to be optimized by Muon.
197
+ is_muon_func: A function that takes a parameter and its name, and returns whether the parameter should be optimized by Muon.
198
  lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
199
  momentum: The momentum used by the internal SGD. (0.95 is a good default)
200
  nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
201
  ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
202
+ weight_decay: The weight decay for Muon and AdamW.
203
  {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
204
  adamw_lr: The learning rate for the internal AdamW.
205
  adamw_betas: The betas for the internal AdamW.
206
  adamw_eps: The epsilon for the internal AdamW.
207
+ none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
208
+ debug: Whether to print debug information.
209
  """
210
 
211
  def __init__(
 
281
  """
282
  Get the shard mesh for a parameter p on the given rank.
283
  """
284
+ assert isinstance(
285
+ p, DTensor), "Parallel Muon only supports DTensor parameters."
286
 
287
+ if p.placements == (Shard(dim=0), ):
288
  # Case for FSDP
289
  return p.device_mesh.mesh, p.device_mesh.get_group(mesh_dim=0)
290
  elif p.placements == (Replicate(), Shard(dim=0)):
 
311
  total_flops += flops
312
 
313
  if self.debug:
314
+ print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
315
+ flush=True)
316
 
317
+ ordered_params = sorted(params,
318
+ key=lambda p: param_to_flops[id(p)],
319
+ reverse=True)
320
 
321
  round_robin = 0
322
  mesh = None
 
360
 
361
  u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
362
 
 
363
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
364
+ Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
 
 
 
 
 
365
 
366
  def _update_g(self, p, g, group, momentum):
367
  # calc update
 
376
  g = buf
377
  return g
378
 
379
+ @staticmethod
380
+ def _update_p(p, u, lr, adjusted_lr, weight_decay):
 
381
  # apply weight decay
382
  p.data.mul_(1 - lr * weight_decay)
383
  # apply update
 
405
  p.grad = g
406
 
407
  param_to_state, ordered_params = self.init_state_and_assign_params(
408
+ params, group)
 
409
 
410
  def enqueue_gathers(start_idx, chunk_size):
411
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
412
  state = param_to_state[id(p)]
413
+ _gather(p, state, self.rank, self.comm_stream,
414
+ group["none_grad"])
415
 
416
  def enqueue_computes(start_idx, chunk_size):
417
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
418
  state = param_to_state[id(p)]
419
+ _compute_u(state, group["ns_steps"], self.rank,
420
+ self.compute_stream)
421
 
422
  def enqueue_scatters(start_idx, chunk_size):
423
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
424
+ state = param_to_state[id(p)]
425
+ _scatter(p, state, self.rank, self.comm_stream)
426
+
427
+ def enqueue_update_param(start_idx, chunk_size):
428
+ for p in ordered_params[start_idx:start_idx + chunk_size]:
429
  state = param_to_state[id(p)]
430
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
431
+ _update_param(p, state, lr, adjusted_lr, weight_decay,
432
+ self.rank, self.compute_stream)
 
433
 
434
+ chunk_size = dist.get_world_size(param_to_state[id(
435
+ params[0])].process_group)
436
 
437
  # Wait grad update
438
  self.comm_stream.wait_stream(torch.cuda.current_stream())
 
440
  enqueue_gathers(0, chunk_size)
441
  for i in range(0, len(params) + chunk_size - 1, chunk_size):
442
  enqueue_computes(i, chunk_size)
443
+ if i > 0:
444
+ enqueue_update_param(i - chunk_size, chunk_size)
445
  enqueue_gathers(i + chunk_size, chunk_size)
446
  enqueue_scatters(i, chunk_size)
447
+ enqueue_update_param(i, chunk_size)
448
 
449
+ # Wait for the last update_param to finish
450
+ torch.cuda.current_stream().wait_stream(self.compute_stream)
451
 
452
  def step(self, closure=None):
453
  """Perform a single optimization step.
 
482
  continue
483
  if isinstance(p.data, DTensor):
484
  if all(
485
+ isinstance(placement, Replicate)
486
+ for placement in p.placements):
487
  param_tensors.append(p)
488
  else:
489
  param_dtensors.append(p)
490
  elif isinstance(p.data, torch.Tensor):
491
  param_tensors.append(p)
492
  else:
493
+ raise TypeError(
494
+ f"Unsupported parameter type: {type(p.data)}")
495
 
496
  if self.debug:
497
  print(
 
526
  # AdamW backup #
527
  ############################
528
 
529
+ params = [
530
+ p for p in group["params"] if not self.state[p]["use_muon"]
531
+ ]
532
  lr = group["lr"]
533
  beta1, beta2 = group["adamw_betas"]
534
  eps = group["adamw_eps"]
torch-ext/optimizer/muon.py CHANGED
@@ -47,14 +47,20 @@ class _muon_state:
47
  # TODO: use Optional
48
  worker_rank: int | None = None
49
  gathered_grad: torch.Tensor | None = None
 
50
  computed_u: torch.Tensor | None = None
51
  gather_event: torch.cuda.Event | None = None
52
  compute_event: torch.cuda.Event | None = None
 
53
  process_group = None
54
 
55
 
56
  @torch.no_grad()
57
  def _gather(p, state, rank, comm_stream, none_grad):
 
 
 
 
58
  g = p.grad
59
 
60
  if rank == state.worker_rank:
@@ -83,12 +89,21 @@ def _gather(p, state, rank, comm_stream, none_grad):
83
  state.gathered_grad = None
84
  state.gather_event = None
85
  if none_grad:
86
- p.grad.record_stream(comm_stream)
 
 
 
 
 
 
87
  p.grad = None
88
 
89
 
90
  @torch.no_grad()
91
  def _compute_u(state, steps, rank, compute_stream):
 
 
 
92
  with torch.cuda.stream(compute_stream):
93
  if rank == state.worker_rank:
94
  if state.gather_event is None:
@@ -98,16 +113,16 @@ def _compute_u(state, steps, rank, compute_stream):
98
  state.computed_u = u
99
  state.compute_event = torch.cuda.Event()
100
  state.compute_event.record()
101
- # Clear the gathered gradient to free memory
102
- state.gathered_grad.record_stream(compute_stream)
103
- state.gathered_grad = None
104
  else:
105
  state.computed_u = None
106
  state.compute_event = None
107
 
108
 
109
  @torch.no_grad()
110
- def _scatter(p, state, lr, adjusted_lr, weight_decay, rank, comm_stream):
 
 
 
111
 
112
  with torch.cuda.stream(comm_stream):
113
  if rank == state.worker_rank:
@@ -115,6 +130,10 @@ def _scatter(p, state, lr, adjusted_lr, weight_decay, rank, comm_stream):
115
  if state.compute_event is None:
116
  raise RuntimeError("Compute event must be set before scatter.")
117
  comm_stream.wait_event(state.compute_event)
 
 
 
 
118
  u = state.computed_u
119
  scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
120
  scatter_list = [s.contiguous() for s in scatter_list]
@@ -128,18 +147,32 @@ def _scatter(p, state, lr, adjusted_lr, weight_decay, rank, comm_stream):
128
  src=state.worker_rank,
129
  group=state.process_group,
130
  )
131
- if rank == state.worker_rank:
132
- # Clear u to free memory
133
- state.computed_u.record_stream(comm_stream)
134
- state.computed_u = None
135
-
136
  u_dtensor = DTensor.from_local(
137
  u_received,
138
  placements=p.placements,
139
  device_mesh=p.device_mesh,
140
  )
141
- p.data.mul_(1 - lr * weight_decay)
142
- p.data.add_(u_dtensor, alpha=-adjusted_lr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
 
145
  def default_is_muon(x, name):
@@ -160,17 +193,19 @@ class Muon(torch.optim.Optimizer):
     - We believe it may not work well for finetuning pretrained models, but we haven't tested this.

     Arguments:
-        muon_params: The parameters to be optimized by Muon.
+        model: The model to be optimized by Muon.
+        is_muon_func: A function that takes a parameter and its name, and returns whether the parameter should be optimized by Muon.
         lr: The learning rate. The updates will have spectral norm of `lr`. (0.02 is a good default)
         momentum: The momentum used by the internal SGD. (0.95 is a good default)
         nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
         ns_steps: The number of Newton-Schulz iterations to run. (6 is probably always enough)
-        adamw_params: The parameters to be optimized by AdamW. Any parameters in `muon_params` which are
+        weight_decay: The weight decay for Muon and AdamW.
         {0, 1}-D or are detected as being the embed or lm_head will be optimized by AdamW as well.
         adamw_lr: The learning rate for the internal AdamW.
         adamw_betas: The betas for the internal AdamW.
         adamw_eps: The epsilon for the internal AdamW.
-        adamw_weight_decay: The weight decay for the internal AdamW.
+        none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
+        debug: Whether to print debug information.
     """

     def __init__(
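A minimal construction sketch based on the documented arguments (hypothetical call: the full __init__ signature and its defaults are not shown in this diff, and the weight_decay value below is only a placeholder):

optimizer = Muon(
    model,                         # parameters are taken from the model
    is_muon_func=default_is_muon,  # route eligible weights to Muon, the rest to AdamW
    lr=0.02,                       # "0.02 is a good default"
    momentum=0.95,                 # "0.95 is a good default"
    ns_steps=6,
    weight_decay=0.01,             # shared by Muon and AdamW after this change
    none_grad=True,                # free p.grad after gathering to save memory
)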
@@ -325,14 +360,8 @@ class Muon(torch.optim.Optimizer):

             u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])

-            # scale update
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
-
-            # apply weight decay
-            p.data.mul_(1 - lr * weight_decay)
-
-            # apply update
-            p.data.add_(u, alpha=-adjusted_lr)
+            Muon._update_p(p, u, lr, adjusted_lr, weight_decay)

     def _update_g(self, p, g, group, momentum):
         # calc update
@@ -347,9 +376,8 @@ class Muon(torch.optim.Optimizer):
             g = buf
         return g

-    def _update_p(self, p, u, lr, weight_decay):
-        # scale update
-        adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
+    @staticmethod
+    def _update_p(p, u, lr, adjusted_lr, weight_decay):
         # apply weight decay
         p.data.mul_(1 - lr * weight_decay)
         # apply update
@@ -392,11 +420,16 @@ class Muon(torch.optim.Optimizer):
                                 self.compute_stream)

         def enqueue_scatters(start_idx, chunk_size):
+            for p in ordered_params[start_idx:start_idx + chunk_size]:
+                state = param_to_state[id(p)]
+                _scatter(p, state, self.rank, self.comm_stream)
+
+        def enqueue_update_param(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
                 state = param_to_state[id(p)]
                 adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
-                _scatter(p, state, lr, adjusted_lr, weight_decay, self.rank,
-                         self.comm_stream)
+                _update_param(p, state, lr, adjusted_lr, weight_decay,
+                              self.rank, self.compute_stream)

         chunk_size = dist.get_world_size(param_to_state[id(
             params[0])].process_group)
@@ -407,10 +440,14 @@ class Muon(torch.optim.Optimizer):
         enqueue_gathers(0, chunk_size)
         for i in range(0, len(params) + chunk_size - 1, chunk_size):
             enqueue_computes(i, chunk_size)
+            if i > 0:
+                enqueue_update_param(i - chunk_size, chunk_size)
             enqueue_gathers(i + chunk_size, chunk_size)
             enqueue_scatters(i, chunk_size)
+        enqueue_update_param(i, chunk_size)

-        torch.cuda.current_stream().wait_stream(self.comm_stream)
+        # Wait the last update_param to finish
+        torch.cuda.current_stream().wait_stream(self.compute_stream)

     def step(self, closure=None):
         """Perform a single optimization step.
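The scheduling change above adds one more stage of software pipelining: parameter updates for chunk i - 1 are enqueued while the gathers for chunk i + 1 and the scatters for chunk i are still in flight, and a final enqueue_update_param drains the last chunk. A restatement of the loop with explanatory comments (illustrative; run_pipeline is a made-up wrapper around the closures defined in the hunk above):

def run_pipeline(params, chunk_size):
    enqueue_gathers(0, chunk_size)                             # prefetch the first chunk
    for i in range(0, len(params) + chunk_size - 1, chunk_size):
        enqueue_computes(i, chunk_size)                        # Newton-Schulz for chunk i
        if i > 0:
            enqueue_update_param(i - chunk_size, chunk_size)   # finish the previous chunk
        enqueue_gathers(i + chunk_size, chunk_size)            # prefetch the next chunk
        enqueue_scatters(i, chunk_size)                        # ship chunk i back to its ranks
    enqueue_update_param(i, chunk_size)                        # drain the last chunk
    # The caller then blocks the default stream on the compute stream, so every
    # parameter update has been applied before step() returns.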