wyldecat committed
Commit f7faa93 · 1 Parent(s): b0f46c7

feat(muon): add tuned abc values & bfloat16 communication
Files changed (32)
  1. build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  2. build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_20250911094409.abi3.so → _optimizer_ee6ed44_dirty.abi3.so} +1 -1
  3. build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +158 -50
  4. build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  5. build/{torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so → torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_ee6ed44_dirty.abi3.so} +1 -1
  6. build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +158 -50
  7. build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  8. build/torch27-cxx11-cu128-x86_64-linux/optimizer/{_optimizer_20250911094409.abi3.so → _optimizer_ee6ed44_dirty.abi3.so} +2 -2
  9. build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py +158 -50
  10. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
  11. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_20250911094409.abi3.so → _optimizer_ee6ed44_dirty.abi3.so} +2 -2
  12. build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py +158 -50
  13. build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  14. build/{torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_ee6ed44_dirty.abi3.so} +1 -1
  15. build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py +158 -50
  16. build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  17. build/torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so +0 -3
  18. build/torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_ee6ed44_dirty.abi3.so +3 -0
  19. build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py +158 -50
  20. build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py +3 -3
  21. build/torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so +0 -3
  22. build/torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_ee6ed44_dirty.abi3.so +3 -0
  23. build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py +158 -50
  24. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
  25. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so +0 -3
  26. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_ee6ed44_dirty.abi3.so +3 -0
  27. build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py +158 -50
  28. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py +3 -3
  29. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so +0 -3
  30. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_ee6ed44_dirty.abi3.so +3 -0
  31. build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py +158 -50
  32. torch-ext/optimizer/muon.py +158 -50
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_20250911094409
-ops = torch.ops._optimizer_20250911094409
+from . import _optimizer_ee6ed44_dirty
+ops = torch.ops._optimizer_ee6ed44_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_20250911094409::{op_name}"
+    return f"_optimizer_ee6ed44_dirty::{op_name}"
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_20250911094409.abi3.so → _optimizer_ee6ed44_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:48cd88108696ba8ed7487e637b785445bb5ff6075a3ae0c15355698958ad340a
+oid sha256:55f17ad6ecdd22d84ea5b776a317fa9fbb6b81f622fa8fc80b78e0ef80bd4ea6
 size 1787376
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED
@@ -2,6 +2,7 @@ import logging
 import math
 import types
 from dataclasses import dataclass
+from typing import Optional, Union, cast
 
 import torch
 import torch.distributed as dist
@@ -12,6 +13,8 @@ logger = logging.getLogger(__name__)
 
 # This code snippet is a modified version adapted from the following GitHub repositories:
 # https://github.com/KellerJordan/Muon/blob/master/muon.py
+# Muon's Newton–Schulz iteration causes high variance in singular values
+# Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
 @torch.no_grad()
 def _zeropower_via_newtonschulz5(G, steps):
     """
@@ -24,15 +27,21 @@ def _zeropower_via_newtonschulz5(G, steps):
     performance at all relative to UV^T, where USV^T = G is the SVD.
     """
     assert len(G.shape) == 2
-    a, b, c = (3.4445, -4.7750, 2.0315)
+    assert G.dtype == torch.bfloat16
     X = G  # no manual typecast
+
     if G.size(0) > G.size(1):
         X = X.T
     # Ensure spectral norm is at most 1
     X = X / (X.norm() + 1e-7)
-    X = X.bfloat16()
     # Perform the NS iterations
-    for _ in range(steps):
+    for a, b, c in [
+        (4.0848, -6.8946, 2.9270),
+        (3.9505, -6.3029, 2.6377),
+        (3.7418, -5.5913, 2.3037),
+        (2.8769, -3.1427, 1.2046),
+        (2.8366, -3.0525, 1.2012),
+    ]:
         A = X @ X.T
         # B = (
         #     b * A + c * A @ A
@@ -43,7 +52,7 @@ def _zeropower_via_newtonschulz5(G, steps):
 
     if G.size(0) > G.size(1):
         X = X.T
-    return X.to(G.dtype)
+    return X
 
 
 @dataclass
@@ -65,17 +74,19 @@ def _gather(p, state, rank, comm_stream, none_grad):
     Gather the gradients to worker_rank.
     If none_grad is True, free p.grad after the gather.
     """
-    g = p.grad
-
-    if rank == state.worker_rank:
-        num_ranks = dist.get_world_size(group=state.process_group)
-        gather_list = [
-            torch.empty_like(g.to_local()) for _ in range(num_ranks)
-        ]
-    else:
-        gather_list = None
-
     with torch.cuda.stream(comm_stream):
+        g = p.grad
+
+        if rank == state.worker_rank:
+            num_ranks = dist.get_world_size(group=state.process_group)
+            gather_list = [
+                torch.empty_like(g.to_local(), dtype=torch.bfloat16)
+                for _ in range(num_ranks)
+            ]
+        else:
+            gather_list = None
+
+        g = g.to(torch.bfloat16)
         torch.distributed.gather(
             g.to_local(),
             dst=state.worker_rank,
@@ -92,6 +103,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
         else:
             state.gathered_grad = None
             state.gather_event = None
+        gather_list = None
        if none_grad:
             # We can safely free p.grad without calling record_stream:
             # p.grad.to_local().record_stream(comm_stream)
@@ -104,7 +116,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
 
 
 @torch.no_grad()
-def _compute_u(state, steps, rank, compute_stream):
+def _compute_u(p, state, steps, rank, compute_stream):
     """
     On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
     """
@@ -115,11 +127,11 @@ def _compute_u(p, state, steps, rank, compute_stream):
         compute_stream.wait_event(state.gather_event)
         u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
         state.computed_u = u
-        state.compute_event = torch.cuda.Event()
-        state.compute_event.record()
-    else:
-        state.computed_u = None
-        state.compute_event = None
+        state.scattered_u = torch.empty_like(p.to_local(),
+                                             dtype=torch.bfloat16)
+        state.compute_event = torch.cuda.Event()
+        state.compute_event.record()
+        u = None
 
 
 @torch.no_grad()
@@ -129,12 +141,12 @@ def _scatter(p, state, rank, comm_stream):
     """
 
     with torch.cuda.stream(comm_stream):
+        if state.compute_event is None:
+            raise RuntimeError("Compute event must be set before scatter.")
+        comm_stream.wait_event(state.compute_event)
+
         if rank == state.worker_rank:
             num_ranks = dist.get_world_size(group=state.process_group)
-            if state.compute_event is None:
-                raise RuntimeError("Compute event must be set before scatter.")
-            comm_stream.wait_event(state.compute_event)
-
             # Clear the gathered gradient to free memory
             state.gathered_grad = None
 
@@ -144,22 +156,15 @@ def _scatter(p, state, rank, comm_stream):
         else:
             scatter_list = None
 
-        u_received = torch.empty_like(p.to_local())
         torch.distributed.scatter(
-            u_received,
+            state.scattered_u,
             scatter_list=scatter_list,
             src=state.worker_rank,
             group=state.process_group,
         )
-        u_dtensor = DTensor.from_local(
-            u_received,
-            placements=p.placements,
-            device_mesh=p.device_mesh,
-        )
-
-        state.scattered_u = u_dtensor
         state.scatter_event = torch.cuda.Event()
         state.scatter_event.record()
+        scatter_list = None
 
 
 def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -172,11 +177,21 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
     if state.scatter_event is None:
         raise RuntimeError("Scatter event must be set before update")
     compute_stream.wait_event(state.scatter_event)
+    u_dtensor = DTensor.from_local(
+        state.scattered_u,
+        placements=p.placements,
+        device_mesh=p.device_mesh,
+    )
+
+    state.scattered_u = u_dtensor
+
     if rank == state.worker_rank:
         # Free computed_u
         state.computed_u = None
 
     Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
+    state.scattered_u = None
+    u_dtensor = None
 
 
 def default_is_muon(name, x):
@@ -375,7 +390,8 @@ class Muon(torch.optim.Optimizer):
             else:
                 g = buf
 
-            u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
+            u = _zeropower_via_newtonschulz5(g.bfloat16(),
+                                             steps=group["ns_steps"])
 
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
@@ -433,7 +449,7 @@ class Muon(torch.optim.Optimizer):
         def enqueue_computes(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
                 state = param_to_state[id(p)]
-                _compute_u(state, group["ns_steps"], self.rank,
+                _compute_u(p, state, group["ns_steps"], self.rank,
                            self.compute_stream)
 
         def enqueue_scatters(start_idx, chunk_size):
@@ -466,6 +482,77 @@ class Muon(torch.optim.Optimizer):
         # Wait the last update_param to finish
         torch.cuda.current_stream().wait_stream(self.compute_stream)
 
+    @staticmethod
+    def _fused_adamw(
+        params: list[torch.Tensor],
+        grads: list[torch.Tensor],
+        exp_avgs: list[torch.Tensor],
+        exp_avg_sqs: list[torch.Tensor],
+        max_exp_avg_sqs: list[torch.Tensor],
+        state_steps: list[torch.Tensor],
+        amsgrad: bool,
+        beta1: float,
+        beta2: float,
+        lr: Union[float, torch.Tensor],
+        weight_decay: float,
+        eps: float,
+        maximize: bool,
+    ) -> None:
+        if not params:
+            return
+
+        # We only shuffle around the lr when it is a Tensor and on CUDA, otherwise, we prefer
+        # treating it as a scalar.
+        lr_dict: Optional[DeviceDict] = ({
+            lr.device: lr
+        } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
+                                         None)
+        grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
+            [
+                params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
+                state_steps
+            ]  # type: ignore[list-item]
+        )
+        for (device, _), (
+            (
+                device_params_,
+                device_grads_,
+                device_exp_avgs_,
+                device_exp_avg_sqs_,
+                device_max_exp_avg_sqs,
+                device_state_steps_,
+            ),
+            _,
+        ) in grouped_tensors.items():
+            device_params = cast(list[torch.Tensor], device_params_)
+            device_grads = cast(list[torch.Tensor], device_grads_)
+            device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
+            device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
+            device_state_steps = cast(list[torch.Tensor], device_state_steps_)
+
+            if lr_dict is not None and device not in lr_dict:
+                lr_dict[device] = lr.to(
+                    device=device,
+                    non_blocking=True)  # type: ignore[union-attr]
+                lr = lr_dict[device]
+            torch._foreach_add_(device_state_steps, 1)
+            func = torch._fused_adamw_
+            func(
+                device_params,
+                device_grads,
+                device_exp_avgs,
+                device_exp_avg_sqs,
+                device_max_exp_avg_sqs,  # type: ignore[arg-type]
+                device_state_steps,
+                amsgrad=amsgrad,
+                lr=lr,  # type: ignore[arg-type]
+                beta1=beta1,
+                beta2=beta2,
+                weight_decay=weight_decay,
+                eps=eps,
+                maximize=maximize,
+            )
+
     def step(self, closure=None):
         """Perform a single optimization step.
 
@@ -542,6 +629,12 @@ class Muon(torch.optim.Optimizer):
             # AdamW backup #
             ############################
 
+            params_with_grads = []
+            grads = []
+            moment1 = []
+            moment2 = []
+            max_exp_avg_sqs = []
+            state_steps = []
             lr = group["lr"]
             beta1, beta2 = group["adamw_betas"]
             eps = group["adamw_eps"]
@@ -552,23 +645,38 @@ class Muon(torch.optim.Optimizer):
                 if g is None:
                     continue
                 state = self.state[p]
+                params_with_grads.append(p)
+                grads.append(g)
                 if "step" not in state:
-                    state["step"] = 0
+                    state["step"] = (torch.zeros((),
+                                                 dtype=torch.float32,
+                                                 device=p.device))
                     state["moment1"] = torch.zeros_like(g)
                     state["moment2"] = torch.zeros_like(g)
-                state["step"] += 1
-                step = state["step"]
-                buf1 = state["moment1"]
-                buf2 = state["moment2"]
-                buf1.lerp_(g, 1 - beta1)
-                buf2.lerp_(g.square(), 1 - beta2)
-
-                g = buf1 / (eps + buf2.sqrt())
-
-                bias_correction1 = 1 - beta1**step
-                bias_correction2 = 1 - beta2**step
-                scale = bias_correction1 / bias_correction2**0.5
-                p.data.mul_(1 - lr * weight_decay)
-                p.data.add_(g, alpha=-lr / scale)
+                moment1.append(state["moment1"])
+                moment2.append(state["moment2"])
+                if not isinstance(state["step"], torch.Tensor):
+                    step_tensor = torch.tensor(state["step"],
+                                               dtype=torch.float32,
+                                               device=p.device)
+                else:
+                    step_tensor = state["step"]
+                state_steps.append(step_tensor)
+
+            self._fused_adamw(
+                params_with_grads,
+                grads,
+                moment1,
+                moment2,
+                max_exp_avg_sqs,
+                state_steps,
+                amsgrad=False,
+                beta1=beta1,
+                beta2=beta2,
+                lr=lr,
+                weight_decay=weight_decay,
+                eps=eps,
+                maximize=False,
+            )
 
         return loss
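Note on the "tuned abc values" half of the patch: the fixed Newton-Schulz triple (3.4445, -4.7750, 2.0315) is replaced by one tuned (a, b, c) per iteration, pinning the iteration count at 5. Below is a minimal standalone sketch of the patched routine, assembled from the hunks above. Two assumptions: the inner quintic update (B = b*A + c*A@A, then X = a*X + B@X) is taken from the upstream KellerJordan/Muon code that the diff leaves as unchanged context, and the now-unused steps argument is dropped here for clarity even though the real function keeps it.

# Sketch only; names other than the coefficients are illustrative.
import torch

@torch.no_grad()
def ns_orthogonalize(G: torch.Tensor) -> torch.Tensor:
    coeffs = [  # one tuned (a, b, c) triple per Newton-Schulz iteration
        (4.0848, -6.8946, 2.9270),
        (3.9505, -6.3029, 2.6377),
        (3.7418, -5.5913, 2.3037),
        (2.8769, -3.1427, 1.2046),
        (2.8366, -3.0525, 1.2012),
    ]
    assert G.ndim == 2 and G.dtype == torch.bfloat16  # callers now cast to bf16 up front
    X = G.T if G.size(0) > G.size(1) else G  # work on the wide orientation
    X = X / (X.norm() + 1e-7)                # bound the spectral norm near 1
    for a, b, c in coeffs:
        A = X @ X.T
        B = b * A + c * A @ A                # quintic update, as in upstream Muon
        X = a * X + B @ X
    return X.T if G.size(0) > G.size(1) else X  # stays in bf16; no cast back

Because the old return X.to(G.dtype) is gone, the result stays in bfloat16, which is why the non-distributed path in the diff now calls _zeropower_via_newtonschulz5(g.bfloat16(), ...).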
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
(Identical change to build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py above: the op namespace is renamed from _optimizer_20250911094409 to _optimizer_ee6ed44_dirty.)
build/{torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so → torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_ee6ed44_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cebddf4b9cb794ad3cd7b88affd011160f7fb9a16257fcb4d942604839b31b37
+oid sha256:f37c80a535a081e997c1973902a010c48b33ca40085a7f267a5278e56cff26f3
 size 1824264
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
(Identical patch to build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py above; every build variant's muon.py receives the same +158/-50 change.)
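Note on the bfloat16 communication half of the patch: gradients are cast to bf16 before the gather, and the scatter writes into a bf16 buffer preallocated in _compute_u, roughly halving gather/scatter traffic versus fp32. A stripped-down sketch of the pattern follows; it is a hypothetical standalone helper that omits the dedicated CUDA streams, events, and DTensor to_local/from_local handling of the real code, and assumes plain tensors row-sharded across ranks for concreteness.

# Sketch only; assumes torch.distributed is already initialized.
import torch
import torch.distributed as dist

def bf16_roundtrip(p: torch.nn.Parameter, worker_rank: int, orthogonalize, group=None):
    rank = dist.get_rank(group)
    world = dist.get_world_size(group)

    # Gather: every rank ships its gradient shard in bfloat16.
    g = p.grad.to(torch.bfloat16)
    gather_list = ([torch.empty_like(g) for _ in range(world)]
                   if rank == worker_rank else None)
    dist.gather(g, gather_list=gather_list, dst=worker_rank, group=group)

    # Compute on the worker, then split the result back into per-rank shards.
    if rank == worker_rank:
        full = orthogonalize(torch.cat(gather_list, dim=0))
        scatter_list = list(full.chunk(world, dim=0))
    else:
        scatter_list = None

    # Scatter into a preallocated bf16 buffer (state.scattered_u in the patch).
    u = torch.empty_like(g)
    dist.scatter(u, scatter_list=scatter_list, src=worker_rank, group=group)
    return u

In the real code the DTensor.from_local wrapping now happens later, in _update_param, so the scatter target can be the plain preallocated bf16 buffer.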
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
(Identical namespace rename to the _ops.py diffs above.)
build/torch27-cxx11-cu128-x86_64-linux/optimizer/{_optimizer_20250911094409.abi3.so → _optimizer_ee6ed44_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b1729faaee0dd55134348a0d775c147cf3aaba106e0475e1389159d48dfc1ebe
-size 1883360
+oid sha256:5f8bf16b0ae5af74852e8c890183c8c32175886c3d0366cfc776fb3e1ee15906
+size 1883352
build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED
(Identical patch to the muon.py diff above.)
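Note on the AdamW backup path: the per-parameter Python loop with explicit bias correction is replaced by batching parameters and calling PyTorch's fused kernel, which is why step counters become float32 scalar tensors instead of Python ints. The sketch below shows one fused step over same-device CUDA tensors; the call signature mirrors what the new _fused_adamw helper in the diff uses, but torch._fused_adamw_ is a private op, so treat this as a sketch rather than a stable API.

# Sketch only; params/grads/moments must be CUDA tensors of matching shapes.
import torch

def fused_adamw_step(params, grads, exp_avgs, exp_avg_sqs, state_steps,
                     lr=1e-3, beta1=0.9, beta2=0.95, eps=1e-8, weight_decay=0.1):
    # state_steps: float32 scalar tensors on the params' device, as in the patch.
    torch._foreach_add_(state_steps, 1)  # advance every step counter in one call
    torch._fused_adamw_(
        params, grads, exp_avgs, exp_avg_sqs,
        [],            # max_exp_avg_sqs stays empty because amsgrad=False
        state_steps,
        amsgrad=False, lr=lr, beta1=beta1, beta2=beta2,
        weight_decay=weight_decay, eps=eps, maximize=False,
    )

The helper in the diff additionally routes tensors through torch.optim.Optimizer._group_tensors_by_device_and_dtype, so each device/dtype bucket gets its own fused call.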
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED
(Identical namespace rename to the _ops.py diffs above.)
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_20250911094409.abi3.so → _optimizer_ee6ed44_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0857945a1ebfdbb6c7219d0b96c8ab47649aa3b47b65fa800c84b51ddbda9c19
-size 1749880
+oid sha256:d50267ec23db9512ae1d82c99012901d58e50dee9bf34346702561a5d3e6d9e7
+size 1749840
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED
@@ -2,6 +2,7 @@ import logging
2
  import math
3
  import types
4
  from dataclasses import dataclass
 
5
 
6
  import torch
7
  import torch.distributed as dist
@@ -12,6 +13,8 @@ logger = logging.getLogger(__name__)
12
 
13
  # This code snippet is a modified version adapted from the following GitHub repositories:
14
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
15
  @torch.no_grad()
16
  def _zeropower_via_newtonschulz5(G, steps):
17
  """
@@ -24,15 +27,21 @@ def _zeropower_via_newtonschulz5(G, steps):
24
  performance at all relative to UV^T, where USV^T = G is the SVD.
25
  """
26
  assert len(G.shape) == 2
27
- a, b, c = (3.4445, -4.7750, 2.0315)
28
  X = G # no manual typecast
 
29
  if G.size(0) > G.size(1):
30
  X = X.T
31
  # Ensure spectral norm is at most 1
32
  X = X / (X.norm() + 1e-7)
33
- X = X.bfloat16()
34
  # Perform the NS iterations
35
- for _ in range(steps):
 
 
 
 
 
 
36
  A = X @ X.T
37
  # B = (
38
  # b * A + c * A @ A
@@ -43,7 +52,7 @@ def _zeropower_via_newtonschulz5(G, steps):
43
 
44
  if G.size(0) > G.size(1):
45
  X = X.T
46
- return X.to(G.dtype)
47
 
48
 
49
  @dataclass
@@ -65,17 +74,19 @@ def _gather(p, state, rank, comm_stream, none_grad):
65
  Gather the gradients to worker_rank.
66
  If none_grad is True, free p.grad after the gather.
67
  """
68
- g = p.grad
69
-
70
- if rank == state.worker_rank:
71
- num_ranks = dist.get_world_size(group=state.process_group)
72
- gather_list = [
73
- torch.empty_like(g.to_local()) for _ in range(num_ranks)
74
- ]
75
- else:
76
- gather_list = None
77
-
78
  with torch.cuda.stream(comm_stream):
 
 
 
 
 
 
 
 
 
 
 
 
79
  torch.distributed.gather(
80
  g.to_local(),
81
  dst=state.worker_rank,
@@ -92,6 +103,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
92
  else:
93
  state.gathered_grad = None
94
  state.gather_event = None
 
95
  if none_grad:
96
  # We can safely free p.grad without calling record_stream:
97
  # p.grad.to_local().record_stream(comm_stream)
@@ -104,7 +116,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
104
 
105
 
106
  @torch.no_grad()
107
- def _compute_u(state, steps, rank, compute_stream):
108
  """
109
  On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
110
  """
@@ -115,11 +127,11 @@ def _compute_u(state, steps, rank, compute_stream):
115
  compute_stream.wait_event(state.gather_event)
116
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
117
  state.computed_u = u
118
- state.compute_event = torch.cuda.Event()
119
- state.compute_event.record()
120
- else:
121
- state.computed_u = None
122
- state.compute_event = None
123
 
124
 
125
  @torch.no_grad()
@@ -129,12 +141,12 @@ def _scatter(p, state, rank, comm_stream):
129
  """
130
 
131
  with torch.cuda.stream(comm_stream):
 
 
 
 
132
  if rank == state.worker_rank:
133
  num_ranks = dist.get_world_size(group=state.process_group)
134
- if state.compute_event is None:
135
- raise RuntimeError("Compute event must be set before scatter.")
136
- comm_stream.wait_event(state.compute_event)
137
-
138
  # Clear the gathered gradient to free memory
139
  state.gathered_grad = None
140
 
@@ -144,22 +156,15 @@ def _scatter(p, state, rank, comm_stream):
144
  else:
145
  scatter_list = None
146
 
147
- u_received = torch.empty_like(p.to_local())
148
  torch.distributed.scatter(
149
- u_received,
150
  scatter_list=scatter_list,
151
  src=state.worker_rank,
152
  group=state.process_group,
153
  )
154
- u_dtensor = DTensor.from_local(
155
- u_received,
156
- placements=p.placements,
157
- device_mesh=p.device_mesh,
158
- )
159
-
160
- state.scattered_u = u_dtensor
161
  state.scatter_event = torch.cuda.Event()
162
  state.scatter_event.record()
 
163
 
164
 
165
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -172,11 +177,21 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
172
  if state.scatter_event is None:
173
  raise RuntimeError("Scatter event must be set before update")
174
  compute_stream.wait_event(state.scatter_event)
 
 
 
 
 
 
 
 
175
  if rank == state.worker_rank:
176
  # Free computed_u
177
  state.computed_u = None
178
 
179
  Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
 
 
180
 
181
 
182
  def default_is_muon(name, x):
@@ -375,7 +390,8 @@ class Muon(torch.optim.Optimizer):
375
  else:
376
  g = buf
377
 
378
- u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
 
379
 
380
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
381
  Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
@@ -433,7 +449,7 @@ class Muon(torch.optim.Optimizer):
433
  def enqueue_computes(start_idx, chunk_size):
434
  for p in ordered_params[start_idx:start_idx + chunk_size]:
435
  state = param_to_state[id(p)]
436
- _compute_u(state, group["ns_steps"], self.rank,
437
  self.compute_stream)
438
 
439
  def enqueue_scatters(start_idx, chunk_size):
@@ -466,6 +482,77 @@ class Muon(torch.optim.Optimizer):
466
  # Wait the last update_param to finish
467
  torch.cuda.current_stream().wait_stream(self.compute_stream)
468
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  def step(self, closure=None):
470
  """Perform a single optimization step.
471
 
@@ -542,6 +629,12 @@ class Muon(torch.optim.Optimizer):
542
  # AdamW backup #
543
  ############################
544
 
 
 
 
 
 
 
545
  lr = group["lr"]
546
  beta1, beta2 = group["adamw_betas"]
547
  eps = group["adamw_eps"]
@@ -552,23 +645,38 @@ class Muon(torch.optim.Optimizer):
552
  if g is None:
553
  continue
554
  state = self.state[p]
 
 
555
  if "step" not in state:
556
- state["step"] = 0
 
 
557
  state["moment1"] = torch.zeros_like(g)
558
  state["moment2"] = torch.zeros_like(g)
559
- state["step"] += 1
560
- step = state["step"]
561
- buf1 = state["moment1"]
562
- buf2 = state["moment2"]
563
- buf1.lerp_(g, 1 - beta1)
564
- buf2.lerp_(g.square(), 1 - beta2)
565
-
566
- g = buf1 / (eps + buf2.sqrt())
567
-
568
- bias_correction1 = 1 - beta1**step
569
- bias_correction2 = 1 - beta2**step
570
- scale = bias_correction1 / bias_correction2**0.5
571
- p.data.mul_(1 - lr * weight_decay)
572
- p.data.add_(g, alpha=-lr / scale)
 
 
 
 
 
 
 
 
 
 
 
573
 
574
  return loss
 
2
  import math
3
  import types
4
  from dataclasses import dataclass
5
+ from typing import Optional, Union, cast
6
 
7
  import torch
8
  import torch.distributed as dist
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repositories:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
+ # Muon's Newton–Schulz iteration causes high variance in the singular values.
17
+ # Idea: give each iteration its own three coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
 
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
+ assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
+
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
37
  # Perform the NS iterations
38
+ for a, b, c in [
39
+ (4.0848, -6.8946, 2.9270),
40
+ (3.9505, -6.3029, 2.6377),
41
+ (3.7418, -5.5913, 2.3037),
42
+ (2.8769, -3.1427, 1.2046),
43
+ (2.8366, -3.0525, 1.2012),
44
+ ]:
45
  A = X @ X.T
46
  # B = (
47
  # b * A + c * A @ A
 
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
55
+ return X
56
 
57
 
58
  @dataclass
 
74
  Gather the gradients to worker_rank.
75
  If none_grad is True, free p.grad after the gather.
76
  """
 
 
 
 
 
 
 
 
 
 
77
  with torch.cuda.stream(comm_stream):
78
+ g = p.grad
79
+
80
+ if rank == state.worker_rank:
81
+ num_ranks = dist.get_world_size(group=state.process_group)
82
+ gather_list = [
83
+ torch.empty_like(g.to_local(), dtype=torch.bfloat16)
84
+ for _ in range(num_ranks)
85
+ ]
86
+ else:
87
+ gather_list = None
88
+
89
+ g = g.to(torch.bfloat16)
90
  torch.distributed.gather(
91
  g.to_local(),
92
  dst=state.worker_rank,
 
103
  else:
104
  state.gathered_grad = None
105
  state.gather_event = None
106
+ gather_list = None
107
  if none_grad:
108
  # We can safely free p.grad without calling record_stream:
109
  # p.grad.to_local().record_stream(comm_stream)
 
116
 
117
 
118
  @torch.no_grad()
119
+ def _compute_u(p, state, steps, rank, compute_stream):
120
  """
121
  On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
122
  """
 
127
  compute_stream.wait_event(state.gather_event)
128
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
129
  state.computed_u = u
130
+ state.scattered_u = torch.empty_like(p.to_local(),
131
+ dtype=torch.bfloat16)
132
+ state.compute_event = torch.cuda.Event()
133
+ state.compute_event.record()
134
+ u = None
135
 
136
 
137
  @torch.no_grad()
 
141
  """
142
 
143
  with torch.cuda.stream(comm_stream):
144
+ if state.compute_event is None:
145
+ raise RuntimeError("Compute event must be set before scatter.")
146
+ comm_stream.wait_event(state.compute_event)
147
+
148
  if rank == state.worker_rank:
149
  num_ranks = dist.get_world_size(group=state.process_group)
 
 
 
 
150
  # Clear the gathered gradient to free memory
151
  state.gathered_grad = None
152
 
 
156
  else:
157
  scatter_list = None
158
 
 
159
  torch.distributed.scatter(
160
+ state.scattered_u,
161
  scatter_list=scatter_list,
162
  src=state.worker_rank,
163
  group=state.process_group,
164
  )
 
 
 
 
 
 
 
165
  state.scatter_event = torch.cuda.Event()
166
  state.scatter_event.record()
167
+ scatter_list = None
168
 
169
 
170
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
177
  if state.scatter_event is None:
178
  raise RuntimeError("Scatter event must be set before update")
179
  compute_stream.wait_event(state.scatter_event)
180
+ u_dtensor = DTensor.from_local(
181
+ state.scattered_u,
182
+ placements=p.placements,
183
+ device_mesh=p.device_mesh,
184
+ )
185
+
186
+ state.scattered_u = u_dtensor
187
+
188
  if rank == state.worker_rank:
189
  # Free computed_u
190
  state.computed_u = None
191
 
192
  Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
193
+ state.scattered_u = None
194
+ u_dtensor = None
195
 
196
 
197
  def default_is_muon(name, x):
 
390
  else:
391
  g = buf
392
 
393
+ u = _zeropower_via_newtonschulz5(g.bfloat16(),
394
+ steps=group["ns_steps"])
395
 
396
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
397
  Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
 
449
  def enqueue_computes(start_idx, chunk_size):
450
  for p in ordered_params[start_idx:start_idx + chunk_size]:
451
  state = param_to_state[id(p)]
452
+ _compute_u(p, state, group["ns_steps"], self.rank,
453
  self.compute_stream)
454
 
455
  def enqueue_scatters(start_idx, chunk_size):
 
482
  # Wait for the last update_param to finish
483
  torch.cuda.current_stream().wait_stream(self.compute_stream)
484
 
485
+ @staticmethod
486
+ def _fused_adamw(
487
+ params: list[torch.Tensor],
488
+ grads: list[torch.Tensor],
489
+ exp_avgs: list[torch.Tensor],
490
+ exp_avg_sqs: list[torch.Tensor],
491
+ max_exp_avg_sqs: list[torch.Tensor],
492
+ state_steps: list[torch.Tensor],
493
+ amsgrad: bool,
494
+ beta1: float,
495
+ beta2: float,
496
+ lr: Union[float, torch.Tensor],
497
+ weight_decay: float,
498
+ eps: float,
499
+ maximize: bool,
500
+ ) -> None:
501
+ if not params:
502
+ return
503
+
504
+ # We only shuffle around the lr when it is a Tensor and on CUDA; otherwise, we prefer
505
+ # treating it as a scalar.
506
+ lr_dict: Optional[DeviceDict] = ({
507
+ lr.device: lr
508
+ } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
509
+ None)
510
+ grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
511
+ [
512
+ params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
513
+ state_steps
514
+ ] # type: ignore[list-item]
515
+ )
516
+ for (device, _), (
517
+ (
518
+ device_params_,
519
+ device_grads_,
520
+ device_exp_avgs_,
521
+ device_exp_avg_sqs_,
522
+ device_max_exp_avg_sqs,
523
+ device_state_steps_,
524
+ ),
525
+ _,
526
+ ) in grouped_tensors.items():
527
+ device_params = cast(list[torch.Tensor], device_params_)
528
+ device_grads = cast(list[torch.Tensor], device_grads_)
529
+ device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
530
+ device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
531
+ device_state_steps = cast(list[torch.Tensor], device_state_steps_)
532
+
533
+ if lr_dict is not None and device not in lr_dict:
534
+ lr_dict[device] = lr.to(
535
+ device=device,
536
+ non_blocking=True) # type: ignore[union-attr]
537
+ lr = lr_dict[device]
538
+ torch._foreach_add_(device_state_steps, 1)
539
+ func = torch._fused_adamw_
540
+ func(
541
+ device_params,
542
+ device_grads,
543
+ device_exp_avgs,
544
+ device_exp_avg_sqs,
545
+ device_max_exp_avg_sqs, # type: ignore[arg-type]
546
+ device_state_steps,
547
+ amsgrad=amsgrad,
548
+ lr=lr, # type: ignore[arg-type]
549
+ beta1=beta1,
550
+ beta2=beta2,
551
+ weight_decay=weight_decay,
552
+ eps=eps,
553
+ maximize=maximize,
554
+ )
555
+
556
  def step(self, closure=None):
557
  """Perform a single optimization step.
558
 
 
629
  # AdamW backup #
630
  ############################
631
 
632
+ params_with_grads = []
633
+ grads = []
634
+ moment1 = []
635
+ moment2 = []
636
+ max_exp_avg_sqs = []
637
+ state_steps = []
638
  lr = group["lr"]
639
  beta1, beta2 = group["adamw_betas"]
640
  eps = group["adamw_eps"]
 
645
  if g is None:
646
  continue
647
  state = self.state[p]
648
+ params_with_grads.append(p)
649
+ grads.append(g)
650
  if "step" not in state:
651
+ state["step"] = (torch.zeros((),
652
+ dtype=torch.float32,
653
+ device=p.device))
654
  state["moment1"] = torch.zeros_like(g)
655
  state["moment2"] = torch.zeros_like(g)
656
+ moment1.append(state["moment1"])
657
+ moment2.append(state["moment2"])
658
+ if not isinstance(state["step"], torch.Tensor):
659
+ step_tensor = torch.tensor(state["step"],
660
+ dtype=torch.float32,
661
+ device=p.device)
662
+ else:
663
+ step_tensor = state["step"]
664
+ state_steps.append(step_tensor)
665
+
666
+ self._fused_adamw(
667
+ params_with_grads,
668
+ grads,
669
+ moment1,
670
+ moment2,
671
+ max_exp_avg_sqs,
672
+ state_steps,
673
+ amsgrad=False,
674
+ beta1=beta1,
675
+ beta2=beta2,
676
+ lr=lr,
677
+ weight_decay=weight_decay,
678
+ eps=eps,
679
+ maximize=False,
680
+ )
681
 
682
  return loss
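
The `_zeropower_via_newtonschulz5` hunks above make three coupled changes: the single coefficient triple `(3.4445, -4.7750, 2.0315)` looped `steps` times becomes five tuned `(a, b, c)` triples (one per iteration, so the table, not the `steps` argument, now fixes the iteration count), the internal `X.bfloat16()` cast becomes a caller-side precondition (`assert G.dtype == torch.bfloat16`), and the result is returned in bfloat16 rather than cast back to `G.dtype`. A minimal sketch of the resulting function, spelling out the standard quintic update that the commented-out `B = ...` lines in the hunk abbreviate:

```python
import torch

# The five tuned (a, b, c) triples from the hunk; one triple per iteration.
NS_COEFFS = [
    (4.0848, -6.8946, 2.9270),
    (3.9505, -6.3029, 2.6377),
    (3.7418, -5.5913, 2.3037),
    (2.8769, -3.1427, 1.2046),
    (2.8366, -3.0525, 1.2012),
]

@torch.no_grad()
def orthogonalize_bf16(G: torch.Tensor) -> torch.Tensor:
    # The cast was removed from the function, so bf16 is now a precondition.
    assert G.ndim == 2 and G.dtype == torch.bfloat16
    X = G.T if G.size(0) > G.size(1) else G
    X = X / (X.norm() + 1e-7)  # Frobenius normalization bounds the spectral norm near 1
    for a, b, c in NS_COEFFS:
        A = X @ X.T
        X = a * X + (b * A + c * A @ A) @ X  # quintic Newton-Schulz step
    return X.T if G.size(0) > G.size(1) else X

# e.g. U = orthogonalize_bf16(torch.randn(256, 512).bfloat16())
# then U @ U.T is approximately the identity, up to bf16/NS error.
```

Since the gathered gradient is already bfloat16 after the communication change in `_gather`, no cast remains on the hot path.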
build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_20250911094409
3
- ops = torch.ops._optimizer_20250911094409
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_20250911094409::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_ee6ed44_dirty
3
+ ops = torch.ops._optimizer_ee6ed44_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_ee6ed44_dirty::{op_name}"
build/{torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_ee6ed44_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5908748e60a61c59e315fbba8b32e3867a4b673b587a2a9606ddde5b4f67da5
3
  size 1824264
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80ce6b0d62167a8ea10b6e2a1f90df70aa108997570c0ed210f458debd26f32f
3
  size 1824264
build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -2,6 +2,7 @@ import logging
2
  import math
3
  import types
4
  from dataclasses import dataclass
 
5
 
6
  import torch
7
  import torch.distributed as dist
@@ -12,6 +13,8 @@ logger = logging.getLogger(__name__)
12
 
13
  # This code snippet is a modified version adapted from the following GitHub repositories:
14
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
15
  @torch.no_grad()
16
  def _zeropower_via_newtonschulz5(G, steps):
17
  """
@@ -24,15 +27,21 @@ def _zeropower_via_newtonschulz5(G, steps):
24
  performance at all relative to UV^T, where USV^T = G is the SVD.
25
  """
26
  assert len(G.shape) == 2
27
- a, b, c = (3.4445, -4.7750, 2.0315)
28
  X = G # no manual typecast
 
29
  if G.size(0) > G.size(1):
30
  X = X.T
31
  # Ensure spectral norm is at most 1
32
  X = X / (X.norm() + 1e-7)
33
- X = X.bfloat16()
34
  # Perform the NS iterations
35
- for _ in range(steps):
 
 
 
 
 
 
36
  A = X @ X.T
37
  # B = (
38
  # b * A + c * A @ A
@@ -43,7 +52,7 @@ def _zeropower_via_newtonschulz5(G, steps):
43
 
44
  if G.size(0) > G.size(1):
45
  X = X.T
46
- return X.to(G.dtype)
47
 
48
 
49
  @dataclass
@@ -65,17 +74,19 @@ def _gather(p, state, rank, comm_stream, none_grad):
65
  Gather the gradients to worker_rank.
66
  If none_grad is True, free p.grad after the gather.
67
  """
68
- g = p.grad
69
-
70
- if rank == state.worker_rank:
71
- num_ranks = dist.get_world_size(group=state.process_group)
72
- gather_list = [
73
- torch.empty_like(g.to_local()) for _ in range(num_ranks)
74
- ]
75
- else:
76
- gather_list = None
77
-
78
  with torch.cuda.stream(comm_stream):
 
 
 
 
 
 
 
 
 
 
 
 
79
  torch.distributed.gather(
80
  g.to_local(),
81
  dst=state.worker_rank,
@@ -92,6 +103,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
92
  else:
93
  state.gathered_grad = None
94
  state.gather_event = None
 
95
  if none_grad:
96
  # We can safely free p.grad without calling record_stream:
97
  # p.grad.to_local().record_stream(comm_stream)
@@ -104,7 +116,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
104
 
105
 
106
  @torch.no_grad()
107
- def _compute_u(state, steps, rank, compute_stream):
108
  """
109
  On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
110
  """
@@ -115,11 +127,11 @@ def _compute_u(state, steps, rank, compute_stream):
115
  compute_stream.wait_event(state.gather_event)
116
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
117
  state.computed_u = u
118
- state.compute_event = torch.cuda.Event()
119
- state.compute_event.record()
120
- else:
121
- state.computed_u = None
122
- state.compute_event = None
123
 
124
 
125
  @torch.no_grad()
@@ -129,12 +141,12 @@ def _scatter(p, state, rank, comm_stream):
129
  """
130
 
131
  with torch.cuda.stream(comm_stream):
 
 
 
 
132
  if rank == state.worker_rank:
133
  num_ranks = dist.get_world_size(group=state.process_group)
134
- if state.compute_event is None:
135
- raise RuntimeError("Compute event must be set before scatter.")
136
- comm_stream.wait_event(state.compute_event)
137
-
138
  # Clear the gathered gradient to free memory
139
  state.gathered_grad = None
140
 
@@ -144,22 +156,15 @@ def _scatter(p, state, rank, comm_stream):
144
  else:
145
  scatter_list = None
146
 
147
- u_received = torch.empty_like(p.to_local())
148
  torch.distributed.scatter(
149
- u_received,
150
  scatter_list=scatter_list,
151
  src=state.worker_rank,
152
  group=state.process_group,
153
  )
154
- u_dtensor = DTensor.from_local(
155
- u_received,
156
- placements=p.placements,
157
- device_mesh=p.device_mesh,
158
- )
159
-
160
- state.scattered_u = u_dtensor
161
  state.scatter_event = torch.cuda.Event()
162
  state.scatter_event.record()
 
163
 
164
 
165
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -172,11 +177,21 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
172
  if state.scatter_event is None:
173
  raise RuntimeError("Scatter event must be set before update")
174
  compute_stream.wait_event(state.scatter_event)
 
 
 
 
 
 
 
 
175
  if rank == state.worker_rank:
176
  # Free computed_u
177
  state.computed_u = None
178
 
179
  Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
 
 
180
 
181
 
182
  def default_is_muon(name, x):
@@ -375,7 +390,8 @@ class Muon(torch.optim.Optimizer):
375
  else:
376
  g = buf
377
 
378
- u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
 
379
 
380
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
381
  Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
@@ -433,7 +449,7 @@ class Muon(torch.optim.Optimizer):
433
  def enqueue_computes(start_idx, chunk_size):
434
  for p in ordered_params[start_idx:start_idx + chunk_size]:
435
  state = param_to_state[id(p)]
436
- _compute_u(state, group["ns_steps"], self.rank,
437
  self.compute_stream)
438
 
439
  def enqueue_scatters(start_idx, chunk_size):
@@ -466,6 +482,77 @@ class Muon(torch.optim.Optimizer):
466
  # Wait for the last update_param to finish
467
  torch.cuda.current_stream().wait_stream(self.compute_stream)
468
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  def step(self, closure=None):
470
  """Perform a single optimization step.
471
 
@@ -542,6 +629,12 @@ class Muon(torch.optim.Optimizer):
542
  # AdamW backup #
543
  ############################
544
 
 
 
 
 
 
 
545
  lr = group["lr"]
546
  beta1, beta2 = group["adamw_betas"]
547
  eps = group["adamw_eps"]
@@ -552,23 +645,38 @@ class Muon(torch.optim.Optimizer):
552
  if g is None:
553
  continue
554
  state = self.state[p]
 
 
555
  if "step" not in state:
556
- state["step"] = 0
 
 
557
  state["moment1"] = torch.zeros_like(g)
558
  state["moment2"] = torch.zeros_like(g)
559
- state["step"] += 1
560
- step = state["step"]
561
- buf1 = state["moment1"]
562
- buf2 = state["moment2"]
563
- buf1.lerp_(g, 1 - beta1)
564
- buf2.lerp_(g.square(), 1 - beta2)
565
-
566
- g = buf1 / (eps + buf2.sqrt())
567
-
568
- bias_correction1 = 1 - beta1**step
569
- bias_correction2 = 1 - beta2**step
570
- scale = bias_correction1 / bias_correction2**0.5
571
- p.data.mul_(1 - lr * weight_decay)
572
- p.data.add_(g, alpha=-lr / scale)
 
 
 
 
 
 
 
 
 
 
 
573
 
574
  return loss
 
2
  import math
3
  import types
4
  from dataclasses import dataclass
5
+ from typing import Optional, Union, cast
6
 
7
  import torch
8
  import torch.distributed as dist
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repositories:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
+ # Muon's Newton–Schulz iteration causes high variance in the singular values.
17
+ # Idea: give each iteration its own three coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
 
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
+ assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
+
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
37
  # Perform the NS iterations
38
+ for a, b, c in [
39
+ (4.0848, -6.8946, 2.9270),
40
+ (3.9505, -6.3029, 2.6377),
41
+ (3.7418, -5.5913, 2.3037),
42
+ (2.8769, -3.1427, 1.2046),
43
+ (2.8366, -3.0525, 1.2012),
44
+ ]:
45
  A = X @ X.T
46
  # B = (
47
  # b * A + c * A @ A
 
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
55
+ return X
56
 
57
 
58
  @dataclass
 
74
  Gather the gradients to worker_rank.
75
  If none_grad is True, free p.grad after the gather.
76
  """
 
 
 
 
 
 
 
 
 
 
77
  with torch.cuda.stream(comm_stream):
78
+ g = p.grad
79
+
80
+ if rank == state.worker_rank:
81
+ num_ranks = dist.get_world_size(group=state.process_group)
82
+ gather_list = [
83
+ torch.empty_like(g.to_local(), dtype=torch.bfloat16)
84
+ for _ in range(num_ranks)
85
+ ]
86
+ else:
87
+ gather_list = None
88
+
89
+ g = g.to(torch.bfloat16)
90
  torch.distributed.gather(
91
  g.to_local(),
92
  dst=state.worker_rank,
 
103
  else:
104
  state.gathered_grad = None
105
  state.gather_event = None
106
+ gather_list = None
107
  if none_grad:
108
  # We can safely free p.grad without calling record_stream:
109
  # p.grad.to_local().record_stream(comm_stream)
 
116
 
117
 
118
  @torch.no_grad()
119
+ def _compute_u(p, state, steps, rank, compute_stream):
120
  """
121
  On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
122
  """
 
127
  compute_stream.wait_event(state.gather_event)
128
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
129
  state.computed_u = u
130
+ state.scattered_u = torch.empty_like(p.to_local(),
131
+ dtype=torch.bfloat16)
132
+ state.compute_event = torch.cuda.Event()
133
+ state.compute_event.record()
134
+ u = None
135
 
136
 
137
  @torch.no_grad()
 
141
  """
142
 
143
  with torch.cuda.stream(comm_stream):
144
+ if state.compute_event is None:
145
+ raise RuntimeError("Compute event must be set before scatter.")
146
+ comm_stream.wait_event(state.compute_event)
147
+
148
  if rank == state.worker_rank:
149
  num_ranks = dist.get_world_size(group=state.process_group)
 
 
 
 
150
  # Clear the gathered gradient to free memory
151
  state.gathered_grad = None
152
 
 
156
  else:
157
  scatter_list = None
158
 
 
159
  torch.distributed.scatter(
160
+ state.scattered_u,
161
  scatter_list=scatter_list,
162
  src=state.worker_rank,
163
  group=state.process_group,
164
  )
 
 
 
 
 
 
 
165
  state.scatter_event = torch.cuda.Event()
166
  state.scatter_event.record()
167
+ scatter_list = None
168
 
169
 
170
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
177
  if state.scatter_event is None:
178
  raise RuntimeError("Scatter event must be set before update")
179
  compute_stream.wait_event(state.scatter_event)
180
+ u_dtensor = DTensor.from_local(
181
+ state.scattered_u,
182
+ placements=p.placements,
183
+ device_mesh=p.device_mesh,
184
+ )
185
+
186
+ state.scattered_u = u_dtensor
187
+
188
  if rank == state.worker_rank:
189
  # Free computed_u
190
  state.computed_u = None
191
 
192
  Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
193
+ state.scattered_u = None
194
+ u_dtensor = None
195
 
196
 
197
  def default_is_muon(name, x):
 
390
  else:
391
  g = buf
392
 
393
+ u = _zeropower_via_newtonschulz5(g.bfloat16(),
394
+ steps=group["ns_steps"])
395
 
396
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
397
  Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
 
449
  def enqueue_computes(start_idx, chunk_size):
450
  for p in ordered_params[start_idx:start_idx + chunk_size]:
451
  state = param_to_state[id(p)]
452
+ _compute_u(p, state, group["ns_steps"], self.rank,
453
  self.compute_stream)
454
 
455
  def enqueue_scatters(start_idx, chunk_size):
 
482
  # Wait for the last update_param to finish
483
  torch.cuda.current_stream().wait_stream(self.compute_stream)
484
 
485
+ @staticmethod
486
+ def _fused_adamw(
487
+ params: list[torch.Tensor],
488
+ grads: list[torch.Tensor],
489
+ exp_avgs: list[torch.Tensor],
490
+ exp_avg_sqs: list[torch.Tensor],
491
+ max_exp_avg_sqs: list[torch.Tensor],
492
+ state_steps: list[torch.Tensor],
493
+ amsgrad: bool,
494
+ beta1: float,
495
+ beta2: float,
496
+ lr: Union[float, torch.Tensor],
497
+ weight_decay: float,
498
+ eps: float,
499
+ maximize: bool,
500
+ ) -> None:
501
+ if not params:
502
+ return
503
+
504
+ # We only shuffle around the lr when it is a Tensor and on CUDA; otherwise, we prefer
505
+ # treating it as a scalar.
506
+ lr_dict: Optional[DeviceDict] = ({
507
+ lr.device: lr
508
+ } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
509
+ None)
510
+ grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
511
+ [
512
+ params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
513
+ state_steps
514
+ ] # type: ignore[list-item]
515
+ )
516
+ for (device, _), (
517
+ (
518
+ device_params_,
519
+ device_grads_,
520
+ device_exp_avgs_,
521
+ device_exp_avg_sqs_,
522
+ device_max_exp_avg_sqs,
523
+ device_state_steps_,
524
+ ),
525
+ _,
526
+ ) in grouped_tensors.items():
527
+ device_params = cast(list[torch.Tensor], device_params_)
528
+ device_grads = cast(list[torch.Tensor], device_grads_)
529
+ device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
530
+ device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
531
+ device_state_steps = cast(list[torch.Tensor], device_state_steps_)
532
+
533
+ if lr_dict is not None and device not in lr_dict:
534
+ lr_dict[device] = lr.to(
535
+ device=device,
536
+ non_blocking=True) # type: ignore[union-attr]
537
+ lr = lr_dict[device]
538
+ torch._foreach_add_(device_state_steps, 1)
539
+ func = torch._fused_adamw_
540
+ func(
541
+ device_params,
542
+ device_grads,
543
+ device_exp_avgs,
544
+ device_exp_avg_sqs,
545
+ device_max_exp_avg_sqs, # type: ignore[arg-type]
546
+ device_state_steps,
547
+ amsgrad=amsgrad,
548
+ lr=lr, # type: ignore[arg-type]
549
+ beta1=beta1,
550
+ beta2=beta2,
551
+ weight_decay=weight_decay,
552
+ eps=eps,
553
+ maximize=maximize,
554
+ )
555
+
556
  def step(self, closure=None):
557
  """Perform a single optimization step.
558
 
 
629
  # AdamW backup #
630
  ############################
631
 
632
+ params_with_grads = []
633
+ grads = []
634
+ moment1 = []
635
+ moment2 = []
636
+ max_exp_avg_sqs = []
637
+ state_steps = []
638
  lr = group["lr"]
639
  beta1, beta2 = group["adamw_betas"]
640
  eps = group["adamw_eps"]
 
645
  if g is None:
646
  continue
647
  state = self.state[p]
648
+ params_with_grads.append(p)
649
+ grads.append(g)
650
  if "step" not in state:
651
+ state["step"] = (torch.zeros((),
652
+ dtype=torch.float32,
653
+ device=p.device))
654
  state["moment1"] = torch.zeros_like(g)
655
  state["moment2"] = torch.zeros_like(g)
656
+ moment1.append(state["moment1"])
657
+ moment2.append(state["moment2"])
658
+ if not isinstance(state["step"], torch.Tensor):
659
+ step_tensor = torch.tensor(state["step"],
660
+ dtype=torch.float32,
661
+ device=p.device)
662
+ else:
663
+ step_tensor = state["step"]
664
+ state_steps.append(step_tensor)
665
+
666
+ self._fused_adamw(
667
+ params_with_grads,
668
+ grads,
669
+ moment1,
670
+ moment2,
671
+ max_exp_avg_sqs,
672
+ state_steps,
673
+ amsgrad=False,
674
+ beta1=beta1,
675
+ beta2=beta2,
676
+ lr=lr,
677
+ weight_decay=weight_decay,
678
+ eps=eps,
679
+ maximize=False,
680
+ )
681
 
682
  return loss
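
The `_gather` rewrite in this file (identical across the build variants) does two things: all buffer allocation and the dtype cast now happen inside the `comm_stream` context, and the gradient is converted to bfloat16 before the collective, halving gather (and later scatter) traffic relative to float32. A minimal sketch with plain tensors on the default process group; the real code operates on DTensor local shards with a per-state `process_group` and records a CUDA event afterwards:

```python
from typing import Optional

import torch
import torch.distributed as dist

def gather_grad_bf16(local_grad: torch.Tensor,
                     worker_rank: int) -> Optional[torch.Tensor]:
    """Gather bfloat16 gradient shards onto worker_rank (default group).

    Casting before the collective halves bytes on the wire vs float32;
    Newton-Schulz then runs directly on the bf16 result, so nothing is
    cast back until the update is applied.
    """
    g = local_grad.to(torch.bfloat16)
    if dist.get_rank() == worker_rank:
        gather_list = [torch.empty_like(g)
                       for _ in range(dist.get_world_size())]
    else:
        gather_list = None
    dist.gather(g, gather_list=gather_list, dst=worker_rank)
    if gather_list is not None:
        # Assumes equally sized dim-0 shards; the real code leaves the
        # layout to the DTensor machinery.
        return torch.cat(gather_list, dim=0)
    return None
```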
build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_20250911094409
3
- ops = torch.ops._optimizer_20250911094409
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_20250911094409::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_ee6ed44_dirty
3
+ ops = torch.ops._optimizer_ee6ed44_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_ee6ed44_dirty::{op_name}"
build/torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:22dc3ab77ab74837126281f79f417c5d55b2cc9885388fd9d3a1c7c824ece2bd
3
- size 1883360
 
 
 
 
build/torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_ee6ed44_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3487612a8f022a1df1353945fc6d65bbd6797179b06c5d3202dc6e2aa6afb27a
3
+ size 1883352
build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED
@@ -2,6 +2,7 @@ import logging
2
  import math
3
  import types
4
  from dataclasses import dataclass
 
5
 
6
  import torch
7
  import torch.distributed as dist
@@ -12,6 +13,8 @@ logger = logging.getLogger(__name__)
12
 
13
  # This code snippet is a modified version adapted from the following GitHub repositories:
14
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
15
  @torch.no_grad()
16
  def _zeropower_via_newtonschulz5(G, steps):
17
  """
@@ -24,15 +27,21 @@ def _zeropower_via_newtonschulz5(G, steps):
24
  performance at all relative to UV^T, where USV^T = G is the SVD.
25
  """
26
  assert len(G.shape) == 2
27
- a, b, c = (3.4445, -4.7750, 2.0315)
28
  X = G # no manual typecast
 
29
  if G.size(0) > G.size(1):
30
  X = X.T
31
  # Ensure spectral norm is at most 1
32
  X = X / (X.norm() + 1e-7)
33
- X = X.bfloat16()
34
  # Perform the NS iterations
35
- for _ in range(steps):
 
 
 
 
 
 
36
  A = X @ X.T
37
  # B = (
38
  # b * A + c * A @ A
@@ -43,7 +52,7 @@ def _zeropower_via_newtonschulz5(G, steps):
43
 
44
  if G.size(0) > G.size(1):
45
  X = X.T
46
- return X.to(G.dtype)
47
 
48
 
49
  @dataclass
@@ -65,17 +74,19 @@ def _gather(p, state, rank, comm_stream, none_grad):
65
  Gather the gradients to worker_rank.
66
  If none_grad is True, free p.grad after the gather.
67
  """
68
- g = p.grad
69
-
70
- if rank == state.worker_rank:
71
- num_ranks = dist.get_world_size(group=state.process_group)
72
- gather_list = [
73
- torch.empty_like(g.to_local()) for _ in range(num_ranks)
74
- ]
75
- else:
76
- gather_list = None
77
-
78
  with torch.cuda.stream(comm_stream):
 
 
 
 
 
 
 
 
 
 
 
 
79
  torch.distributed.gather(
80
  g.to_local(),
81
  dst=state.worker_rank,
@@ -92,6 +103,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
92
  else:
93
  state.gathered_grad = None
94
  state.gather_event = None
 
95
  if none_grad:
96
  # We can safely free p.grad without calling record_stream:
97
  # p.grad.to_local().record_stream(comm_stream)
@@ -104,7 +116,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
104
 
105
 
106
  @torch.no_grad()
107
- def _compute_u(state, steps, rank, compute_stream):
108
  """
109
  On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
110
  """
@@ -115,11 +127,11 @@ def _compute_u(state, steps, rank, compute_stream):
115
  compute_stream.wait_event(state.gather_event)
116
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
117
  state.computed_u = u
118
- state.compute_event = torch.cuda.Event()
119
- state.compute_event.record()
120
- else:
121
- state.computed_u = None
122
- state.compute_event = None
123
 
124
 
125
  @torch.no_grad()
@@ -129,12 +141,12 @@ def _scatter(p, state, rank, comm_stream):
129
  """
130
 
131
  with torch.cuda.stream(comm_stream):
 
 
 
 
132
  if rank == state.worker_rank:
133
  num_ranks = dist.get_world_size(group=state.process_group)
134
- if state.compute_event is None:
135
- raise RuntimeError("Compute event must be set before scatter.")
136
- comm_stream.wait_event(state.compute_event)
137
-
138
  # Clear the gathered gradient to free memory
139
  state.gathered_grad = None
140
 
@@ -144,22 +156,15 @@ def _scatter(p, state, rank, comm_stream):
144
  else:
145
  scatter_list = None
146
 
147
- u_received = torch.empty_like(p.to_local())
148
  torch.distributed.scatter(
149
- u_received,
150
  scatter_list=scatter_list,
151
  src=state.worker_rank,
152
  group=state.process_group,
153
  )
154
- u_dtensor = DTensor.from_local(
155
- u_received,
156
- placements=p.placements,
157
- device_mesh=p.device_mesh,
158
- )
159
-
160
- state.scattered_u = u_dtensor
161
  state.scatter_event = torch.cuda.Event()
162
  state.scatter_event.record()
 
163
 
164
 
165
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -172,11 +177,21 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
172
  if state.scatter_event is None:
173
  raise RuntimeError("Scatter event must be set before update")
174
  compute_stream.wait_event(state.scatter_event)
 
 
 
 
 
 
 
 
175
  if rank == state.worker_rank:
176
  # Free computed_u
177
  state.computed_u = None
178
 
179
  Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
 
 
180
 
181
 
182
  def default_is_muon(name, x):
@@ -375,7 +390,8 @@ class Muon(torch.optim.Optimizer):
375
  else:
376
  g = buf
377
 
378
- u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
 
379
 
380
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
381
  Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
@@ -433,7 +449,7 @@ class Muon(torch.optim.Optimizer):
433
  def enqueue_computes(start_idx, chunk_size):
434
  for p in ordered_params[start_idx:start_idx + chunk_size]:
435
  state = param_to_state[id(p)]
436
- _compute_u(state, group["ns_steps"], self.rank,
437
  self.compute_stream)
438
 
439
  def enqueue_scatters(start_idx, chunk_size):
@@ -466,6 +482,77 @@ class Muon(torch.optim.Optimizer):
466
  # Wait for the last update_param to finish
467
  torch.cuda.current_stream().wait_stream(self.compute_stream)
468
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  def step(self, closure=None):
470
  """Perform a single optimization step.
471
 
@@ -542,6 +629,12 @@ class Muon(torch.optim.Optimizer):
542
  # AdamW backup #
543
  ############################
544
 
 
 
 
 
 
 
545
  lr = group["lr"]
546
  beta1, beta2 = group["adamw_betas"]
547
  eps = group["adamw_eps"]
@@ -552,23 +645,38 @@ class Muon(torch.optim.Optimizer):
552
  if g is None:
553
  continue
554
  state = self.state[p]
 
 
555
  if "step" not in state:
556
- state["step"] = 0
 
 
557
  state["moment1"] = torch.zeros_like(g)
558
  state["moment2"] = torch.zeros_like(g)
559
- state["step"] += 1
560
- step = state["step"]
561
- buf1 = state["moment1"]
562
- buf2 = state["moment2"]
563
- buf1.lerp_(g, 1 - beta1)
564
- buf2.lerp_(g.square(), 1 - beta2)
565
-
566
- g = buf1 / (eps + buf2.sqrt())
567
-
568
- bias_correction1 = 1 - beta1**step
569
- bias_correction2 = 1 - beta2**step
570
- scale = bias_correction1 / bias_correction2**0.5
571
- p.data.mul_(1 - lr * weight_decay)
572
- p.data.add_(g, alpha=-lr / scale)
 
 
 
 
 
 
 
 
 
 
 
573
 
574
  return loss
 
2
  import math
3
  import types
4
  from dataclasses import dataclass
5
+ from typing import Optional, Union, cast
6
 
7
  import torch
8
  import torch.distributed as dist
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repositories:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
+ # Muon's Newton–Schulz iteration causes high variance in the singular values.
17
+ # Idea: give each iteration its own three coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
 
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
+ assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
+
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
37
  # Perform the NS iterations
38
+ for a, b, c in [
39
+ (4.0848, -6.8946, 2.9270),
40
+ (3.9505, -6.3029, 2.6377),
41
+ (3.7418, -5.5913, 2.3037),
42
+ (2.8769, -3.1427, 1.2046),
43
+ (2.8366, -3.0525, 1.2012),
44
+ ]:
45
  A = X @ X.T
46
  # B = (
47
  # b * A + c * A @ A
 
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
55
+ return X
56
 
57
 
58
  @dataclass
 
74
  Gather the gradients to worker_rank.
75
  If none_grad is True, free p.grad after the gather.
76
  """
 
 
 
 
 
 
 
 
 
 
77
  with torch.cuda.stream(comm_stream):
78
+ g = p.grad
79
+
80
+ if rank == state.worker_rank:
81
+ num_ranks = dist.get_world_size(group=state.process_group)
82
+ gather_list = [
83
+ torch.empty_like(g.to_local(), dtype=torch.bfloat16)
84
+ for _ in range(num_ranks)
85
+ ]
86
+ else:
87
+ gather_list = None
88
+
89
+ g = g.to(torch.bfloat16)
90
  torch.distributed.gather(
91
  g.to_local(),
92
  dst=state.worker_rank,
 
103
  else:
104
  state.gathered_grad = None
105
  state.gather_event = None
106
+ gather_list = None
107
  if none_grad:
108
  # We can safely free p.grad without calling record_stream:
109
  # p.grad.to_local().record_stream(comm_stream)
 
116
 
117
 
118
  @torch.no_grad()
119
+ def _compute_u(p, state, steps, rank, compute_stream):
120
  """
121
  On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
122
  """
 
127
  compute_stream.wait_event(state.gather_event)
128
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
129
  state.computed_u = u
130
+ state.scattered_u = torch.empty_like(p.to_local(),
131
+ dtype=torch.bfloat16)
132
+ state.compute_event = torch.cuda.Event()
133
+ state.compute_event.record()
134
+ u = None
135
 
136
 
137
  @torch.no_grad()
 
141
  """
142
 
143
  with torch.cuda.stream(comm_stream):
144
+ if state.compute_event is None:
145
+ raise RuntimeError("Compute event must be set before scatter.")
146
+ comm_stream.wait_event(state.compute_event)
147
+
148
  if rank == state.worker_rank:
149
  num_ranks = dist.get_world_size(group=state.process_group)
 
 
 
 
150
  # Clear the gathered gradient to free memory
151
  state.gathered_grad = None
152
 
 
156
  else:
157
  scatter_list = None
158
 
 
159
  torch.distributed.scatter(
160
+ state.scattered_u,
161
  scatter_list=scatter_list,
162
  src=state.worker_rank,
163
  group=state.process_group,
164
  )
 
 
 
 
 
 
 
165
  state.scatter_event = torch.cuda.Event()
166
  state.scatter_event.record()
167
+ scatter_list = None
168
 
169
 
170
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
177
  if state.scatter_event is None:
178
  raise RuntimeError("Scatter event must be set before update")
179
  compute_stream.wait_event(state.scatter_event)
180
+ u_dtensor = DTensor.from_local(
181
+ state.scattered_u,
182
+ placements=p.placements,
183
+ device_mesh=p.device_mesh,
184
+ )
185
+
186
+ state.scattered_u = u_dtensor
187
+
188
  if rank == state.worker_rank:
189
  # Free computed_u
190
  state.computed_u = None
191
 
192
  Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
193
+ state.scattered_u = None
194
+ u_dtensor = None
195
 
196
 
197
  def default_is_muon(name, x):
 
390
  else:
391
  g = buf
392
 
393
+ u = _zeropower_via_newtonschulz5(g.bfloat16(),
394
+ steps=group["ns_steps"])
395
 
396
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
397
  Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
 
449
  def enqueue_computes(start_idx, chunk_size):
450
  for p in ordered_params[start_idx:start_idx + chunk_size]:
451
  state = param_to_state[id(p)]
452
+ _compute_u(p, state, group["ns_steps"], self.rank,
453
  self.compute_stream)
454
 
455
  def enqueue_scatters(start_idx, chunk_size):
 
482
  # Wait for the last update_param to finish
483
  torch.cuda.current_stream().wait_stream(self.compute_stream)
484
 
485
+ @staticmethod
486
+ def _fused_adamw(
487
+ params: list[torch.Tensor],
488
+ grads: list[torch.Tensor],
489
+ exp_avgs: list[torch.Tensor],
490
+ exp_avg_sqs: list[torch.Tensor],
491
+ max_exp_avg_sqs: list[torch.Tensor],
492
+ state_steps: list[torch.Tensor],
493
+ amsgrad: bool,
494
+ beta1: float,
495
+ beta2: float,
496
+ lr: Union[float, torch.Tensor],
497
+ weight_decay: float,
498
+ eps: float,
499
+ maximize: bool,
500
+ ) -> None:
501
+ if not params:
502
+ return
503
+
504
+ # We only shuffle around the lr when it is a Tensor and on CUDA; otherwise, we prefer
505
+ # treating it as a scalar.
506
+ lr_dict: Optional[DeviceDict] = ({
507
+ lr.device: lr
508
+ } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
509
+ None)
510
+ grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
511
+ [
512
+ params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
513
+ state_steps
514
+ ] # type: ignore[list-item]
515
+ )
516
+ for (device, _), (
517
+ (
518
+ device_params_,
519
+ device_grads_,
520
+ device_exp_avgs_,
521
+ device_exp_avg_sqs_,
522
+ device_max_exp_avg_sqs,
523
+ device_state_steps_,
524
+ ),
525
+ _,
526
+ ) in grouped_tensors.items():
527
+ device_params = cast(list[torch.Tensor], device_params_)
528
+ device_grads = cast(list[torch.Tensor], device_grads_)
529
+ device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
530
+ device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
531
+ device_state_steps = cast(list[torch.Tensor], device_state_steps_)
532
+
533
+ if lr_dict is not None and device not in lr_dict:
534
+ lr_dict[device] = lr.to(
535
+ device=device,
536
+ non_blocking=True) # type: ignore[union-attr]
537
+ lr = lr_dict[device]
538
+ torch._foreach_add_(device_state_steps, 1)
539
+ func = torch._fused_adamw_
540
+ func(
541
+ device_params,
542
+ device_grads,
543
+ device_exp_avgs,
544
+ device_exp_avg_sqs,
545
+ device_max_exp_avg_sqs, # type: ignore[arg-type]
546
+ device_state_steps,
547
+ amsgrad=amsgrad,
548
+ lr=lr, # type: ignore[arg-type]
549
+ beta1=beta1,
550
+ beta2=beta2,
551
+ weight_decay=weight_decay,
552
+ eps=eps,
553
+ maximize=maximize,
554
+ )
555
+
556
  def step(self, closure=None):
557
  """Perform a single optimization step.
558
 
 
629
  # AdamW backup #
630
  ############################
631
 
632
+ params_with_grads = []
633
+ grads = []
634
+ moment1 = []
635
+ moment2 = []
636
+ max_exp_avg_sqs = []
637
+ state_steps = []
638
  lr = group["lr"]
639
  beta1, beta2 = group["adamw_betas"]
640
  eps = group["adamw_eps"]
 
645
  if g is None:
646
  continue
647
  state = self.state[p]
648
+ params_with_grads.append(p)
649
+ grads.append(g)
650
  if "step" not in state:
651
+ state["step"] = (torch.zeros((),
652
+ dtype=torch.float32,
653
+ device=p.device))
654
  state["moment1"] = torch.zeros_like(g)
655
  state["moment2"] = torch.zeros_like(g)
656
+ moment1.append(state["moment1"])
657
+ moment2.append(state["moment2"])
658
+ if not isinstance(state["step"], torch.Tensor):
659
+ step_tensor = torch.tensor(state["step"],
660
+ dtype=torch.float32,
661
+ device=p.device)
662
+ else:
663
+ step_tensor = state["step"]
664
+ state_steps.append(step_tensor)
665
+
666
+ self._fused_adamw(
667
+ params_with_grads,
668
+ grads,
669
+ moment1,
670
+ moment2,
671
+ max_exp_avg_sqs,
672
+ state_steps,
673
+ amsgrad=False,
674
+ beta1=beta1,
675
+ beta2=beta2,
676
+ lr=lr,
677
+ weight_decay=weight_decay,
678
+ eps=eps,
679
+ maximize=False,
680
+ )
681
 
682
  return loss
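
Across these hunks the event chain is rearranged: `_compute_u` records `compute_event` unconditionally and preallocates the bf16 receive buffer, `_scatter` now waits on `compute_event` on every rank (not just the worker), and `_update_param` waits on `scatter_event` before rewrapping and applying the update. The gating pattern, sketched with stand-ins for the collectives (requires a CUDA device; the gather/scatter stand-ins are placeholders, not the real communication):

```python
import torch

comm_stream = torch.cuda.Stream()
compute_stream = torch.cuda.Stream()

def pipeline_step(grad: torch.Tensor, orthogonalize, apply_update):
    with torch.cuda.stream(comm_stream):
        gathered = grad.to(torch.bfloat16)        # stand-in for dist.gather
        gather_event = torch.cuda.Event()
        gather_event.record()
    with torch.cuda.stream(compute_stream):
        compute_stream.wait_event(gather_event)   # compute gated on gather
        u = orthogonalize(gathered)
        compute_event = torch.cuda.Event()
        compute_event.record()
    with torch.cuda.stream(comm_stream):
        comm_stream.wait_event(compute_event)     # scatter gated on compute
        scattered = u.clone()                     # stand-in for dist.scatter
        scatter_event = torch.cuda.Event()
        scatter_event.record()
    with torch.cuda.stream(compute_stream):
        compute_stream.wait_event(scatter_event)  # update gated on scatter
        apply_update(scattered)
    # The step() loop finally joins back with:
    # torch.cuda.current_stream().wait_stream(compute_stream)
```

Each stage records an event on its own stream and the next stage waits on that event, so the two streams overlap across parameters instead of serializing on the whole device.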
build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_20250911094409
3
- ops = torch.ops._optimizer_20250911094409
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_20250911094409::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_ee6ed44_dirty
3
+ ops = torch.ops._optimizer_ee6ed44_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_ee6ed44_dirty::{op_name}"
build/torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:62ecfc7e6a1ab0c4ada19ed7aea40fc0a431c4ceb1729666efa98ac0e407f9c8
3
- size 1883360
 
 
 
 
build/torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_ee6ed44_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5e375def39d93758b60534cef504ae75d9c13e0d86da5dcf7642f1f90b77f52
3
+ size 1883352
build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py CHANGED
@@ -2,6 +2,7 @@ import logging
2
  import math
3
  import types
4
  from dataclasses import dataclass
 
5
 
6
  import torch
7
  import torch.distributed as dist
@@ -12,6 +13,8 @@ logger = logging.getLogger(__name__)
12
 
13
  # This code snippet is a modified version adapted from the following GitHub repositories:
14
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
15
  @torch.no_grad()
16
  def _zeropower_via_newtonschulz5(G, steps):
17
  """
@@ -24,15 +27,21 @@ def _zeropower_via_newtonschulz5(G, steps):
24
  performance at all relative to UV^T, where USV^T = G is the SVD.
25
  """
26
  assert len(G.shape) == 2
27
- a, b, c = (3.4445, -4.7750, 2.0315)
28
  X = G # no manual typecast
 
29
  if G.size(0) > G.size(1):
30
  X = X.T
31
  # Ensure spectral norm is at most 1
32
  X = X / (X.norm() + 1e-7)
33
- X = X.bfloat16()
34
  # Perform the NS iterations
35
- for _ in range(steps):
 
 
 
 
 
 
36
  A = X @ X.T
37
  # B = (
38
  # b * A + c * A @ A
@@ -43,7 +52,7 @@ def _zeropower_via_newtonschulz5(G, steps):
43
 
44
  if G.size(0) > G.size(1):
45
  X = X.T
46
- return X.to(G.dtype)
47
 
48
 
49
  @dataclass
@@ -65,17 +74,19 @@ def _gather(p, state, rank, comm_stream, none_grad):
65
  Gather the gradients to worker_rank.
66
  If none_grad is True, free p.grad after the gather.
67
  """
68
- g = p.grad
69
-
70
- if rank == state.worker_rank:
71
- num_ranks = dist.get_world_size(group=state.process_group)
72
- gather_list = [
73
- torch.empty_like(g.to_local()) for _ in range(num_ranks)
74
- ]
75
- else:
76
- gather_list = None
77
-
78
  with torch.cuda.stream(comm_stream):
 
 
 
 
 
 
 
 
 
 
 
 
79
  torch.distributed.gather(
80
  g.to_local(),
81
  dst=state.worker_rank,
@@ -92,6 +103,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
92
  else:
93
  state.gathered_grad = None
94
  state.gather_event = None
 
95
  if none_grad:
96
  # We can safely free p.grad without calling record_stream:
97
  # p.grad.to_local().record_stream(comm_stream)
@@ -104,7 +116,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
104
 
105
 
106
  @torch.no_grad()
107
- def _compute_u(state, steps, rank, compute_stream):
108
  """
109
  On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
110
  """
@@ -115,11 +127,11 @@ def _compute_u(state, steps, rank, compute_stream):
115
  compute_stream.wait_event(state.gather_event)
116
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
117
  state.computed_u = u
118
- state.compute_event = torch.cuda.Event()
119
- state.compute_event.record()
120
- else:
121
- state.computed_u = None
122
- state.compute_event = None
123
 
124
 
125
  @torch.no_grad()
@@ -129,12 +141,12 @@ def _scatter(p, state, rank, comm_stream):
129
  """
130
 
131
  with torch.cuda.stream(comm_stream):
 
 
 
 
132
  if rank == state.worker_rank:
133
  num_ranks = dist.get_world_size(group=state.process_group)
134
- if state.compute_event is None:
135
- raise RuntimeError("Compute event must be set before scatter.")
136
- comm_stream.wait_event(state.compute_event)
137
-
138
  # Clear the gathered gradient to free memory
139
  state.gathered_grad = None
140
 
@@ -144,22 +156,15 @@ def _scatter(p, state, rank, comm_stream):
144
  else:
145
  scatter_list = None
146
 
147
- u_received = torch.empty_like(p.to_local())
148
  torch.distributed.scatter(
149
- u_received,
150
  scatter_list=scatter_list,
151
  src=state.worker_rank,
152
  group=state.process_group,
153
  )
154
- u_dtensor = DTensor.from_local(
155
- u_received,
156
- placements=p.placements,
157
- device_mesh=p.device_mesh,
158
- )
159
-
160
- state.scattered_u = u_dtensor
161
  state.scatter_event = torch.cuda.Event()
162
  state.scatter_event.record()
 
163
 
164
 
165
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -172,11 +177,21 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
172
  if state.scatter_event is None:
173
  raise RuntimeError("Scatter event must be set before update")
174
  compute_stream.wait_event(state.scatter_event)
 
 
 
 
 
 
 
 
175
  if rank == state.worker_rank:
176
  # Free computed_u
177
  state.computed_u = None
178
 
179
  Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
 
 
180
 
181
 
182
  def default_is_muon(name, x):
@@ -375,7 +390,8 @@ class Muon(torch.optim.Optimizer):
375
  else:
376
  g = buf
377
 
378
- u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
 
379
 
380
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
381
  Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
@@ -433,7 +449,7 @@ class Muon(torch.optim.Optimizer):
433
  def enqueue_computes(start_idx, chunk_size):
434
  for p in ordered_params[start_idx:start_idx + chunk_size]:
435
  state = param_to_state[id(p)]
436
- _compute_u(state, group["ns_steps"], self.rank,
437
  self.compute_stream)
438
 
439
  def enqueue_scatters(start_idx, chunk_size):
@@ -466,6 +482,77 @@ class Muon(torch.optim.Optimizer):
466
  # Wait for the last update_param to finish
467
  torch.cuda.current_stream().wait_stream(self.compute_stream)
468
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  def step(self, closure=None):
470
  """Perform a single optimization step.
471
 
@@ -542,6 +629,12 @@ class Muon(torch.optim.Optimizer):
542
  # AdamW backup #
543
  ############################
544
 
 
 
 
 
 
 
545
  lr = group["lr"]
546
  beta1, beta2 = group["adamw_betas"]
547
  eps = group["adamw_eps"]
@@ -552,23 +645,38 @@ class Muon(torch.optim.Optimizer):
552
  if g is None:
553
  continue
554
  state = self.state[p]
 
 
555
  if "step" not in state:
556
- state["step"] = 0
 
 
557
  state["moment1"] = torch.zeros_like(g)
558
  state["moment2"] = torch.zeros_like(g)
559
- state["step"] += 1
560
- step = state["step"]
561
- buf1 = state["moment1"]
562
- buf2 = state["moment2"]
563
- buf1.lerp_(g, 1 - beta1)
564
- buf2.lerp_(g.square(), 1 - beta2)
565
-
566
- g = buf1 / (eps + buf2.sqrt())
567
-
568
- bias_correction1 = 1 - beta1**step
569
- bias_correction2 = 1 - beta2**step
570
- scale = bias_correction1 / bias_correction2**0.5
571
- p.data.mul_(1 - lr * weight_decay)
572
- p.data.add_(g, alpha=-lr / scale)
 
 
 
 
 
 
 
 
 
 
 
573
 
574
  return loss
 
2
  import math
3
  import types
4
  from dataclasses import dataclass
5
+ from typing import Optional, Union, cast
6
 
7
  import torch
8
  import torch.distributed as dist
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repositories:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
+ # Muon's Newton–Schulz iteration causes high variance in the singular values.
17
+ # Idea: give each iteration its own three coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
 
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
+ assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
+
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
37
  # Perform the NS iterations
38
+ for a, b, c in [
39
+ (4.0848, -6.8946, 2.9270),
40
+ (3.9505, -6.3029, 2.6377),
41
+ (3.7418, -5.5913, 2.3037),
42
+ (2.8769, -3.1427, 1.2046),
43
+ (2.8366, -3.0525, 1.2012),
44
+ ]:
45
  A = X @ X.T
46
  # B = (
47
  # b * A + c * A @ A
 
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
55
+ return X
56
 
57
 
58
  @dataclass
 
74
  Gather the gradients to worker_rank.
75
  If none_grad is True, free p.grad after the gather.
76
  """
 
 
 
 
 
 
 
 
 
 
77
  with torch.cuda.stream(comm_stream):
78
+ g = p.grad
79
+
80
+ if rank == state.worker_rank:
81
+ num_ranks = dist.get_world_size(group=state.process_group)
82
+ gather_list = [
83
+ torch.empty_like(g.to_local(), dtype=torch.bfloat16)
84
+ for _ in range(num_ranks)
85
+ ]
86
+ else:
87
+ gather_list = None
88
+
89
+ g = g.to(torch.bfloat16)
90
  torch.distributed.gather(
91
  g.to_local(),
92
  dst=state.worker_rank,
 
103
  else:
104
  state.gathered_grad = None
105
  state.gather_event = None
106
+ gather_list = None
107
  if none_grad:
108
  # We can safely free p.grad without calling record_stream:
109
  # p.grad.to_local().record_stream(comm_stream)
 
116
 
117
 
118
  @torch.no_grad()
119
+ def _compute_u(p, state, steps, rank, compute_stream):
120
  """
121
  On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
122
  """
 
127
  compute_stream.wait_event(state.gather_event)
128
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
129
  state.computed_u = u
130
+ state.scattered_u = torch.empty_like(p.to_local(),
131
+ dtype=torch.bfloat16)
132
+ state.compute_event = torch.cuda.Event()
133
+ state.compute_event.record()
134
+ u = None
135
 
136
 
137
  @torch.no_grad()
 
141
  """
142
 
143
  with torch.cuda.stream(comm_stream):
144
+ if state.compute_event is None:
145
+ raise RuntimeError("Compute event must be set before scatter.")
146
+ comm_stream.wait_event(state.compute_event)
147
+
148
  if rank == state.worker_rank:
149
  num_ranks = dist.get_world_size(group=state.process_group)
 
 
 
 
150
  # Clear the gathered gradient to free memory
151
  state.gathered_grad = None
152
 
 
156
  else:
157
  scatter_list = None
158
 
 
159
  torch.distributed.scatter(
160
+ state.scattered_u,
161
  scatter_list=scatter_list,
162
  src=state.worker_rank,
163
  group=state.process_group,
164
  )
 
 
 
 
 
 
 
165
  state.scatter_event = torch.cuda.Event()
166
  state.scatter_event.record()
167
+ scatter_list = None
168
 
169
 
170
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
177
  if state.scatter_event is None:
178
  raise RuntimeError("Scatter event must be set before update")
179
  compute_stream.wait_event(state.scatter_event)
180
+ u_dtensor = DTensor.from_local(
181
+ state.scattered_u,
182
+ placements=p.placements,
183
+ device_mesh=p.device_mesh,
184
+ )
185
+
186
+ state.scattered_u = u_dtensor
187
+
188
  if rank == state.worker_rank:
189
  # Free computed_u
190
  state.computed_u = None
191
 
192
  Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
193
+ state.scattered_u = None
194
+ u_dtensor = None
195
 
196
 
197
  def default_is_muon(name, x):
 
390
  else:
391
  g = buf
392
 
393
+ u = _zeropower_via_newtonschulz5(g.bfloat16(),
394
+ steps=group["ns_steps"])
395
 
396
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
397
  Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
 
449
  def enqueue_computes(start_idx, chunk_size):
450
  for p in ordered_params[start_idx:start_idx + chunk_size]:
451
  state = param_to_state[id(p)]
452
+ _compute_u(p, state, group["ns_steps"], self.rank,
453
  self.compute_stream)
454
 
455
  def enqueue_scatters(start_idx, chunk_size):
 
482
  # Wait the last update_param to finish
483
  torch.cuda.current_stream().wait_stream(self.compute_stream)
484
 
485
+ @staticmethod
486
+ def _fused_adamw(
487
+ params: list[torch.Tensor],
488
+ grads: list[torch.Tensor],
489
+ exp_avgs: list[torch.Tensor],
490
+ exp_avg_sqs: list[torch.Tensor],
491
+ max_exp_avg_sqs: list[torch.Tensor],
492
+ state_steps: list[torch.Tensor],
493
+ amsgrad: bool,
494
+ beta1: float,
495
+ beta2: float,
496
+ lr: Union[float, torch.Tensor],
497
+ weight_decay: float,
498
+ eps: float,
499
+ maximize: bool,
500
+ ) -> None:
501
+ if not params:
502
+ return
503
+
504
+ # We only shuffle around the lr when it is a Tensor and on CUDA; otherwise, we prefer
505
+ # treating it as a scalar.
506
+ lr_dict: Optional[DeviceDict] = ({
507
+ lr.device: lr
508
+ } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
509
+ None)
510
+ grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
511
+ [
512
+ params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
513
+ state_steps
514
+ ] # type: ignore[list-item]
515
+ )
516
+ for (device, _), (
517
+ (
518
+ device_params_,
519
+ device_grads_,
520
+ device_exp_avgs_,
521
+ device_exp_avg_sqs_,
522
+ device_max_exp_avg_sqs,
523
+ device_state_steps_,
524
+ ),
525
+ _,
526
+ ) in grouped_tensors.items():
527
+ device_params = cast(list[torch.Tensor], device_params_)
528
+ device_grads = cast(list[torch.Tensor], device_grads_)
529
+ device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
530
+ device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
531
+ device_state_steps = cast(list[torch.Tensor], device_state_steps_)
532
+
533
+ if lr_dict is not None and device not in lr_dict:
534
+ lr_dict[device] = lr.to(
535
+ device=device,
536
+ non_blocking=True) # type: ignore[union-attr]
537
+ lr = lr_dict[device]
538
+ torch._foreach_add_(device_state_steps, 1)
539
+ func = torch._fused_adamw_
540
+ func(
541
+ device_params,
542
+ device_grads,
543
+ device_exp_avgs,
544
+ device_exp_avg_sqs,
545
+ device_max_exp_avg_sqs, # type: ignore[arg-type]
546
+ device_state_steps,
547
+ amsgrad=amsgrad,
548
+ lr=lr, # type: ignore[arg-type]
549
+ beta1=beta1,
550
+ beta2=beta2,
551
+ weight_decay=weight_decay,
552
+ eps=eps,
553
+ maximize=maximize,
554
+ )
555
+
556
  def step(self, closure=None):
557
  """Perform a single optimization step.
558
 
 
629
  # AdamW backup #
630
  ############################
631
 
632
+ params_with_grads = []
633
+ grads = []
634
+ moment1 = []
635
+ moment2 = []
636
+ max_exp_avg_sqs = []
637
+ state_steps = []
638
  lr = group["lr"]
639
  beta1, beta2 = group["adamw_betas"]
640
  eps = group["adamw_eps"]
 
645
  if g is None:
646
  continue
647
  state = self.state[p]
648
+ params_with_grads.append(p)
649
+ grads.append(g)
650
  if "step" not in state:
651
+ state["step"] = (torch.zeros((),
652
+ dtype=torch.float32,
653
+ device=p.device))
654
  state["moment1"] = torch.zeros_like(g)
655
  state["moment2"] = torch.zeros_like(g)
656
+ moment1.append(state["moment1"])
657
+ moment2.append(state["moment2"])
658
+ if not isinstance(state["step"], torch.Tensor):
659
+ step_tensor = torch.tensor(state["step"],
660
+ dtype=torch.float32,
661
+ device=p.device)
662
+ else:
663
+ step_tensor = state["step"]
664
+ state_steps.append(step_tensor)
665
+
666
+ self._fused_adamw(
667
+ params_with_grads,
668
+ grads,
669
+ moment1,
670
+ moment2,
671
+ max_exp_avg_sqs,
672
+ state_steps,
673
+ amsgrad=False,
674
+ beta1=beta1,
675
+ beta2=beta2,
676
+ lr=lr,
677
+ weight_decay=weight_decay,
678
+ eps=eps,
679
+ maximize=False,
680
+ )
681
 
682
  return loss
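
The AdamW backup path is rewritten from per-parameter Python math (`lerp_`, `sqrt`, explicit bias correction) to a single `torch._fused_adamw_` kernel launch over lists of tensors, with `state["step"]` migrated from an int to a float32 scalar tensor on the parameter's device so bias correction happens in-kernel without a host sync. (`DeviceDict` in the hunk presumably comes from `torch.optim.optimizer`; its import is outside the visible context.) A minimal sketch assuming every tensor lives on one CUDA device with one dtype; the real `_fused_adamw` additionally groups tensors by device/dtype and moves a tensor `lr` between devices:

```python
import torch

def fused_adamw_step(params, grads, exp_avgs, exp_avg_sqs, state_steps,
                     lr=1e-3, beta1=0.9, beta2=0.999,
                     weight_decay=0.01, eps=1e-8):
    # One kernel launch for the whole list; steps are float32 scalar
    # tensors on the same CUDA device as the params.
    torch._foreach_add_(state_steps, 1)
    torch._fused_adamw_(
        params, grads, exp_avgs, exp_avg_sqs,
        [],            # max_exp_avg_sqs: unused with amsgrad=False
        state_steps,
        amsgrad=False, lr=lr, beta1=beta1, beta2=beta2,
        weight_decay=weight_decay, eps=eps, maximize=False,
    )

# Usage sketch (CUDA required):
# p  = torch.randn(64, 64, device="cuda")
# g  = torch.randn_like(p)
# m1, m2 = torch.zeros_like(p), torch.zeros_like(p)
# step = torch.zeros((), dtype=torch.float32, device="cuda")
# fused_adamw_step([p], [g], [m1], [m2], [step])
```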
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_20250911094409
3
- ops = torch.ops._optimizer_20250911094409
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_20250911094409::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_ee6ed44_dirty
3
+ ops = torch.ops._optimizer_ee6ed44_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_ee6ed44_dirty::{op_name}"
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:37e389c650fc1fcbc9fbd68f1e7c1a768b08e90509fd8a5d87879655726f2db2
3
- size 1750040
 
 
 
 
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_ee6ed44_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33e0d50fbf340612b0e1129717e4116197c8562592e5920f2dedc718ce9a0585
3
+ size 1750000
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED
@@ -2,6 +2,7 @@ import logging
2
  import math
3
  import types
4
  from dataclasses import dataclass
 
5
 
6
  import torch
7
  import torch.distributed as dist
@@ -12,6 +13,8 @@ logger = logging.getLogger(__name__)
12
 
13
  # This code snippet is a modified version adapted from the following GitHub repository:
14
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
15
  @torch.no_grad()
16
  def _zeropower_via_newtonschulz5(G, steps):
17
  """
@@ -24,15 +27,21 @@ def _zeropower_via_newtonschulz5(G, steps):
24
  performance at all relative to UV^T, where USV^T = G is the SVD.
25
  """
26
  assert len(G.shape) == 2
27
- a, b, c = (3.4445, -4.7750, 2.0315)
28
  X = G # no manual typecast
 
29
  if G.size(0) > G.size(1):
30
  X = X.T
31
  # Ensure spectral norm is at most 1
32
  X = X / (X.norm() + 1e-7)
33
- X = X.bfloat16()
34
  # Perform the NS iterations
35
- for _ in range(steps):
36
  A = X @ X.T
37
  # B = (
38
  # b * A + c * A @ A
@@ -43,7 +52,7 @@ def _zeropower_via_newtonschulz5(G, steps):
43
 
44
  if G.size(0) > G.size(1):
45
  X = X.T
46
- return X.to(G.dtype)
47
 
48
 
49
  @dataclass
@@ -65,17 +74,19 @@ def _gather(p, state, rank, comm_stream, none_grad):
65
  Gather the gradients to worker_rank.
66
  If none_grad is True, free p.grad after the gather.
67
  """
68
- g = p.grad
69
-
70
- if rank == state.worker_rank:
71
- num_ranks = dist.get_world_size(group=state.process_group)
72
- gather_list = [
73
- torch.empty_like(g.to_local()) for _ in range(num_ranks)
74
- ]
75
- else:
76
- gather_list = None
77
-
78
  with torch.cuda.stream(comm_stream):
79
  torch.distributed.gather(
80
  g.to_local(),
81
  dst=state.worker_rank,
@@ -92,6 +103,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
92
  else:
93
  state.gathered_grad = None
94
  state.gather_event = None
 
95
  if none_grad:
96
  # We can safely free p.grad without calling record_stream:
97
  # p.grad.to_local().record_stream(comm_stream)
@@ -104,7 +116,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
104
 
105
 
106
  @torch.no_grad()
107
- def _compute_u(state, steps, rank, compute_stream):
108
  """
109
  On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
110
  """
@@ -115,11 +127,11 @@ def _compute_u(state, steps, rank, compute_stream):
115
  compute_stream.wait_event(state.gather_event)
116
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
117
  state.computed_u = u
118
- state.compute_event = torch.cuda.Event()
119
- state.compute_event.record()
120
- else:
121
- state.computed_u = None
122
- state.compute_event = None
123
 
124
 
125
  @torch.no_grad()
@@ -129,12 +141,12 @@ def _scatter(p, state, rank, comm_stream):
129
  """
130
 
131
  with torch.cuda.stream(comm_stream):
 
 
 
 
132
  if rank == state.worker_rank:
133
  num_ranks = dist.get_world_size(group=state.process_group)
134
- if state.compute_event is None:
135
- raise RuntimeError("Compute event must be set before scatter.")
136
- comm_stream.wait_event(state.compute_event)
137
-
138
  # Clear the gathered gradient to free memory
139
  state.gathered_grad = None
140
 
@@ -144,22 +156,15 @@ def _scatter(p, state, rank, comm_stream):
144
  else:
145
  scatter_list = None
146
 
147
- u_received = torch.empty_like(p.to_local())
148
  torch.distributed.scatter(
149
- u_received,
150
  scatter_list=scatter_list,
151
  src=state.worker_rank,
152
  group=state.process_group,
153
  )
154
- u_dtensor = DTensor.from_local(
155
- u_received,
156
- placements=p.placements,
157
- device_mesh=p.device_mesh,
158
- )
159
-
160
- state.scattered_u = u_dtensor
161
  state.scatter_event = torch.cuda.Event()
162
  state.scatter_event.record()
 
163
 
164
 
165
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -172,11 +177,21 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
172
  if state.scatter_event is None:
173
  raise RuntimeError("Scatter event must be set before update")
174
  compute_stream.wait_event(state.scatter_event)
175
  if rank == state.worker_rank:
176
  # Free computed_u
177
  state.computed_u = None
178
 
179
  Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
 
 
180
 
181
 
182
  def default_is_muon(name, x):
@@ -375,7 +390,8 @@ class Muon(torch.optim.Optimizer):
375
  else:
376
  g = buf
377
 
378
- u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
 
379
 
380
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
381
  Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
@@ -433,7 +449,7 @@ class Muon(torch.optim.Optimizer):
433
  def enqueue_computes(start_idx, chunk_size):
434
  for p in ordered_params[start_idx:start_idx + chunk_size]:
435
  state = param_to_state[id(p)]
436
- _compute_u(state, group["ns_steps"], self.rank,
437
  self.compute_stream)
438
 
439
  def enqueue_scatters(start_idx, chunk_size):
@@ -466,6 +482,77 @@ class Muon(torch.optim.Optimizer):
466
  # Wait for the last update_param to finish
467
  torch.cuda.current_stream().wait_stream(self.compute_stream)
468

469
  def step(self, closure=None):
470
  """Perform a single optimization step.
471
 
@@ -542,6 +629,12 @@ class Muon(torch.optim.Optimizer):
542
  # AdamW backup #
543
  ############################
544
545
  lr = group["lr"]
546
  beta1, beta2 = group["adamw_betas"]
547
  eps = group["adamw_eps"]
@@ -552,23 +645,38 @@ class Muon(torch.optim.Optimizer):
552
  if g is None:
553
  continue
554
  state = self.state[p]
 
 
555
  if "step" not in state:
556
- state["step"] = 0
 
 
557
  state["moment1"] = torch.zeros_like(g)
558
  state["moment2"] = torch.zeros_like(g)
559
- state["step"] += 1
560
- step = state["step"]
561
- buf1 = state["moment1"]
562
- buf2 = state["moment2"]
563
- buf1.lerp_(g, 1 - beta1)
564
- buf2.lerp_(g.square(), 1 - beta2)
565
-
566
- g = buf1 / (eps + buf2.sqrt())
567
-
568
- bias_correction1 = 1 - beta1**step
569
- bias_correction2 = 1 - beta2**step
570
- scale = bias_correction1 / bias_correction2**0.5
571
- p.data.mul_(1 - lr * weight_decay)
572
- p.data.add_(g, alpha=-lr / scale)
573
 
574
  return loss
 
2
  import math
3
  import types
4
  from dataclasses import dataclass
5
+ from typing import Optional, Union, cast
6
 
7
  import torch
8
  import torch.distributed as dist
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repository:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
+ # Muon's Newton–Schulz iteration causes high variance in the singular values.
17
+ # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
 
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
+ assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
+
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
37
  # Perform the NS iterations
38
+ for a, b, c in [
39
+ (4.0848, -6.8946, 2.9270),
40
+ (3.9505, -6.3029, 2.6377),
41
+ (3.7418, -5.5913, 2.3037),
42
+ (2.8769, -3.1427, 1.2046),
43
+ (2.8366, -3.0525, 1.2012),
44
+ ]:
45
  A = X @ X.T
46
  # B = (
47
  # b * A + c * A @ A
 
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
55
+ return X
56
 
57
 
58
  @dataclass
 
74
  Gather the gradients to worker_rank.
75
  If none_grad is True, free p.grad after the gather.
76
  """
 
 
 
 
 
 
 
 
 
 
77
  with torch.cuda.stream(comm_stream):
78
+ g = p.grad
79
+
80
+ if rank == state.worker_rank:
81
+ num_ranks = dist.get_world_size(group=state.process_group)
82
+ gather_list = [
83
+ torch.empty_like(g.to_local(), dtype=torch.bfloat16)
84
+ for _ in range(num_ranks)
85
+ ]
86
+ else:
87
+ gather_list = None
88
+
89
+ g = g.to(torch.bfloat16)
90
  torch.distributed.gather(
91
  g.to_local(),
92
  dst=state.worker_rank,
 
103
  else:
104
  state.gathered_grad = None
105
  state.gather_event = None
106
+ gather_list = None
107
  if none_grad:
108
  # We can safely free p.grad without calling record_stream:
109
  # p.grad.to_local().record_stream(comm_stream)
 
116
 
117
 
118
  @torch.no_grad()
119
+ def _compute_u(p, state, steps, rank, compute_stream):
120
  """
121
  On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
122
  """
 
127
  compute_stream.wait_event(state.gather_event)
128
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
129
  state.computed_u = u
130
+ state.scattered_u = torch.empty_like(p.to_local(),
131
+ dtype=torch.bfloat16)
132
+ state.compute_event = torch.cuda.Event()
133
+ state.compute_event.record()
134
+ u = None
135
 
136
 
137
  @torch.no_grad()
 
141
  """
142
 
143
  with torch.cuda.stream(comm_stream):
144
+ if state.compute_event is None:
145
+ raise RuntimeError("Compute event must be set before scatter.")
146
+ comm_stream.wait_event(state.compute_event)
147
+
148
  if rank == state.worker_rank:
149
  num_ranks = dist.get_world_size(group=state.process_group)
 
 
 
 
150
  # Clear the gathered gradient to free memory
151
  state.gathered_grad = None
152
 
 
156
  else:
157
  scatter_list = None
158
 
 
159
  torch.distributed.scatter(
160
+ state.scattered_u,
161
  scatter_list=scatter_list,
162
  src=state.worker_rank,
163
  group=state.process_group,
164
  )
165
  state.scatter_event = torch.cuda.Event()
166
  state.scatter_event.record()
167
+ scatter_list = None
168
 
169
 
170
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
177
  if state.scatter_event is None:
178
  raise RuntimeError("Scatter event must be set before update")
179
  compute_stream.wait_event(state.scatter_event)
180
+ u_dtensor = DTensor.from_local(
181
+ state.scattered_u,
182
+ placements=p.placements,
183
+ device_mesh=p.device_mesh,
184
+ )
185
+
186
+ state.scattered_u = u_dtensor
187
+
188
  if rank == state.worker_rank:
189
  # Free computed_u
190
  state.computed_u = None
191
 
192
  Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
193
+ state.scattered_u = None
194
+ u_dtensor = None
195
 
196
 
197
  def default_is_muon(name, x):
 
390
  else:
391
  g = buf
392
 
393
+ u = _zeropower_via_newtonschulz5(g.bfloat16(),
394
+ steps=group["ns_steps"])
395
 
396
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
397
  Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
 
449
  def enqueue_computes(start_idx, chunk_size):
450
  for p in ordered_params[start_idx:start_idx + chunk_size]:
451
  state = param_to_state[id(p)]
452
+ _compute_u(p, state, group["ns_steps"], self.rank,
453
  self.compute_stream)
454
 
455
  def enqueue_scatters(start_idx, chunk_size):
 
482
  # Wait for the last update_param to finish
483
  torch.cuda.current_stream().wait_stream(self.compute_stream)
484
 
485
+ @staticmethod
486
+ def _fused_adamw(
487
+ params: list[torch.Tensor],
488
+ grads: list[torch.Tensor],
489
+ exp_avgs: list[torch.Tensor],
490
+ exp_avg_sqs: list[torch.Tensor],
491
+ max_exp_avg_sqs: list[torch.Tensor],
492
+ state_steps: list[torch.Tensor],
493
+ amsgrad: bool,
494
+ beta1: float,
495
+ beta2: float,
496
+ lr: Union[float, torch.Tensor],
497
+ weight_decay: float,
498
+ eps: float,
499
+ maximize: bool,
500
+ ) -> None:
501
+ if not params:
502
+ return
503
+
504
+ # We only shuffle the lr around when it is a Tensor and not on the CPU; otherwise we prefer
505
+ # treating it as a scalar.
506
+ lr_dict: Optional[DeviceDict] = ({
507
+ lr.device: lr
508
+ } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
509
+ None)
510
+ grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
511
+ [
512
+ params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
513
+ state_steps
514
+ ] # type: ignore[list-item]
515
+ )
516
+ for (device, _), (
517
+ (
518
+ device_params_,
519
+ device_grads_,
520
+ device_exp_avgs_,
521
+ device_exp_avg_sqs_,
522
+ device_max_exp_avg_sqs,
523
+ device_state_steps_,
524
+ ),
525
+ _,
526
+ ) in grouped_tensors.items():
527
+ device_params = cast(list[torch.Tensor], device_params_)
528
+ device_grads = cast(list[torch.Tensor], device_grads_)
529
+ device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
530
+ device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
531
+ device_state_steps = cast(list[torch.Tensor], device_state_steps_)
532
+
533
+ if lr_dict is not None and device not in lr_dict:
534
+ lr_dict[device] = lr.to(
535
+ device=device,
536
+ non_blocking=True) # type: ignore[union-attr]
537
+ lr = lr_dict[device]
538
+ torch._foreach_add_(device_state_steps, 1)
539
+ func = torch._fused_adamw_
540
+ func(
541
+ device_params,
542
+ device_grads,
543
+ device_exp_avgs,
544
+ device_exp_avg_sqs,
545
+ device_max_exp_avg_sqs, # type: ignore[arg-type]
546
+ device_state_steps,
547
+ amsgrad=amsgrad,
548
+ lr=lr, # type: ignore[arg-type]
549
+ beta1=beta1,
550
+ beta2=beta2,
551
+ weight_decay=weight_decay,
552
+ eps=eps,
553
+ maximize=maximize,
554
+ )
555
+
556
  def step(self, closure=None):
557
  """Perform a single optimization step.
558
 
 
629
  # AdamW backup #
630
  ############################
631
 
632
+ params_with_grads = []
633
+ grads = []
634
+ moment1 = []
635
+ moment2 = []
636
+ max_exp_avg_sqs = []
637
+ state_steps = []
638
  lr = group["lr"]
639
  beta1, beta2 = group["adamw_betas"]
640
  eps = group["adamw_eps"]
 
645
  if g is None:
646
  continue
647
  state = self.state[p]
648
+ params_with_grads.append(p)
649
+ grads.append(g)
650
  if "step" not in state:
651
+ state["step"] = (torch.zeros((),
652
+ dtype=torch.float32,
653
+ device=p.device))
654
  state["moment1"] = torch.zeros_like(g)
655
  state["moment2"] = torch.zeros_like(g)
656
+ moment1.append(state["moment1"])
657
+ moment2.append(state["moment2"])
658
+ if not isinstance(state["step"], torch.Tensor):
659
+ step_tensor = torch.tensor(state["step"],
660
+ dtype=torch.float32,
661
+ device=p.device)
662
+ else:
663
+ step_tensor = state["step"]
664
+ state_steps.append(step_tensor)
665
+
666
+ self._fused_adamw(
667
+ params_with_grads,
668
+ grads,
669
+ moment1,
670
+ moment2,
671
+ max_exp_avg_sqs,
672
+ state_steps,
673
+ amsgrad=False,
674
+ beta1=beta1,
675
+ beta2=beta2,
676
+ lr=lr,
677
+ weight_decay=weight_decay,
678
+ eps=eps,
679
+ maximize=False,
680
+ )
681
 
682
  return loss
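
The functional core of this file's change is easiest to read in isolation: the fixed (a, b, c) = (3.4445, -4.7750, 2.0315) repeated for ns_steps rounds becomes five per-iteration tuned triples, and the routine now expects and returns bfloat16 rather than casting internally. A self-contained sketch of the tuned iteration (illustrative, not the shipped kernel):

import torch

TUNED_ABC = [
    (4.0848, -6.8946, 2.9270),
    (3.9505, -6.3029, 2.6377),
    (3.7418, -5.5913, 2.3037),
    (2.8769, -3.1427, 1.2046),
    (2.8366, -3.0525, 1.2012),
]

@torch.no_grad()
def orthogonalize(G: torch.Tensor) -> torch.Tensor:
    X = G.bfloat16()
    transposed = G.size(0) > G.size(1)
    if transposed:
        X = X.T
    X = X / (X.norm() + 1e-7)  # spectral norm <= 1 so the iteration converges
    for a, b, c in TUNED_ABC:
        A = X @ X.T
        X = a * X + (b * A + c * A @ A) @ X  # quintic polynomial update
    return X.T if transposed else X

U = orthogonalize(torch.randn(64, 128)).float()
print((U @ U.T - torch.eye(64)).norm())  # small: rows are near-orthonormal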
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_20250911094409
3
- ops = torch.ops._optimizer_20250911094409
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_20250911094409::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_ee6ed44_dirty
3
+ ops = torch.ops._optimizer_ee6ed44_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_ee6ed44_dirty::{op_name}"
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e62682b711f002505bb17c170b2bb233f8d389510ff8e2e0a753ee96d11d0746
3
- size 1750128
 
 
 
 
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_ee6ed44_dirty.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eedf56e661a7d314727e40f192236dbd9696f62ba21f11e366643f2662c03a4
3
+ size 1750088
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py CHANGED
@@ -2,6 +2,7 @@ import logging
2
  import math
3
  import types
4
  from dataclasses import dataclass
 
5
 
6
  import torch
7
  import torch.distributed as dist
@@ -12,6 +13,8 @@ logger = logging.getLogger(__name__)
12
 
13
  # This code snippet is a modified version adapted from the following GitHub repository:
14
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
15
  @torch.no_grad()
16
  def _zeropower_via_newtonschulz5(G, steps):
17
  """
@@ -24,15 +27,21 @@ def _zeropower_via_newtonschulz5(G, steps):
24
  performance at all relative to UV^T, where USV^T = G is the SVD.
25
  """
26
  assert len(G.shape) == 2
27
- a, b, c = (3.4445, -4.7750, 2.0315)
28
  X = G # no manual typecast
 
29
  if G.size(0) > G.size(1):
30
  X = X.T
31
  # Ensure spectral norm is at most 1
32
  X = X / (X.norm() + 1e-7)
33
- X = X.bfloat16()
34
  # Perform the NS iterations
35
- for _ in range(steps):
36
  A = X @ X.T
37
  # B = (
38
  # b * A + c * A @ A
@@ -43,7 +52,7 @@ def _zeropower_via_newtonschulz5(G, steps):
43
 
44
  if G.size(0) > G.size(1):
45
  X = X.T
46
- return X.to(G.dtype)
47
 
48
 
49
  @dataclass
@@ -65,17 +74,19 @@ def _gather(p, state, rank, comm_stream, none_grad):
65
  Gather the gradients to worker_rank.
66
  If none_grad is True, free p.grad after the gather.
67
  """
68
- g = p.grad
69
-
70
- if rank == state.worker_rank:
71
- num_ranks = dist.get_world_size(group=state.process_group)
72
- gather_list = [
73
- torch.empty_like(g.to_local()) for _ in range(num_ranks)
74
- ]
75
- else:
76
- gather_list = None
77
-
78
  with torch.cuda.stream(comm_stream):
79
  torch.distributed.gather(
80
  g.to_local(),
81
  dst=state.worker_rank,
@@ -92,6 +103,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
92
  else:
93
  state.gathered_grad = None
94
  state.gather_event = None
 
95
  if none_grad:
96
  # We can safely free p.grad without calling record_stream:
97
  # p.grad.to_local().record_stream(comm_stream)
@@ -104,7 +116,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
104
 
105
 
106
  @torch.no_grad()
107
- def _compute_u(state, steps, rank, compute_stream):
108
  """
109
  On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
110
  """
@@ -115,11 +127,11 @@ def _compute_u(state, steps, rank, compute_stream):
115
  compute_stream.wait_event(state.gather_event)
116
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
117
  state.computed_u = u
118
- state.compute_event = torch.cuda.Event()
119
- state.compute_event.record()
120
- else:
121
- state.computed_u = None
122
- state.compute_event = None
123
 
124
 
125
  @torch.no_grad()
@@ -129,12 +141,12 @@ def _scatter(p, state, rank, comm_stream):
129
  """
130
 
131
  with torch.cuda.stream(comm_stream):
 
 
 
 
132
  if rank == state.worker_rank:
133
  num_ranks = dist.get_world_size(group=state.process_group)
134
- if state.compute_event is None:
135
- raise RuntimeError("Compute event must be set before scatter.")
136
- comm_stream.wait_event(state.compute_event)
137
-
138
  # Clear the gathered gradient to free memory
139
  state.gathered_grad = None
140
 
@@ -144,22 +156,15 @@ def _scatter(p, state, rank, comm_stream):
144
  else:
145
  scatter_list = None
146
 
147
- u_received = torch.empty_like(p.to_local())
148
  torch.distributed.scatter(
149
- u_received,
150
  scatter_list=scatter_list,
151
  src=state.worker_rank,
152
  group=state.process_group,
153
  )
154
- u_dtensor = DTensor.from_local(
155
- u_received,
156
- placements=p.placements,
157
- device_mesh=p.device_mesh,
158
- )
159
-
160
- state.scattered_u = u_dtensor
161
  state.scatter_event = torch.cuda.Event()
162
  state.scatter_event.record()
 
163
 
164
 
165
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -172,11 +177,21 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
172
  if state.scatter_event is None:
173
  raise RuntimeError("Scatter event must be set before update")
174
  compute_stream.wait_event(state.scatter_event)
175
  if rank == state.worker_rank:
176
  # Free computed_u
177
  state.computed_u = None
178
 
179
  Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
 
 
180
 
181
 
182
  def default_is_muon(name, x):
@@ -375,7 +390,8 @@ class Muon(torch.optim.Optimizer):
375
  else:
376
  g = buf
377
 
378
- u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
 
379
 
380
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
381
  Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
@@ -433,7 +449,7 @@ class Muon(torch.optim.Optimizer):
433
  def enqueue_computes(start_idx, chunk_size):
434
  for p in ordered_params[start_idx:start_idx + chunk_size]:
435
  state = param_to_state[id(p)]
436
- _compute_u(state, group["ns_steps"], self.rank,
437
  self.compute_stream)
438
 
439
  def enqueue_scatters(start_idx, chunk_size):
@@ -466,6 +482,77 @@ class Muon(torch.optim.Optimizer):
466
  # Wait for the last update_param to finish
467
  torch.cuda.current_stream().wait_stream(self.compute_stream)
468

469
  def step(self, closure=None):
470
  """Perform a single optimization step.
471
 
@@ -542,6 +629,12 @@ class Muon(torch.optim.Optimizer):
542
  # AdamW backup #
543
  ############################
544
545
  lr = group["lr"]
546
  beta1, beta2 = group["adamw_betas"]
547
  eps = group["adamw_eps"]
@@ -552,23 +645,38 @@ class Muon(torch.optim.Optimizer):
552
  if g is None:
553
  continue
554
  state = self.state[p]
 
 
555
  if "step" not in state:
556
- state["step"] = 0
 
 
557
  state["moment1"] = torch.zeros_like(g)
558
  state["moment2"] = torch.zeros_like(g)
559
- state["step"] += 1
560
- step = state["step"]
561
- buf1 = state["moment1"]
562
- buf2 = state["moment2"]
563
- buf1.lerp_(g, 1 - beta1)
564
- buf2.lerp_(g.square(), 1 - beta2)
565
-
566
- g = buf1 / (eps + buf2.sqrt())
567
-
568
- bias_correction1 = 1 - beta1**step
569
- bias_correction2 = 1 - beta2**step
570
- scale = bias_correction1 / bias_correction2**0.5
571
- p.data.mul_(1 - lr * weight_decay)
572
- p.data.add_(g, alpha=-lr / scale)
573
 
574
  return loss
 
2
  import math
3
  import types
4
  from dataclasses import dataclass
5
+ from typing import Optional, Union, cast
6
 
7
  import torch
8
  import torch.distributed as dist
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repository:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
+ # Muon's Newton–Schulz iteration causes high variance in the singular values.
17
+ # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
 
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
+ assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
+
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
37
  # Perform the NS iterations
38
+ for a, b, c in [
39
+ (4.0848, -6.8946, 2.9270),
40
+ (3.9505, -6.3029, 2.6377),
41
+ (3.7418, -5.5913, 2.3037),
42
+ (2.8769, -3.1427, 1.2046),
43
+ (2.8366, -3.0525, 1.2012),
44
+ ]:
45
  A = X @ X.T
46
  # B = (
47
  # b * A + c * A @ A
 
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
55
+ return X
56
 
57
 
58
  @dataclass
 
74
  Gather the gradients to worker_rank.
75
  If none_grad is True, free p.grad after the gather.
76
  """
 
 
 
 
 
 
 
 
 
 
77
  with torch.cuda.stream(comm_stream):
78
+ g = p.grad
79
+
80
+ if rank == state.worker_rank:
81
+ num_ranks = dist.get_world_size(group=state.process_group)
82
+ gather_list = [
83
+ torch.empty_like(g.to_local(), dtype=torch.bfloat16)
84
+ for _ in range(num_ranks)
85
+ ]
86
+ else:
87
+ gather_list = None
88
+
89
+ g = g.to(torch.bfloat16)
90
  torch.distributed.gather(
91
  g.to_local(),
92
  dst=state.worker_rank,
 
103
  else:
104
  state.gathered_grad = None
105
  state.gather_event = None
106
+ gather_list = None
107
  if none_grad:
108
  # We can safely free p.grad without calling record_stream:
109
  # p.grad.to_local().record_stream(comm_stream)
 
116
 
117
 
118
  @torch.no_grad()
119
+ def _compute_u(p, state, steps, rank, compute_stream):
120
  """
121
  On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
122
  """
 
127
  compute_stream.wait_event(state.gather_event)
128
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
129
  state.computed_u = u
130
+ state.scattered_u = torch.empty_like(p.to_local(),
131
+ dtype=torch.bfloat16)
132
+ state.compute_event = torch.cuda.Event()
133
+ state.compute_event.record()
134
+ u = None
135
 
136
 
137
  @torch.no_grad()
 
141
  """
142
 
143
  with torch.cuda.stream(comm_stream):
144
+ if state.compute_event is None:
145
+ raise RuntimeError("Compute event must be set before scatter.")
146
+ comm_stream.wait_event(state.compute_event)
147
+
148
  if rank == state.worker_rank:
149
  num_ranks = dist.get_world_size(group=state.process_group)
 
 
 
 
150
  # Clear the gathered gradient to free memory
151
  state.gathered_grad = None
152
 
 
156
  else:
157
  scatter_list = None
158
 
 
159
  torch.distributed.scatter(
160
+ state.scattered_u,
161
  scatter_list=scatter_list,
162
  src=state.worker_rank,
163
  group=state.process_group,
164
  )
165
  state.scatter_event = torch.cuda.Event()
166
  state.scatter_event.record()
167
+ scatter_list = None
168
 
169
 
170
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
177
  if state.scatter_event is None:
178
  raise RuntimeError("Scatter event must be set before update")
179
  compute_stream.wait_event(state.scatter_event)
180
+ u_dtensor = DTensor.from_local(
181
+ state.scattered_u,
182
+ placements=p.placements,
183
+ device_mesh=p.device_mesh,
184
+ )
185
+
186
+ state.scattered_u = u_dtensor
187
+
188
  if rank == state.worker_rank:
189
  # Free computed_u
190
  state.computed_u = None
191
 
192
  Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
193
+ state.scattered_u = None
194
+ u_dtensor = None
195
 
196
 
197
  def default_is_muon(name, x):
 
390
  else:
391
  g = buf
392
 
393
+ u = _zeropower_via_newtonschulz5(g.bfloat16(),
394
+ steps=group["ns_steps"])
395
 
396
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
397
  Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
 
449
  def enqueue_computes(start_idx, chunk_size):
450
  for p in ordered_params[start_idx:start_idx + chunk_size]:
451
  state = param_to_state[id(p)]
452
+ _compute_u(p, state, group["ns_steps"], self.rank,
453
  self.compute_stream)
454
 
455
  def enqueue_scatters(start_idx, chunk_size):
 
482
  # Wait for the last update_param to finish
483
  torch.cuda.current_stream().wait_stream(self.compute_stream)
484
 
485
+ @staticmethod
486
+ def _fused_adamw(
487
+ params: list[torch.Tensor],
488
+ grads: list[torch.Tensor],
489
+ exp_avgs: list[torch.Tensor],
490
+ exp_avg_sqs: list[torch.Tensor],
491
+ max_exp_avg_sqs: list[torch.Tensor],
492
+ state_steps: list[torch.Tensor],
493
+ amsgrad: bool,
494
+ beta1: float,
495
+ beta2: float,
496
+ lr: Union[float, torch.Tensor],
497
+ weight_decay: float,
498
+ eps: float,
499
+ maximize: bool,
500
+ ) -> None:
501
+ if not params:
502
+ return
503
+
504
+ # We only shuffle the lr around when it is a Tensor and not on the CPU; otherwise we prefer
505
+ # treating it as a scalar.
506
+ lr_dict: Optional[DeviceDict] = ({
507
+ lr.device: lr
508
+ } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
509
+ None)
510
+ grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
511
+ [
512
+ params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
513
+ state_steps
514
+ ] # type: ignore[list-item]
515
+ )
516
+ for (device, _), (
517
+ (
518
+ device_params_,
519
+ device_grads_,
520
+ device_exp_avgs_,
521
+ device_exp_avg_sqs_,
522
+ device_max_exp_avg_sqs,
523
+ device_state_steps_,
524
+ ),
525
+ _,
526
+ ) in grouped_tensors.items():
527
+ device_params = cast(list[torch.Tensor], device_params_)
528
+ device_grads = cast(list[torch.Tensor], device_grads_)
529
+ device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
530
+ device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
531
+ device_state_steps = cast(list[torch.Tensor], device_state_steps_)
532
+
533
+ if lr_dict is not None and device not in lr_dict:
534
+ lr_dict[device] = lr.to(
535
+ device=device,
536
+ non_blocking=True) # type: ignore[union-attr]
537
+ lr = lr_dict[device]
538
+ torch._foreach_add_(device_state_steps, 1)
539
+ func = torch._fused_adamw_
540
+ func(
541
+ device_params,
542
+ device_grads,
543
+ device_exp_avgs,
544
+ device_exp_avg_sqs,
545
+ device_max_exp_avg_sqs, # type: ignore[arg-type]
546
+ device_state_steps,
547
+ amsgrad=amsgrad,
548
+ lr=lr, # type: ignore[arg-type]
549
+ beta1=beta1,
550
+ beta2=beta2,
551
+ weight_decay=weight_decay,
552
+ eps=eps,
553
+ maximize=maximize,
554
+ )
555
+
556
  def step(self, closure=None):
557
  """Perform a single optimization step.
558
 
 
629
  # AdamW backup #
630
  ############################
631
 
632
+ params_with_grads = []
633
+ grads = []
634
+ moment1 = []
635
+ moment2 = []
636
+ max_exp_avg_sqs = []
637
+ state_steps = []
638
  lr = group["lr"]
639
  beta1, beta2 = group["adamw_betas"]
640
  eps = group["adamw_eps"]
 
645
  if g is None:
646
  continue
647
  state = self.state[p]
648
+ params_with_grads.append(p)
649
+ grads.append(g)
650
  if "step" not in state:
651
+ state["step"] = (torch.zeros((),
652
+ dtype=torch.float32,
653
+ device=p.device))
654
  state["moment1"] = torch.zeros_like(g)
655
  state["moment2"] = torch.zeros_like(g)
656
+ moment1.append(state["moment1"])
657
+ moment2.append(state["moment2"])
658
+ if not isinstance(state["step"], torch.Tensor):
659
+ step_tensor = torch.tensor(state["step"],
660
+ dtype=torch.float32,
661
+ device=p.device)
662
+ else:
663
+ step_tensor = state["step"]
664
+ state_steps.append(step_tensor)
665
+
666
+ self._fused_adamw(
667
+ params_with_grads,
668
+ grads,
669
+ moment1,
670
+ moment2,
671
+ max_exp_avg_sqs,
672
+ state_steps,
673
+ amsgrad=False,
674
+ beta1=beta1,
675
+ beta2=beta2,
676
+ lr=lr,
677
+ weight_decay=weight_decay,
678
+ eps=eps,
679
+ maximize=False,
680
+ )
681
 
682
  return loss
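
The communication half of the change, pulled out of the diff: gradients are cast to bfloat16 before the gather, the receive buffer for the scatter is preallocated in bfloat16 (in _compute_u above), and only _update_param wraps the local shard back into a DTensor. A single-rank gloo sketch that exercises the same collective calls (illustrative; the shipped path runs on CUDA streams and typically NCCL, and bfloat16 over gloo assumes a recent PyTorch):

import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

worker_rank = 0
grad = torch.randn(8, 8)

# Gather: every rank sends its bf16 shard to worker_rank.
gather_list = ([torch.empty_like(grad, dtype=torch.bfloat16)
                for _ in range(dist.get_world_size())]
               if dist.get_rank() == worker_rank else None)
dist.gather(grad.to(torch.bfloat16), gather_list=gather_list, dst=worker_rank)

# ... the worker would orthogonalize the gathered gradient here ...

# Scatter: the worker sends each rank its bf16 slice of the update,
# into a buffer preallocated while the compute was still in flight.
recv = torch.empty_like(grad, dtype=torch.bfloat16)
scatter_list = gather_list if dist.get_rank() == worker_rank else None
dist.scatter(recv, scatter_list=scatter_list, src=worker_rank)

dist.destroy_process_group()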
torch-ext/optimizer/muon.py CHANGED
@@ -2,6 +2,7 @@ import logging
2
  import math
3
  import types
4
  from dataclasses import dataclass
 
5
 
6
  import torch
7
  import torch.distributed as dist
@@ -12,6 +13,8 @@ logger = logging.getLogger(__name__)
12
 
13
  # This code snippet is a modified version adapted from the following GitHub repository:
14
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
15
  @torch.no_grad()
16
  def _zeropower_via_newtonschulz5(G, steps):
17
  """
@@ -24,15 +27,21 @@ def _zeropower_via_newtonschulz5(G, steps):
24
  performance at all relative to UV^T, where USV^T = G is the SVD.
25
  """
26
  assert len(G.shape) == 2
27
- a, b, c = (3.4445, -4.7750, 2.0315)
28
  X = G # no manual typecast
 
29
  if G.size(0) > G.size(1):
30
  X = X.T
31
  # Ensure spectral norm is at most 1
32
  X = X / (X.norm() + 1e-7)
33
- X = X.bfloat16()
34
  # Perform the NS iterations
35
- for _ in range(steps):
36
  A = X @ X.T
37
  # B = (
38
  # b * A + c * A @ A
@@ -43,7 +52,7 @@ def _zeropower_via_newtonschulz5(G, steps):
43
 
44
  if G.size(0) > G.size(1):
45
  X = X.T
46
- return X.to(G.dtype)
47
 
48
 
49
  @dataclass
@@ -65,17 +74,19 @@ def _gather(p, state, rank, comm_stream, none_grad):
65
  Gather the gradients to worker_rank.
66
  If none_grad is True, free p.grad after the gather.
67
  """
68
- g = p.grad
69
-
70
- if rank == state.worker_rank:
71
- num_ranks = dist.get_world_size(group=state.process_group)
72
- gather_list = [
73
- torch.empty_like(g.to_local()) for _ in range(num_ranks)
74
- ]
75
- else:
76
- gather_list = None
77
-
78
  with torch.cuda.stream(comm_stream):
79
  torch.distributed.gather(
80
  g.to_local(),
81
  dst=state.worker_rank,
@@ -92,6 +103,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
92
  else:
93
  state.gathered_grad = None
94
  state.gather_event = None
 
95
  if none_grad:
96
  # We can safely free p.grad without calling record_stream:
97
  # p.grad.to_local().record_stream(comm_stream)
@@ -104,7 +116,7 @@ def _gather(p, state, rank, comm_stream, none_grad):
104
 
105
 
106
  @torch.no_grad()
107
- def _compute_u(state, steps, rank, compute_stream):
108
  """
109
  On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
110
  """
@@ -115,11 +127,11 @@ def _compute_u(state, steps, rank, compute_stream):
115
  compute_stream.wait_event(state.gather_event)
116
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
117
  state.computed_u = u
118
- state.compute_event = torch.cuda.Event()
119
- state.compute_event.record()
120
- else:
121
- state.computed_u = None
122
- state.compute_event = None
123
 
124
 
125
  @torch.no_grad()
@@ -129,12 +141,12 @@ def _scatter(p, state, rank, comm_stream):
129
  """
130
 
131
  with torch.cuda.stream(comm_stream):
 
 
 
 
132
  if rank == state.worker_rank:
133
  num_ranks = dist.get_world_size(group=state.process_group)
134
- if state.compute_event is None:
135
- raise RuntimeError("Compute event must be set before scatter.")
136
- comm_stream.wait_event(state.compute_event)
137
-
138
  # Clear the gathered gradient to free memory
139
  state.gathered_grad = None
140
 
@@ -144,22 +156,15 @@ def _scatter(p, state, rank, comm_stream):
144
  else:
145
  scatter_list = None
146
 
147
- u_received = torch.empty_like(p.to_local())
148
  torch.distributed.scatter(
149
- u_received,
150
  scatter_list=scatter_list,
151
  src=state.worker_rank,
152
  group=state.process_group,
153
  )
154
- u_dtensor = DTensor.from_local(
155
- u_received,
156
- placements=p.placements,
157
- device_mesh=p.device_mesh,
158
- )
159
-
160
- state.scattered_u = u_dtensor
161
  state.scatter_event = torch.cuda.Event()
162
  state.scatter_event.record()
 
163
 
164
 
165
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -172,11 +177,21 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
172
  if state.scatter_event is None:
173
  raise RuntimeError("Scatter event must be set before update")
174
  compute_stream.wait_event(state.scatter_event)
175
  if rank == state.worker_rank:
176
  # Free computed_u
177
  state.computed_u = None
178
 
179
  Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
 
 
180
 
181
 
182
  def default_is_muon(name, x):
@@ -375,7 +390,8 @@ class Muon(torch.optim.Optimizer):
375
  else:
376
  g = buf
377
 
378
- u = _zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
 
379
 
380
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
381
  Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
@@ -433,7 +449,7 @@ class Muon(torch.optim.Optimizer):
433
  def enqueue_computes(start_idx, chunk_size):
434
  for p in ordered_params[start_idx:start_idx + chunk_size]:
435
  state = param_to_state[id(p)]
436
- _compute_u(state, group["ns_steps"], self.rank,
437
  self.compute_stream)
438
 
439
  def enqueue_scatters(start_idx, chunk_size):
@@ -466,6 +482,77 @@ class Muon(torch.optim.Optimizer):
466
  # Wait for the last update_param to finish
467
  torch.cuda.current_stream().wait_stream(self.compute_stream)
468

469
  def step(self, closure=None):
470
  """Perform a single optimization step.
471
 
@@ -542,6 +629,12 @@ class Muon(torch.optim.Optimizer):
542
  # AdamW backup #
543
  ############################
544
545
  lr = group["lr"]
546
  beta1, beta2 = group["adamw_betas"]
547
  eps = group["adamw_eps"]
@@ -552,23 +645,38 @@ class Muon(torch.optim.Optimizer):
552
  if g is None:
553
  continue
554
  state = self.state[p]
 
 
555
  if "step" not in state:
556
- state["step"] = 0
 
 
557
  state["moment1"] = torch.zeros_like(g)
558
  state["moment2"] = torch.zeros_like(g)
559
- state["step"] += 1
560
- step = state["step"]
561
- buf1 = state["moment1"]
562
- buf2 = state["moment2"]
563
- buf1.lerp_(g, 1 - beta1)
564
- buf2.lerp_(g.square(), 1 - beta2)
565
-
566
- g = buf1 / (eps + buf2.sqrt())
567
-
568
- bias_correction1 = 1 - beta1**step
569
- bias_correction2 = 1 - beta2**step
570
- scale = bias_correction1 / bias_correction2**0.5
571
- p.data.mul_(1 - lr * weight_decay)
572
- p.data.add_(g, alpha=-lr / scale)
573
 
574
  return loss
 
2
  import math
3
  import types
4
  from dataclasses import dataclass
5
+ from typing import Optional, Union, cast
6
 
7
  import torch
8
  import torch.distributed as dist
 
13
 
14
  # This code snippet is a modified version adapted from the following GitHub repository:
15
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
16
+ # Muon's Newton–Schulz iteration causes high variance in the singular values.
17
+ # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
18
  @torch.no_grad()
19
  def _zeropower_via_newtonschulz5(G, steps):
20
  """
 
27
  performance at all relative to UV^T, where USV^T = G is the SVD.
28
  """
29
  assert len(G.shape) == 2
30
+ assert G.dtype == torch.bfloat16
31
  X = G # no manual typecast
32
+
33
  if G.size(0) > G.size(1):
34
  X = X.T
35
  # Ensure spectral norm is at most 1
36
  X = X / (X.norm() + 1e-7)
 
37
  # Perform the NS iterations
38
+ for a, b, c in [
39
+ (4.0848, -6.8946, 2.9270),
40
+ (3.9505, -6.3029, 2.6377),
41
+ (3.7418, -5.5913, 2.3037),
42
+ (2.8769, -3.1427, 1.2046),
43
+ (2.8366, -3.0525, 1.2012),
44
+ ]:
45
  A = X @ X.T
46
  # B = (
47
  # b * A + c * A @ A
 
52
 
53
  if G.size(0) > G.size(1):
54
  X = X.T
55
+ return X
56
 
57
 
58
  @dataclass
 
74
  Gather the gradients to worker_rank.
75
  If none_grad is True, free p.grad after the gather.
76
  """
 
 
 
 
 
 
 
 
 
 
77
  with torch.cuda.stream(comm_stream):
78
+ g = p.grad
79
+
80
+ if rank == state.worker_rank:
81
+ num_ranks = dist.get_world_size(group=state.process_group)
82
+ gather_list = [
83
+ torch.empty_like(g.to_local(), dtype=torch.bfloat16)
84
+ for _ in range(num_ranks)
85
+ ]
86
+ else:
87
+ gather_list = None
88
+
89
+ g = g.to(torch.bfloat16)
90
  torch.distributed.gather(
91
  g.to_local(),
92
  dst=state.worker_rank,
 
103
  else:
104
  state.gathered_grad = None
105
  state.gather_event = None
106
+ gather_list = None
107
  if none_grad:
108
  # We can safely free p.grad without calling record_stream:
109
  # p.grad.to_local().record_stream(comm_stream)
 
116
 
117
 
118
  @torch.no_grad()
119
+ def _compute_u(p, state, steps, rank, compute_stream):
120
  """
121
  On worker_rank, compute the orthogonalized update using Newton-Schulz iteration.
122
  """
 
127
  compute_stream.wait_event(state.gather_event)
128
  u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
129
  state.computed_u = u
130
+ state.scattered_u = torch.empty_like(p.to_local(),
131
+ dtype=torch.bfloat16)
132
+ state.compute_event = torch.cuda.Event()
133
+ state.compute_event.record()
134
+ u = None
135
 
136
 
137
  @torch.no_grad()
 
141
  """
142
 
143
  with torch.cuda.stream(comm_stream):
144
+ if state.compute_event is None:
145
+ raise RuntimeError("Compute event must be set before scatter.")
146
+ comm_stream.wait_event(state.compute_event)
147
+
148
  if rank == state.worker_rank:
149
  num_ranks = dist.get_world_size(group=state.process_group)
 
 
 
 
150
  # Clear the gathered gradient to free memory
151
  state.gathered_grad = None
152
 
 
156
  else:
157
  scatter_list = None
158
 
 
159
  torch.distributed.scatter(
160
+ state.scattered_u,
161
  scatter_list=scatter_list,
162
  src=state.worker_rank,
163
  group=state.process_group,
164
  )
165
  state.scatter_event = torch.cuda.Event()
166
  state.scatter_event.record()
167
+ scatter_list = None
168
 
169
 
170
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
 
177
  if state.scatter_event is None:
178
  raise RuntimeError("Scatter event must be set before update")
179
  compute_stream.wait_event(state.scatter_event)
180
+ u_dtensor = DTensor.from_local(
181
+ state.scattered_u,
182
+ placements=p.placements,
183
+ device_mesh=p.device_mesh,
184
+ )
185
+
186
+ state.scattered_u = u_dtensor
187
+
188
  if rank == state.worker_rank:
189
  # Free computed_u
190
  state.computed_u = None
191
 
192
  Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
193
+ state.scattered_u = None
194
+ u_dtensor = None
195
 
196
 
197
  def default_is_muon(name, x):
 
390
  else:
391
  g = buf
392
 
393
+ u = _zeropower_via_newtonschulz5(g.bfloat16(),
394
+ steps=group["ns_steps"])
395
 
396
  adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
397
  Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
 
449
  def enqueue_computes(start_idx, chunk_size):
450
  for p in ordered_params[start_idx:start_idx + chunk_size]:
451
  state = param_to_state[id(p)]
452
+ _compute_u(p, state, group["ns_steps"], self.rank,
453
  self.compute_stream)
454
 
455
  def enqueue_scatters(start_idx, chunk_size):
 
482
  # Wait for the last update_param to finish
483
  torch.cuda.current_stream().wait_stream(self.compute_stream)
484
 
485
+ @staticmethod
486
+ def _fused_adamw(
487
+ params: list[torch.Tensor],
488
+ grads: list[torch.Tensor],
489
+ exp_avgs: list[torch.Tensor],
490
+ exp_avg_sqs: list[torch.Tensor],
491
+ max_exp_avg_sqs: list[torch.Tensor],
492
+ state_steps: list[torch.Tensor],
493
+ amsgrad: bool,
494
+ beta1: float,
495
+ beta2: float,
496
+ lr: Union[float, torch.Tensor],
497
+ weight_decay: float,
498
+ eps: float,
499
+ maximize: bool,
500
+ ) -> None:
501
+ if not params:
502
+ return
503
+
504
+ # We only shuffle the lr around when it is a Tensor and not on the CPU; otherwise we prefer
505
+ # treating it as a scalar.
506
+ lr_dict: Optional[DeviceDict] = ({
507
+ lr.device: lr
508
+ } if isinstance(lr, torch.Tensor) and str(lr.device) != "cpu" else
509
+ None)
510
+ grouped_tensors = torch.optim.Optimizer._group_tensors_by_device_and_dtype(
511
+ [
512
+ params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
513
+ state_steps
514
+ ] # type: ignore[list-item]
515
+ )
516
+ for (device, _), (
517
+ (
518
+ device_params_,
519
+ device_grads_,
520
+ device_exp_avgs_,
521
+ device_exp_avg_sqs_,
522
+ device_max_exp_avg_sqs,
523
+ device_state_steps_,
524
+ ),
525
+ _,
526
+ ) in grouped_tensors.items():
527
+ device_params = cast(list[torch.Tensor], device_params_)
528
+ device_grads = cast(list[torch.Tensor], device_grads_)
529
+ device_exp_avgs = cast(list[torch.Tensor], device_exp_avgs_)
530
+ device_exp_avg_sqs = cast(list[torch.Tensor], device_exp_avg_sqs_)
531
+ device_state_steps = cast(list[torch.Tensor], device_state_steps_)
532
+
533
+ if lr_dict is not None and device not in lr_dict:
534
+ lr_dict[device] = lr.to(
535
+ device=device,
536
+ non_blocking=True) # type: ignore[union-attr]
537
+ lr = lr_dict[device]
538
+ torch._foreach_add_(device_state_steps, 1)
539
+ func = torch._fused_adamw_
540
+ func(
541
+ device_params,
542
+ device_grads,
543
+ device_exp_avgs,
544
+ device_exp_avg_sqs,
545
+ device_max_exp_avg_sqs, # type: ignore[arg-type]
546
+ device_state_steps,
547
+ amsgrad=amsgrad,
548
+ lr=lr, # type: ignore[arg-type]
549
+ beta1=beta1,
550
+ beta2=beta2,
551
+ weight_decay=weight_decay,
552
+ eps=eps,
553
+ maximize=maximize,
554
+ )
555
+
556
  def step(self, closure=None):
557
  """Perform a single optimization step.
558
 
 
629
  # AdamW backup #
630
  ############################
631
 
632
+ params_with_grads = []
633
+ grads = []
634
+ moment1 = []
635
+ moment2 = []
636
+ max_exp_avg_sqs = []
637
+ state_steps = []
638
  lr = group["lr"]
639
  beta1, beta2 = group["adamw_betas"]
640
  eps = group["adamw_eps"]
 
645
  if g is None:
646
  continue
647
  state = self.state[p]
648
+ params_with_grads.append(p)
649
+ grads.append(g)
650
  if "step" not in state:
651
+ state["step"] = (torch.zeros((),
652
+ dtype=torch.float32,
653
+ device=p.device))
654
  state["moment1"] = torch.zeros_like(g)
655
  state["moment2"] = torch.zeros_like(g)
656
+ moment1.append(state["moment1"])
657
+ moment2.append(state["moment2"])
658
+ if not isinstance(state["step"], torch.Tensor):
659
+ step_tensor = torch.tensor(state["step"],
660
+ dtype=torch.float32,
661
+ device=p.device)
662
+ else:
663
+ step_tensor = state["step"]
664
+ state_steps.append(step_tensor)
665
+
666
+ self._fused_adamw(
667
+ params_with_grads,
668
+ grads,
669
+ moment1,
670
+ moment2,
671
+ max_exp_avg_sqs,
672
+ state_steps,
673
+ amsgrad=False,
674
+ beta1=beta1,
675
+ beta2=beta2,
676
+ lr=lr,
677
+ weight_decay=weight_decay,
678
+ eps=eps,
679
+ maximize=False,
680
+ )
681
 
682
  return loss
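
One more note on _fused_adamw above: DeviceDict is referenced in the annotation but is not among the imports shown, so the file presumably relies on it being defined elsewhere (torch.optim uses dict[torch.device, torch.Tensor]); since it only appears in a local variable annotation it is never evaluated at runtime. The lr-caching trick itself, as a hedged standalone sketch (names are illustrative):

from typing import Optional, Union
import torch

DeviceDict = dict[torch.device, torch.Tensor]  # assumed alias

def resolve_lr(lr: Union[float, torch.Tensor],
               device: torch.device,
               lr_dict: Optional[DeviceDict]):
    if lr_dict is None:        # float lr, or a CPU tensor: use as a scalar
        return lr
    if device not in lr_dict:  # first parameter on this device: cache a copy
        lr_dict[device] = lr.to(device=device, non_blocking=True)
    return lr_dict[device]     # reused for every later parameter there

lr = torch.tensor(3e-4)
lr_dict: Optional[DeviceDict] = ({lr.device: lr}
                                 if str(lr.device) != "cpu" else None)
print(resolve_lr(lr, torch.device("cpu"), lr_dict))  # tensor(0.0003)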