feat: update muon to receive paramgroups, not model (#4)
* feat: update muon to receive paramgroups, not model
* feat: update message formats
* feat: remove boilerplate assertion
* chore: run pre-commits
* test: fix testscript
* fix: fix misc bugs
* feat: add get_default_muon_param_groups helper function
* fix: fix readme
* fix: raise error if parametergroup does not follow instructions
* chore: upload binary
---------
Co-authored-by: junhyeok.lee <[email protected]>
Co-authored-by: WyldeCat <[email protected]>
- README.md +7 -1
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_0c12ced_dirty.abi3.so → _optimizer_20250911094409.abi3.so} +2 -2
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +125 -112
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_0c12ced_dirty.abi3.so → _optimizer_20250911094409.abi3.so} +2 -2
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +125 -112
- build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch27-cxx11-cu128-x86_64-linux/optimizer/{_optimizer_0c12ced_dirty.abi3.so → _optimizer_20250911094409.abi3.so} +2 -2
- build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py +125 -112
- build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_0c12ced_dirty.abi3.so → _optimizer_20250911094409.abi3.so} +2 -2
- build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py +125 -112
- build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_0c12ced_dirty.abi3.so +0 -3
- build/torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so +3 -0
- build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py +125 -112
- build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_0c12ced_dirty.abi3.so +0 -3
- build/torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so +3 -0
- build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py +125 -112
- build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_0c12ced_dirty.abi3.so +0 -3
- build/torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so +3 -0
- build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py +125 -112
- build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_0c12ced_dirty.abi3.so +0 -3
- build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so +3 -0
- build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py +125 -112
- build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_0c12ced_dirty.abi3.so +0 -3
- build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so +3 -0
- build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py +125 -112
- test/test_muon/test.py +3 -2
- torch-ext/optimizer/muon.py +125 -112
README.md
CHANGED
@@ -21,12 +21,18 @@ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from kernels import get_kernel
 
 optimizer = get_kernel("motif-technologies/optimizer")
+get_default_muon_param_groups = optimizer.muon.get_default_muon_param_groups
 
 model = None # your model here
 fsdp_model = FSDP(model)
 
+# muon, in nature, cannot use 1-d tensor
+# we provide helper function to group such tensors
+# you can use your own function, if necessary
+params = get_default_muon_param_groups(model)  # user can write own is_muon_func, if necessary
+
 optim = optimizer.Muon(
-    (removed argument, collapsed in the original diff view; Muon previously received the model here)
+    params,
     lr=0.01,
     momentum=0.9,
     weight_decay=1e-4,
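Pieced together from the updated snippet above, here is a minimal sketch of the new calling convention. The tiny `nn.Linear` stand-in model is an assumption for illustration (the README wraps a real model in FSDP), and only `get_kernel`, `get_default_muon_param_groups`, and `Muon(params, ...)` are taken from this commit; note that the optimizer allocates a CUDA stream on construction, so it assumes a CUDA-capable environment.

```python
import torch.nn as nn
from kernels import get_kernel  # resolves the kernel package from the Hugging Face Hub

optimizer = get_kernel("motif-technologies/optimizer")
get_default_muon_param_groups = optimizer.muon.get_default_muon_param_groups

model = nn.Linear(64, 64)  # stand-in model; substitute your own (optionally wrapped in FSDP)

# Muon cannot update 1-D tensors (biases, norms), so the helper splits the parameters
# into a Muon group (>= 2-D, excluding embed_tokens/lm_head) and an AdamW-backup group.
params = get_default_muon_param_groups(model)

optim = optimizer.Muon(
    params,          # parameter groups, each carrying a "use_muon" flag
    lr=0.01,
    momentum=0.9,
    weight_decay=1e-4,
)
# From here on it is used like any torch.optim optimizer: backward(), then optim.step().
```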
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_0c12ced_dirty
-ops = torch.ops._optimizer_0c12ced_dirty
+from . import _optimizer_20250911094409
+ops = torch.ops._optimizer_20250911094409
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_0c12ced_dirty::{op_name}"
+    return f"_optimizer_20250911094409::{op_name}"
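The `_ops.py` change is mechanical: the compiled extension is now named by a build timestamp instead of the old `_optimizer_0c12ced_dirty` git-hash name, so the op namespace prefix changes with it. A tiny sketch of the effect; the op name used here is purely hypothetical.

```python
# Mirrors the updated helper in _ops.py; "newton_schulz" is a made-up op name for illustration.
def add_op_namespace_prefix(op_name: str) -> str:
    """Prefix op by namespace."""
    return f"_optimizer_20250911094409::{op_name}"

assert add_op_namespace_prefix("newton_schulz") == "_optimizer_20250911094409::newton_schulz"
```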
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_0c12ced_dirty.abi3.so → _optimizer_20250911094409.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:48cd88108696ba8ed7487e637b785445bb5ff6075a3ae0c15355698958ad340a
+size 1787376
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py
CHANGED
@@ -1,10 +1,14 @@
+import logging
 import math
+import types
 from dataclasses import dataclass
 
 import torch
 import torch.distributed as dist
 from torch.distributed._tensor import DTensor, Replicate, Shard
 
+logger = logging.getLogger(__name__)
+
 
 # This code snippet is a modified version adapted from the following GitHub repositories:
 # https://github.com/KellerJordan/Muon/blob/master/muon.py
@@ -175,10 +179,31 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
     Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
 
 
-def default_is_muon(x, name):
+def default_is_muon(name, x):
     return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name
 
 
+def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
+    return [
+        {
+            "params": [
+                p for n, p in model.named_parameters()
+                if (is_muon_func(n, p) and p.requires_grad)
+            ],
+            "use_muon":
+            True
+        },
+        {
+            "params": [
+                p for n, p in model.named_parameters()
+                if (not is_muon_func(n, p) and p.requires_grad)
+            ],
+            "use_muon":
+            False
+        },
+    ]
+
+
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
@@ -210,8 +235,7 @@ class Muon(torch.optim.Optimizer):
 
     def __init__(
         self,
-        model,
-        is_muon_func=default_is_muon,
+        params,
         lr=1e-3,
         momentum=0.95,
         nesterov=True,
@@ -231,11 +255,19 @@ class Muon(torch.optim.Optimizer):
            adamw_betas=adamw_betas,
            adamw_eps=adamw_eps,
            none_grad=none_grad,
+           use_muon=True,
        )
+       error_message = "The key 'use_muon' is not set in parameter group {idx}. Assuming all parameters in the group will use muon optimization, which may lead to unexpected behavior."
+       instruction_code = "\n\n please follow this code snippet \n```optimizer = get_kernel('motif-technologies/optimizer')\n\n\nparams = optimizer.muon.get_default_muon_param_groups(model)\n\noptim = optimizer.Muon(params, ...)```"
 
-       (three removed lines are collapsed in the original diff view)
+       if isinstance(params, types.GeneratorType):
+           raise ValueError(error_message.format(idx=0) + instruction_code)
+       for _idx, param_group in enumerate(params):
+           if param_group.get("use_muon", None) is None:
+               raise ValueError(
+                   error_message.format(idx=_idx) + instruction_code)
+
+       super().__init__(params, defaults)
 
        if dist.is_initialized():
            self.rank = dist.get_rank()
@@ -246,21 +278,6 @@ class Muon(torch.optim.Optimizer):
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
 
-    def __setstate__(self, state):
-        # Sort parameters into those for which we will use Muon, and those for which we will not
-        super().__setstate__(state)
-        self._init_state()
-
-    def _init_state(self):
-        for name, p in self.model.named_parameters():
-            if self.is_muon_func(p, name):
-                # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
-                assert p.ndim == 2, p.ndim
-                self.state[p]["use_muon"] = True
-            else:
-                # Do not use Muon for parameters in adamw_params
-                self.state[p]["use_muon"] = False
-
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
         M, N = G.shape
@@ -462,100 +479,96 @@ class Muon(torch.optim.Optimizer):
             loss = closure()
 
         for group in self.param_groups:
-            (the previous step() body is removed; most of its lines are collapsed in the original diff view)
+            params = group["params"]
+
+            if group["use_muon"]:
+                ############################
+                #           Muon           #
+                ############################
+                lr = group["lr"]
+                weight_decay = group["weight_decay"]
+                momentum = group["momentum"]
+
+                param_dtensors = []
+                param_tensors = []
+
+                for p in params:
+                    if p is None or p.grad is None:
+                        continue
+                    if isinstance(p.data, DTensor):
+                        if all(
+                                isinstance(placement, Replicate)
+                                for placement in p.placements):
+                            param_tensors.append(p)
+                        else:
+                            param_dtensors.append(p)
+                    elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
                     else:
+                        raise TypeError(
+                            f"Unsupported parameter type: {type(p.data)}")
+
+                if self.debug:
+                    print(
+                        f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors",
+                        flush=True,
+                    )
+
+                if len(param_dtensors) > 0:
+                    if not dist.is_initialized():
+                        raise RuntimeError(
+                            "Parallel Muon requires torch.distributed to be initialized."
+                        )
+
+                    self.parallel(
+                        param_dtensors,
+                        group,
+                        lr=lr,
+                        weight_decay=weight_decay,
+                        momentum=momentum,
                     )
 
+                if len(param_tensors) > 0:
+                    self.base(
+                        param_tensors,
+                        group,
+                        lr=lr,
+                        weight_decay=weight_decay,
+                        momentum=momentum,
+                    )
+
+            else:
+                ############################
+                #       AdamW backup       #
+                ############################
+
+                lr = group["lr"]
+                beta1, beta2 = group["adamw_betas"]
+                eps = group["adamw_eps"]
+                weight_decay = group["weight_decay"]
+
+                for p in params:
+                    g = p.grad
+                    if g is None:
+                        continue
+                    state = self.state[p]
+                    if "step" not in state:
+                        state["step"] = 0
+                        state["moment1"] = torch.zeros_like(g)
+                        state["moment2"] = torch.zeros_like(g)
+                    state["step"] += 1
+                    step = state["step"]
+                    buf1 = state["moment1"]
+                    buf2 = state["moment2"]
+                    buf1.lerp_(g, 1 - beta1)
+                    buf2.lerp_(g.square(), 1 - beta2)
+
+                    g = buf1 / (eps + buf2.sqrt())
+
+                    bias_correction1 = 1 - beta1**step
+                    bias_correction2 = 1 - beta2**step
+                    scale = bias_correction1 / bias_correction2**0.5
+                    p.data.mul_(1 - lr * weight_decay)
+                    p.data.add_(g, alpha=-lr / scale)
 
         return loss
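To make the new constructor contract concrete, here is a hand-rolled sketch of the parameter groups that `get_default_muon_param_groups` builds and that `Muon.__init__` now validates. The `nn.Sequential` stand-in model is an assumption for illustration; the predicate and the group layout are copied from the diff above. The optimizer itself is not instantiated here, since it allocates a CUDA stream on construction and, for sharded DTensor parameters, requires an initialized process group.

```python
import torch.nn as nn

def default_is_muon(name, x):
    # Default predicate from the diff: 2-D+ weights that are not embeddings or the LM head.
    return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name

model = nn.Sequential(nn.Linear(32, 32), nn.LayerNorm(32))  # stand-in model

# Equivalent to what get_default_muon_param_groups(model) returns:
# one group routed to Muon, one routed to the AdamW backup path.
param_groups = [
    {
        "params": [p for n, p in model.named_parameters()
                   if default_is_muon(n, p) and p.requires_grad],
        "use_muon": True,   # mandatory: __init__ raises ValueError if the key is missing
    },
    {
        "params": [p for n, p in model.named_parameters()
                   if not default_is_muon(n, p) and p.requires_grad],
        "use_muon": False,
    },
]

# A bare generator such as model.parameters() is rejected outright,
# because there is nowhere to attach the per-group "use_muon" flag.
```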
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py
CHANGED
(identical to the diff for build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py above)
build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_0c12ced_dirty.abi3.so → _optimizer_20250911094409.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:a5908748e60a61c59e315fbba8b32e3867a4b673b587a2a9606ddde5b4f67da5
+size 1824264
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py
CHANGED
(identical to the diff for build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py above)
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py
CHANGED
(identical to the diff for build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py above)
build/torch27-cxx11-cu128-x86_64-linux/optimizer/{_optimizer_0c12ced_dirty.abi3.so → _optimizer_20250911094409.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:b1729faaee0dd55134348a0d775c147cf3aaba106e0475e1389159d48dfc1ebe
+size 1883360
build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py
CHANGED
(identical to the diff for build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py above)
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py
CHANGED
(identical to the diff for build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py above)
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_0c12ced_dirty.abi3.so → _optimizer_20250911094409.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:0857945a1ebfdbb6c7219d0b96c8ab47649aa3b47b65fa800c84b51ddbda9c19
+size 1749880
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py
CHANGED
(identical to the diff for build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py above)
build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py
CHANGED
(identical to the diff for build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py above)
build/torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_0c12ced_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:df5044ffb45124dfe7088ed991123724405b00285e4d8d1ba2961802f521aa0f
-size 1824256
build/torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cebddf4b9cb794ad3cd7b88affd011160f7fb9a16257fcb4d942604839b31b37
+size 1824264
build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py
CHANGED
(identical to the diff for build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py above)
build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_0c12ced_dirty
-ops = torch.ops._optimizer_0c12ced_dirty
+from . import _optimizer_20250911094409
+ops = torch.ops._optimizer_20250911094409
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_0c12ced_dirty::{op_name}"
+    return f"_optimizer_20250911094409::{op_name}"
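The `_ops.py` change only re-points the Python shim at the rebuilt binary: the op namespace now matches the new `.so` name. A minimal sketch of what the prefix helper resolves to after this commit (the op name `foo` is purely illustrative, not an op registered by this kernel):

```python
# Illustrative sketch: how _ops.py namespaces an op after the rebuild.
# "foo" is a made-up op name used only to show the string that comes back.
def add_op_namespace_prefix(op_name: str) -> str:
    """Prefix op by namespace."""
    return f"_optimizer_20250911094409::{op_name}"

print(add_op_namespace_prefix("foo"))  # -> _optimizer_20250911094409::foo
```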
build/torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_0c12ced_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:80cb3ac21d3afafe368f31318c31a4c6356b53bbc2186ae81b79e1eb3ff441f5
-size 1883352
build/torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22dc3ab77ab74837126281f79f417c5d55b2cc9885388fd9d3a1c7c824ece2bd
+size 1883360
build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py
CHANGED
(identical changes to torch-ext/optimizer/muon.py; full diff shown at the bottom of this page)
build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py
CHANGED
(identical changes to build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py above)
build/torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_0c12ced_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:32af855517484e2695b6d83c29a03d85fcbaaea559d95cbb62fd9fa67cc3ccac
-size 1883352
build/torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62ecfc7e6a1ab0c4ada19ed7aea40fc0a431c4ceb1729666efa98ac0e407f9c8
+size 1883360
build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py
CHANGED
(identical changes to torch-ext/optimizer/muon.py; full diff shown at the bottom of this page)
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py
CHANGED
(identical changes to build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py above)
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_0c12ced_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2dd72f3b9f513dc8bd0724fede9b668761b1d701dfdf3a294979706d803b0800
-size 1750000
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37e389c650fc1fcbc9fbd68f1e7c1a768b08e90509fd8a5d87879655726f2db2
+size 1750040
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py
CHANGED
(identical changes to torch-ext/optimizer/muon.py; full diff shown at the bottom of this page)
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py
CHANGED
(identical changes to build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py above)
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_0c12ced_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2a49b0225ecf27b33bbbe55936811ecf443ce97be97ccb7237b3b66eb46c0ad8
-size 1750088
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_optimizer_20250911094409.abi3.so
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e62682b711f002505bb17c170b2bb233f8d389510ff8e2e0a753ee96d11d0746
+size 1750128
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py
CHANGED
(identical changes to torch-ext/optimizer/muon.py; full diff shown at the bottom of this page)
test/test_muon/test.py
CHANGED
@@ -2,7 +2,7 @@ import logging
 
 import torch
 import torch.distributed as dist
-from muon import Muon
+from muon import Muon, get_default_muon_param_groups
 from torch.distributed.fsdp import FSDPModule, fully_shard
 from torch.distributed.tensor import DTensor
 from torch.distributed.tensor.placement_types import Replicate
@@ -54,7 +54,8 @@ def load_model(fsdp: bool) -> torch.nn.Module:
 
 def run_muon(fsdp: bool) -> torch.nn.Module:
     model = load_model(fsdp=fsdp)
-
+    params = get_default_muon_param_groups(model)
+    optim = Muon(params=params)
     optim.step()
 
     return model
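The updated test exercises the new construction path end to end. Below is a minimal sketch of the same flow outside the test harness, assuming the `muon` module from this commit is importable and a CUDA device is available (the `Muon` constructor allocates a `torch.cuda.Stream`); the toy `nn.Linear` model and the hyperparameters are illustrative, not part of the commit:

```python
import torch
from muon import Muon, get_default_muon_param_groups

# Toy model: the 2-D weight lands in the use_muon=True group,
# the 1-D bias falls back to the AdamW group (use_muon=False).
model = torch.nn.Linear(16, 16).cuda()

params = get_default_muon_param_groups(model)
optim = Muon(params, lr=1e-3, momentum=0.95)  # lr/momentum mirror the defaults in muon.py

loss = model(torch.randn(4, 16, device="cuda")).sum()
loss.backward()
optim.step()
optim.zero_grad()
```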
torch-ext/optimizer/muon.py
CHANGED
@@ -1,10 +1,14 @@
+import logging
 import math
+import types
 from dataclasses import dataclass
 
 import torch
 import torch.distributed as dist
 from torch.distributed._tensor import DTensor, Replicate, Shard
 
+logger = logging.getLogger(__name__)
+
 
 # This code snippet is a modified version adapted from the following GitHub repositories:
 # https://github.com/KellerJordan/Muon/blob/master/muon.py
@@ -175,10 +179,31 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
     Muon._update_p(p, state.scattered_u, lr, adjusted_lr, weight_decay)
 
 
-def default_is_muon(
+def default_is_muon(name, x):
     return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name
 
 
+def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
+    return [
+        {
+            "params": [
+                p for n, p in model.named_parameters()
+                if (is_muon_func(n, p) and p.requires_grad)
+            ],
+            "use_muon":
+            True
+        },
+        {
+            "params": [
+                p for n, p in model.named_parameters()
+                if (not is_muon_func(n, p) and p.requires_grad)
+            ],
+            "use_muon":
+            False
+        },
+    ]
+
+
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
@@ -210,8 +235,7 @@ class Muon(torch.optim.Optimizer):
 
     def __init__(
         self,
-
-        is_muon_func=default_is_muon,
+        params,
         lr=1e-3,
         momentum=0.95,
         nesterov=True,
@@ -231,11 +255,19 @@ class Muon(torch.optim.Optimizer):
             adamw_betas=adamw_betas,
             adamw_eps=adamw_eps,
             none_grad=none_grad,
+            use_muon=True,
         )
+        error_message = "The key 'use_muon' is not set in parameter group {idx}. Assuming all parameters in the group will use muon optimization, which may lead to unexpected behavior."
+        instruction_code = "\n\n please follow this code snippet \n```optimizer = get_kernel('motif-technologies/optimizer')\n\n\nparams = optimizer.muon.get_default_muon_param_groups(model)\n\noptim = optimizer.Muon(params, ...)```"
 
-
-
-
+        if isinstance(params, types.GeneratorType):
+            raise ValueError(error_message.format(idx=0) + instruction_code)
+        for _idx, param_group in enumerate(params):
+            if param_group.get("use_muon", None) is None:
+                raise ValueError(
+                    error_message.format(idx=_idx) + instruction_code)
+
+        super().__init__(params, defaults)
 
         if dist.is_initialized():
             self.rank = dist.get_rank()
@@ -246,21 +278,6 @@ class Muon(torch.optim.Optimizer):
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
 
-    def __setstate__(self, state):
-        # Sort parameters into those for which we will use Muon, and those for which we will not
-        super().__setstate__(state)
-        self._init_state()
-
-    def _init_state(self):
-        for name, p in self.model.named_parameters():
-            if self.is_muon_func(p, name):
-                # Use Muon for every parameter in muon_params which is >= 2D and doesn't look like an embedding or head layer
-                assert p.ndim == 2, p.ndim
-                self.state[p]["use_muon"] = True
-            else:
-                # Do not use Muon for parameters in adamw_params
-                self.state[p]["use_muon"] = False
-
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
         M, N = G.shape
@@ -462,100 +479,96 @@ class Muon(torch.optim.Optimizer):
         loss = closure()
 
         for group in self.param_groups:
+            params = group["params"]
+
+            if group["use_muon"]:
+                ############################
+                #           Muon           #
+                ############################
+                lr = group["lr"]
+                weight_decay = group["weight_decay"]
+                momentum = group["momentum"]
+
+                param_dtensors = []
+                param_tensors = []
+
+                for p in params:
+                    if p is None or p.grad is None:
+                        continue
+                    if isinstance(p.data, DTensor):
+                        if all(
+                                isinstance(placement, Replicate)
+                                for placement in p.placements):
+                            param_tensors.append(p)
+                        else:
+                            param_dtensors.append(p)
+                    elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
                     else:
+                        raise TypeError(
+                            f"Unsupported parameter type: {type(p.data)}")
+
+                if self.debug:
+                    print(
+                        f"[Muon] {len(param_dtensors)} DTensors, {len(param_tensors)} Tensors",
+                        flush=True,
+                    )
+
+                if len(param_dtensors) > 0:
+                    if not dist.is_initialized():
+                        raise RuntimeError(
+                            "Parallel Muon requires torch.distributed to be initialized."
+                        )
+
+                    self.parallel(
+                        param_dtensors,
+                        group,
+                        lr=lr,
+                        weight_decay=weight_decay,
+                        momentum=momentum,
                     )
 
+                if len(param_tensors) > 0:
+                    self.base(
+                        param_tensors,
+                        group,
+                        lr=lr,
+                        weight_decay=weight_decay,
+                        momentum=momentum,
+                    )
+
+            else:
+                ############################
+                #       AdamW backup       #
+                ############################
+
+                lr = group["lr"]
+                beta1, beta2 = group["adamw_betas"]
+                eps = group["adamw_eps"]
+                weight_decay = group["weight_decay"]
+
+                for p in params:
+                    g = p.grad
+                    if g is None:
+                        continue
+                    state = self.state[p]
+                    if "step" not in state:
+                        state["step"] = 0
+                        state["moment1"] = torch.zeros_like(g)
+                        state["moment2"] = torch.zeros_like(g)
+                    state["step"] += 1
+                    step = state["step"]
+                    buf1 = state["moment1"]
+                    buf2 = state["moment2"]
+                    buf1.lerp_(g, 1 - beta1)
+                    buf2.lerp_(g.square(), 1 - beta2)
+
+                    g = buf1 / (eps + buf2.sqrt())
+
+                    bias_correction1 = 1 - beta1**step
+                    bias_correction2 = 1 - beta2**step
+                    scale = bias_correction1 / bias_correction2**0.5
+                    p.data.mul_(1 - lr * weight_decay)
+                    p.data.add_(g, alpha=-lr / scale)
 
         return loss
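The new constructor contract is strict about how parameters arrive: a bare generator such as `model.parameters()` is rejected, and every explicit param group must carry a `use_muon` key so the step loop knows which branch to take. A short sketch of that behavior, assuming the `Muon` class from this commit and a CUDA-capable environment (the toy model and group contents are illustrative):

```python
import torch
from muon import Muon

model = torch.nn.Linear(8, 8).cuda()

# 1) Generators are rejected outright; the ValueError carries the instruction snippet.
try:
    Muon(model.parameters())
except ValueError as err:
    print(err)

# 2) Hand-built groups work as long as each one declares use_muon.
groups = [
    {"params": [model.weight], "use_muon": True},   # orthogonalized Muon update
    {"params": [model.bias], "use_muon": False},    # AdamW backup path
]
optim = Muon(groups, lr=1e-3)
```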