iamwyldecat commited on
Commit
febdf5b
·
1 Parent(s): bdd2678

chore(muon): clean build and update doc

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  2. build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +0 -3
  3. build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
  4. build/torch26-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_20250614125054.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
  5. build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py +6 -1
  6. build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py +3 -3
  7. build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
  8. build/torch26-cxx11-cu124-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
  9. build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py +6 -1
  10. build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  11. build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
  12. build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +0 -3
  13. build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
  14. build/{torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so → torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so} +1 -1
  15. build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py +6 -1
  16. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py +3 -3
  17. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614121529.abi3.so +0 -3
  18. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614123843.abi3.so +0 -3
  19. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
  20. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
  21. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
  22. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py +6 -1
  23. build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  24. build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
  25. build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
  26. build/torch26-cxx98-cu118-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
  27. build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py +6 -1
  28. build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py +3 -3
  29. build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
  30. build/torch26-cxx98-cu124-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
  31. build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py +6 -1
  32. build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  33. build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
  34. build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +0 -3
  35. build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
  36. build/{torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so → torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so} +1 -1
  37. build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py +6 -1
  38. build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  39. build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
  40. build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
  41. build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
  42. build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +6 -1
  43. build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  44. build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +0 -3
  45. build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
  46. build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_20250614125054.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
  47. build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +6 -1
  48. build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  49. build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +0 -3
  50. build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a46d9e65efcfa82522950d9ebf2b2b4594d9ed5abc28704352a1f7de2dae707a
3
- size 1787272
 
 
 
 
build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8325d12959ef4f31b6c6340eca29176f5077abeaa10f3a6651db55ccf3c634f
3
- size 1787272
 
 
 
 
build/torch26-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_20250614125054.abi3.so → _optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:66ca698639fff584999fe65f8f10cc4436c197829e936be2741bf53db685caa0
3
  size 1787272
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9119d3a6d99c07a17d110d2ccf6042f199d00c839f5efa74008c1642d21e48b0
3
  size 1787272
build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e89cd7d514bfe92598684ae3cfc2d35ac2d021340846e09c0b6c880c3d55bfa0
3
- size 1820136
 
 
 
 
build/torch26-cxx11-cu124-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d351a600884b7378f546a345afe65c176e1399bb42fb7dfe4333b0e90975803b
3
  size 1824224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91b76cd5be429f99840e26e8ba55b61f9fdcae19301bd7c082b2e9746a276501
3
  size 1824224
build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f5dce62d3038e879e688fffa9bbc70f3e82db20b2e7ae3ba09040e0319acb71
3
- size 1820136
 
 
 
 
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c0843f38cee494b7a5939eb62d27039d76dc3f69401d411efbacaa25cb0d67a
3
- size 1824224
 
 
 
 
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:58162f994df84868dbf62ae70e39d3c14e3390fc827f152eece83dfae7f51503
3
- size 1824224
 
 
 
 
build/{torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so → torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9cbffc2cf8039069831a57afb8e2f64fa684f1a44bec79bb4b72dbb57d9ac607
3
  size 1824224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:807d59aca5b0403206395a1f4c770b8d644294c17f6af866207c36ac617f0a7d
3
  size 1824224
build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614121529.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2f60369ba2bd0a0f84e053d857d37496137ff476dc21561f211b1fa39651990
3
- size 1749784
 
 
 
 
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614123843.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4d790535f99b7b362a966e802a547654f31749f5f28a0207493870927f1d8d2
3
- size 1749784
 
 
 
 
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b440dd9a60711a498010068e91d0ad013cd0b8ac732c16b5d1d17e5d4ec0f9b4
3
- size 1749784
 
 
 
 
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f50ea9cab62a5bd06d886516d3917e4490e65aa9addd1cbb84fc81c6f9a9d5b1
3
- size 1749744
 
 
 
 
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:acdba99ce95532a9ca6a8987a7ab61a257657872f2cc672c91e8e5fe809aa24e
3
  size 1749744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d2bdd755079fa06a27401b8a26ac425d35514d196f9df4ce1be5c52ebcc9a64
3
  size 1749744
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8f8e7d78ed9a095b882cf764fd9c80a0b0810fb961ba9e8545656fc4cb0b0d7
3
- size 1787200
 
 
 
 
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:002dab6441bcad54ab4e7c064b5806acfd45170eb33cfa059745ba6e0c349607
3
- size 1787192
 
 
 
 
build/torch26-cxx98-cu118-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7d5e76c002507f66f2a227d02c2b11aa3fdc3f07a2a0b82faaa34133adb77ef
3
  size 1787192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4ca177074d4c04630ffaa2e49e41e1451bf198c44c4cc544a664be88475a3b9
3
  size 1787192
build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab2379d932e40d10bee55f032bd16d2e4d9c1920bc5500628006f8a0eb8abd39
3
- size 1824192
 
 
 
 
build/torch26-cxx98-cu124-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:becccd250f38a84803350cfb5fac3a6682b1e594968a714642724cbc71246b4a
3
  size 1824184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3511c3a46297462166d7b773dc2bd8b16f43b7004eee1e4b31d468113051fb55
3
  size 1824184
build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c3282a321487a6faa532afe43bc1298731983c50e2a1acdff5480ff6e4df34e
3
- size 1824192
 
 
 
 
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:34215ecc274ef516967962c8457dad214e9bbf618bf5eee8f467371f4f620284
3
- size 1824184
 
 
 
 
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5b49ed642e1c320da3932377033ad90031124f4ec24b2d1c95fd976ff28346c
3
- size 1824184
 
 
 
 
build/{torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so → torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f499350bb19eca6c3da1bb72e46023834b8411ce00730854273b588b2cd9206
3
  size 1824184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fa1dad3b3c1d94e7613a35e42afb8c7974d7bf6ce25cd2766590ba65b129f07
3
  size 1824184
build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:de82486a39ded94bfe7eeaa862459944a93e284fd0d919329979bb67db3c367f
3
- size 1787376
 
 
 
 
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ac9027c4a93801e9f19f1e9e94a9ed33b27e92c72797053c3de55e2a6fbb41d
3
- size 1787368
 
 
 
 
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c23a3adbe4dc1a64b4851a9f8e4aed0e3e1eeeded27322c54f5b942282a2a332
3
  size 1787368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe5761d07ed965bf94d00d8a8e6753a7fb571271e73773de9021511e0e6ae2c7
3
  size 1787368
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4aa09c22745d5efe1ef0669c4ca05615f67595dc90cabeee6e878301fa9bd22
3
- size 1824256
 
 
 
 
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b425a7fd854402508da5af17fa88f305753a09474686d6ec7afe540b3c5c082e
3
- size 1824256
 
 
 
 
build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_20250614125054.abi3.so → _optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb02d3818a89c819a5a12d066ce56da0ebc4f3da491cb045ae380c5b9319e592
3
  size 1824256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a35c1c4d46f677f0fe35fec9023a866b9bd0f4245624b4e71a9812a1864c01e6
3
  size 1824256
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4baf569b70749c4657062fb0f56943fc486adb0c482e50c7aa8e31ddf5cc870
3
- size 1883352
 
 
 
 
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1150b2d74d708ef2f7d8a24049c49b9ba6e2d8f5d5ce5ae88a611e4d555fe659
3
- size 1883352