diff --git a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py b/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py index 9564d5c71da17e364e08af02794a35eef592c5d6..555368ab7e9874f09a1e0672390360863f5d7211 100755 --- a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py +++ b/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _optimizer_8535e80_dirty -ops = torch.ops._optimizer_8535e80_dirty +from . import _optimizer_bdd2678_dirty +ops = torch.ops._optimizer_bdd2678_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_optimizer_8535e80_dirty::{op_name}" \ No newline at end of file + return f"_optimizer_bdd2678_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so b/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so deleted file mode 100755 index 6f1afc3bc9c26549c621ee8396cfd9b6d632228e..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66ca698639fff584999fe65f8f10cc4436c197829e936be2741bf53db685caa0 -size 1787272 diff --git a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so b/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so deleted file mode 100755 index b1a994d6f8e5e499b618abfcf8787e3f67208d19..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a46d9e65efcfa82522950d9ebf2b2b4594d9ed5abc28704352a1f7de2dae707a -size 1787272 diff --git a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so b/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so deleted file mode 100755 index 3ad8df1e7879102f18c8f3ecefdcd4a710867734..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f8325d12959ef4f31b6c6340eca29176f5077abeaa10f3a6651db55ccf3c634f -size 1787272 diff --git a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so b/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..b2bc9c7662ff079257d34d723369b0093d28c11e --- /dev/null +++ b/build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9119d3a6d99c07a17d110d2ccf6042f199d00c839f5efa74008c1642d21e48b0 +size 1787272 diff --git a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py b/build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py index af6dc3feaccabb9f86fa3bbb31e4274651caa53b..1bd67e98878b841610dd6eaf377d8b29d457a42c 100755 --- a/build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py +++ b/build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py @@ -5,10 +5,12 @@ import torch import torch.distributed as dist from torch.distributed._tensor import DTensor - # TODO leave original url and consider LICENSE # This code snippet is a modified version adapted from the following GitHub repository: # https://github.com/KellerJordan/Muon/blob/master/muon.py + + +@torch.no_grad() def _zeropower_via_newtonschulz5(G, steps): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a @@ -52,6 +54,7 @@ class _muon_state: compute_event: torch.cuda.Event | None = None +@torch.no_grad() def _gather(p, state, rank, comm_stream): g = p.grad mesh = g.device_mesh @@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream): state.gather_event = None +@torch.no_grad() def _compute_u(state, steps, rank, compute_stream): with torch.cuda.stream(compute_stream): if rank == state.worker_rank: @@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream): state.compute_event = None +@torch.no_grad() def _scatter(p, state, lr, wd, rank, comm_stream): u = state.computed_u mesh = p.device_mesh diff --git a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py b/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py index 9564d5c71da17e364e08af02794a35eef592c5d6..555368ab7e9874f09a1e0672390360863f5d7211 100755 --- a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py +++ b/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _optimizer_8535e80_dirty -ops = torch.ops._optimizer_8535e80_dirty +from . import _optimizer_bdd2678_dirty +ops = torch.ops._optimizer_bdd2678_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_optimizer_8535e80_dirty::{op_name}" \ No newline at end of file + return f"_optimizer_bdd2678_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so b/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so deleted file mode 100755 index 3be3c1fe4294649f4aad6e9c2baed7dd62d26788..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e89cd7d514bfe92598684ae3cfc2d35ac2d021340846e09c0b6c880c3d55bfa0 -size 1820136 diff --git a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so b/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so deleted file mode 100755 index be5bc569540c9ad7ffd6359bfe139133ab2a4b3e..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d351a600884b7378f546a345afe65c176e1399bb42fb7dfe4333b0e90975803b -size 1824224 diff --git a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so b/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so deleted file mode 100755 index b011a8836fa8a0b3ad74ec14e29a6284a0742be2..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9cbffc2cf8039069831a57afb8e2f64fa684f1a44bec79bb4b72dbb57d9ac607 -size 1824224 diff --git a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so b/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..4fce5a92f8987c95b929a587c1314ed652c585fb --- /dev/null +++ b/build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91b76cd5be429f99840e26e8ba55b61f9fdcae19301bd7c082b2e9746a276501 +size 1824224 diff --git a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py b/build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py index af6dc3feaccabb9f86fa3bbb31e4274651caa53b..1bd67e98878b841610dd6eaf377d8b29d457a42c 100755 --- a/build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py +++ b/build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py @@ -5,10 +5,12 @@ import torch import torch.distributed as dist from torch.distributed._tensor import DTensor - # TODO leave original url and consider LICENSE # This code snippet is a modified version adapted from the following GitHub repository: # https://github.com/KellerJordan/Muon/blob/master/muon.py + + +@torch.no_grad() def _zeropower_via_newtonschulz5(G, steps): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a @@ -52,6 +54,7 @@ class _muon_state: compute_event: torch.cuda.Event | None = None +@torch.no_grad() def _gather(p, state, rank, comm_stream): g = p.grad mesh = g.device_mesh @@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream): state.gather_event = None +@torch.no_grad() def _compute_u(state, steps, rank, compute_stream): with torch.cuda.stream(compute_stream): if rank == state.worker_rank: @@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream): state.compute_event = None +@torch.no_grad() def _scatter(p, state, lr, wd, rank, comm_stream): u = state.computed_u mesh = p.device_mesh diff --git a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py b/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py index 9564d5c71da17e364e08af02794a35eef592c5d6..555368ab7e9874f09a1e0672390360863f5d7211 100755 --- a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py +++ b/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _optimizer_8535e80_dirty -ops = torch.ops._optimizer_8535e80_dirty +from . import _optimizer_bdd2678_dirty +ops = torch.ops._optimizer_bdd2678_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_optimizer_8535e80_dirty::{op_name}" \ No newline at end of file + return f"_optimizer_bdd2678_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so b/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so deleted file mode 100755 index ec7e75e7b673420b7ff82464fd0f66d086797be8..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7f5dce62d3038e879e688fffa9bbc70f3e82db20b2e7ae3ba09040e0319acb71 -size 1820136 diff --git a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so b/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so deleted file mode 100755 index 91b009386c824abf98ab44df988fe56cd017cf8d..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c0843f38cee494b7a5939eb62d27039d76dc3f69401d411efbacaa25cb0d67a -size 1824224 diff --git a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so b/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so deleted file mode 100755 index 27870794663e4e380d26a8c438668dc9b1501547..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:58162f994df84868dbf62ae70e39d3c14e3390fc827f152eece83dfae7f51503 -size 1824224 diff --git a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so b/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..3ffbd9727d44aba913777a13fde172f240452eef --- /dev/null +++ b/build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:807d59aca5b0403206395a1f4c770b8d644294c17f6af866207c36ac617f0a7d +size 1824224 diff --git a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py b/build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py index af6dc3feaccabb9f86fa3bbb31e4274651caa53b..1bd67e98878b841610dd6eaf377d8b29d457a42c 100755 --- a/build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py +++ b/build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py @@ -5,10 +5,12 @@ import torch import torch.distributed as dist from torch.distributed._tensor import DTensor - # TODO leave original url and consider LICENSE # This code snippet is a modified version adapted from the following GitHub repository: # https://github.com/KellerJordan/Muon/blob/master/muon.py + + +@torch.no_grad() def _zeropower_via_newtonschulz5(G, steps): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a @@ -52,6 +54,7 @@ class _muon_state: compute_event: torch.cuda.Event | None = None +@torch.no_grad() def _gather(p, state, rank, comm_stream): g = p.grad mesh = g.device_mesh @@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream): state.gather_event = None +@torch.no_grad() def _compute_u(state, steps, rank, compute_stream): with torch.cuda.stream(compute_stream): if rank == state.worker_rank: @@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream): state.compute_event = None +@torch.no_grad() def _scatter(p, state, lr, wd, rank, comm_stream): u = state.computed_u mesh = p.device_mesh diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py b/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py index 9564d5c71da17e364e08af02794a35eef592c5d6..555368ab7e9874f09a1e0672390360863f5d7211 100755 --- a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py +++ b/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _optimizer_8535e80_dirty -ops = torch.ops._optimizer_8535e80_dirty +from . import _optimizer_bdd2678_dirty +ops = torch.ops._optimizer_bdd2678_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_optimizer_8535e80_dirty::{op_name}" \ No newline at end of file + return f"_optimizer_bdd2678_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614121529.abi3.so b/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614121529.abi3.so deleted file mode 100755 index 52812e8a8729a06d9a416a541cdb9dacdbd18bde..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614121529.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d2f60369ba2bd0a0f84e053d857d37496137ff476dc21561f211b1fa39651990 -size 1749784 diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614123843.abi3.so b/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614123843.abi3.so deleted file mode 100755 index f920a59fe9b333a3a502408b21ab45b5946283ba..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614123843.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f4d790535f99b7b362a966e802a547654f31749f5f28a0207493870927f1d8d2 -size 1749784 diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so b/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so deleted file mode 100755 index 70bf31624ca0641277491c69bb148281e987b9ce..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b440dd9a60711a498010068e91d0ad013cd0b8ac732c16b5d1d17e5d4ec0f9b4 -size 1749784 diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so b/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so deleted file mode 100755 index 13e56361f9f105e09b3b9cc5ea0f91493a4326ea..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:acdba99ce95532a9ca6a8987a7ab61a257657872f2cc672c91e8e5fe809aa24e -size 1749744 diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so b/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so deleted file mode 100755 index 68ebbf08ff0cd761f4f6817ede6d806330daa380..0000000000000000000000000000000000000000 --- a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f50ea9cab62a5bd06d886516d3917e4490e65aa9addd1cbb84fc81c6f9a9d5b1 -size 1749744 diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so b/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..753c5e652541449551520609e2b72f6b4b481a94 --- /dev/null +++ b/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d2bdd755079fa06a27401b8a26ac425d35514d196f9df4ce1be5c52ebcc9a64 +size 1749744 diff --git a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py b/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py index af6dc3feaccabb9f86fa3bbb31e4274651caa53b..1bd67e98878b841610dd6eaf377d8b29d457a42c 100755 --- a/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py +++ b/build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py @@ -5,10 +5,12 @@ import torch import torch.distributed as dist from torch.distributed._tensor import DTensor - # TODO leave original url and consider LICENSE # This code snippet is a modified version adapted from the following GitHub repository: # https://github.com/KellerJordan/Muon/blob/master/muon.py + + +@torch.no_grad() def _zeropower_via_newtonschulz5(G, steps): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a @@ -52,6 +54,7 @@ class _muon_state: compute_event: torch.cuda.Event | None = None +@torch.no_grad() def _gather(p, state, rank, comm_stream): g = p.grad mesh = g.device_mesh @@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream): state.gather_event = None +@torch.no_grad() def _compute_u(state, steps, rank, compute_stream): with torch.cuda.stream(compute_stream): if rank == state.worker_rank: @@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream): state.compute_event = None +@torch.no_grad() def _scatter(p, state, lr, wd, rank, comm_stream): u = state.computed_u mesh = p.device_mesh diff --git a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py b/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py index 9564d5c71da17e364e08af02794a35eef592c5d6..555368ab7e9874f09a1e0672390360863f5d7211 100755 --- a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py +++ b/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _optimizer_8535e80_dirty -ops = torch.ops._optimizer_8535e80_dirty +from . import _optimizer_bdd2678_dirty +ops = torch.ops._optimizer_bdd2678_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_optimizer_8535e80_dirty::{op_name}" \ No newline at end of file + return f"_optimizer_bdd2678_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so b/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so deleted file mode 100755 index 97c729e3fbaacccbb4de3648ecec3bdd3e3df48c..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a8f8e7d78ed9a095b882cf764fd9c80a0b0810fb961ba9e8545656fc4cb0b0d7 -size 1787200 diff --git a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so b/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so deleted file mode 100755 index 68149a0c7db7cc42958294fff670bf4d14641a5e..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f7d5e76c002507f66f2a227d02c2b11aa3fdc3f07a2a0b82faaa34133adb77ef -size 1787192 diff --git a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so b/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so deleted file mode 100755 index 171b656d3cca9f3e371b6235f8c8b53738289a14..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:002dab6441bcad54ab4e7c064b5806acfd45170eb33cfa059745ba6e0c349607 -size 1787192 diff --git a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so b/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..67f55b46c59a9b2df60b991ef58226d6cbbd03af --- /dev/null +++ b/build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4ca177074d4c04630ffaa2e49e41e1451bf198c44c4cc544a664be88475a3b9 +size 1787192 diff --git a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py b/build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py index af6dc3feaccabb9f86fa3bbb31e4274651caa53b..1bd67e98878b841610dd6eaf377d8b29d457a42c 100755 --- a/build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py +++ b/build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py @@ -5,10 +5,12 @@ import torch import torch.distributed as dist from torch.distributed._tensor import DTensor - # TODO leave original url and consider LICENSE # This code snippet is a modified version adapted from the following GitHub repository: # https://github.com/KellerJordan/Muon/blob/master/muon.py + + +@torch.no_grad() def _zeropower_via_newtonschulz5(G, steps): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a @@ -52,6 +54,7 @@ class _muon_state: compute_event: torch.cuda.Event | None = None +@torch.no_grad() def _gather(p, state, rank, comm_stream): g = p.grad mesh = g.device_mesh @@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream): state.gather_event = None +@torch.no_grad() def _compute_u(state, steps, rank, compute_stream): with torch.cuda.stream(compute_stream): if rank == state.worker_rank: @@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream): state.compute_event = None +@torch.no_grad() def _scatter(p, state, lr, wd, rank, comm_stream): u = state.computed_u mesh = p.device_mesh diff --git a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py b/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py index 9564d5c71da17e364e08af02794a35eef592c5d6..555368ab7e9874f09a1e0672390360863f5d7211 100755 --- a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py +++ b/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _optimizer_8535e80_dirty -ops = torch.ops._optimizer_8535e80_dirty +from . import _optimizer_bdd2678_dirty +ops = torch.ops._optimizer_bdd2678_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_optimizer_8535e80_dirty::{op_name}" \ No newline at end of file + return f"_optimizer_bdd2678_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so b/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so deleted file mode 100755 index e28a927785764651e3ce2e76f737478ce74b93ed..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ab2379d932e40d10bee55f032bd16d2e4d9c1920bc5500628006f8a0eb8abd39 -size 1824192 diff --git a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so b/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so deleted file mode 100755 index 9b68dd45d308cb506c1e50b08bc292415aa50d2e..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:becccd250f38a84803350cfb5fac3a6682b1e594968a714642724cbc71246b4a -size 1824184 diff --git a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so b/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so deleted file mode 100755 index baae25dbc1cd71cee383d366d50ef444612b9029..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7f499350bb19eca6c3da1bb72e46023834b8411ce00730854273b588b2cd9206 -size 1824184 diff --git a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so b/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..954a7c2106a5bf126c9efa8b77814d48f4cdf328 --- /dev/null +++ b/build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3511c3a46297462166d7b773dc2bd8b16f43b7004eee1e4b31d468113051fb55 +size 1824184 diff --git a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py b/build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py index af6dc3feaccabb9f86fa3bbb31e4274651caa53b..1bd67e98878b841610dd6eaf377d8b29d457a42c 100755 --- a/build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py +++ b/build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py @@ -5,10 +5,12 @@ import torch import torch.distributed as dist from torch.distributed._tensor import DTensor - # TODO leave original url and consider LICENSE # This code snippet is a modified version adapted from the following GitHub repository: # https://github.com/KellerJordan/Muon/blob/master/muon.py + + +@torch.no_grad() def _zeropower_via_newtonschulz5(G, steps): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a @@ -52,6 +54,7 @@ class _muon_state: compute_event: torch.cuda.Event | None = None +@torch.no_grad() def _gather(p, state, rank, comm_stream): g = p.grad mesh = g.device_mesh @@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream): state.gather_event = None +@torch.no_grad() def _compute_u(state, steps, rank, compute_stream): with torch.cuda.stream(compute_stream): if rank == state.worker_rank: @@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream): state.compute_event = None +@torch.no_grad() def _scatter(p, state, lr, wd, rank, comm_stream): u = state.computed_u mesh = p.device_mesh diff --git a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py b/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py index 9564d5c71da17e364e08af02794a35eef592c5d6..555368ab7e9874f09a1e0672390360863f5d7211 100755 --- a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py +++ b/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _optimizer_8535e80_dirty -ops = torch.ops._optimizer_8535e80_dirty +from . import _optimizer_bdd2678_dirty +ops = torch.ops._optimizer_bdd2678_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_optimizer_8535e80_dirty::{op_name}" \ No newline at end of file + return f"_optimizer_bdd2678_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so b/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so deleted file mode 100755 index 12a767e52b5d38684926563a9c8969fc50229dd8..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3c3282a321487a6faa532afe43bc1298731983c50e2a1acdff5480ff6e4df34e -size 1824192 diff --git a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so b/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so deleted file mode 100755 index 0c91c18132e58aac01184a7d413409e5a18935ac..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34215ecc274ef516967962c8457dad214e9bbf618bf5eee8f467371f4f620284 -size 1824184 diff --git a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so b/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so deleted file mode 100755 index 8ad904ccfaa8794b609dd947fe9dabcca1040ca9..0000000000000000000000000000000000000000 --- a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a5b49ed642e1c320da3932377033ad90031124f4ec24b2d1c95fd976ff28346c -size 1824184 diff --git a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so b/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..d0e7ceab071bf727f5e39161e742fb6f0d599e32 --- /dev/null +++ b/build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fa1dad3b3c1d94e7613a35e42afb8c7974d7bf6ce25cd2766590ba65b129f07 +size 1824184 diff --git a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py b/build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py index af6dc3feaccabb9f86fa3bbb31e4274651caa53b..1bd67e98878b841610dd6eaf377d8b29d457a42c 100755 --- a/build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py +++ b/build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py @@ -5,10 +5,12 @@ import torch import torch.distributed as dist from torch.distributed._tensor import DTensor - # TODO leave original url and consider LICENSE # This code snippet is a modified version adapted from the following GitHub repository: # https://github.com/KellerJordan/Muon/blob/master/muon.py + + +@torch.no_grad() def _zeropower_via_newtonschulz5(G, steps): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a @@ -52,6 +54,7 @@ class _muon_state: compute_event: torch.cuda.Event | None = None +@torch.no_grad() def _gather(p, state, rank, comm_stream): g = p.grad mesh = g.device_mesh @@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream): state.gather_event = None +@torch.no_grad() def _compute_u(state, steps, rank, compute_stream): with torch.cuda.stream(compute_stream): if rank == state.worker_rank: @@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream): state.compute_event = None +@torch.no_grad() def _scatter(p, state, lr, wd, rank, comm_stream): u = state.computed_u mesh = p.device_mesh diff --git a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py index 9564d5c71da17e364e08af02794a35eef592c5d6..555368ab7e9874f09a1e0672390360863f5d7211 100755 --- a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +++ b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _optimizer_8535e80_dirty -ops = torch.ops._optimizer_8535e80_dirty +from . import _optimizer_bdd2678_dirty +ops = torch.ops._optimizer_bdd2678_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_optimizer_8535e80_dirty::{op_name}" \ No newline at end of file + return f"_optimizer_bdd2678_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so deleted file mode 100755 index cf52a2d146c6a6a3f90e6171553aa70fa5a04359..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:de82486a39ded94bfe7eeaa862459944a93e284fd0d919329979bb67db3c367f -size 1787376 diff --git a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so deleted file mode 100755 index 0bb0b0de8c534a082a0403c2c03187f97327fe7b..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c23a3adbe4dc1a64b4851a9f8e4aed0e3e1eeeded27322c54f5b942282a2a332 -size 1787368 diff --git a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so deleted file mode 100755 index c2749235508977b018762b027b746c1fec58c251..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ac9027c4a93801e9f19f1e9e94a9ed33b27e92c72797053c3de55e2a6fbb41d -size 1787368 diff --git a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..88d3308654b4dc76aa797c7d0beffc1125e35e1a --- /dev/null +++ b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe5761d07ed965bf94d00d8a8e6753a7fb571271e73773de9021511e0e6ae2c7 +size 1787368 diff --git a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py index af6dc3feaccabb9f86fa3bbb31e4274651caa53b..1bd67e98878b841610dd6eaf377d8b29d457a42c 100755 --- a/build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +++ b/build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py @@ -5,10 +5,12 @@ import torch import torch.distributed as dist from torch.distributed._tensor import DTensor - # TODO leave original url and consider LICENSE # This code snippet is a modified version adapted from the following GitHub repository: # https://github.com/KellerJordan/Muon/blob/master/muon.py + + +@torch.no_grad() def _zeropower_via_newtonschulz5(G, steps): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a @@ -52,6 +54,7 @@ class _muon_state: compute_event: torch.cuda.Event | None = None +@torch.no_grad() def _gather(p, state, rank, comm_stream): g = p.grad mesh = g.device_mesh @@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream): state.gather_event = None +@torch.no_grad() def _compute_u(state, steps, rank, compute_stream): with torch.cuda.stream(compute_stream): if rank == state.worker_rank: @@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream): state.compute_event = None +@torch.no_grad() def _scatter(p, state, lr, wd, rank, comm_stream): u = state.computed_u mesh = p.device_mesh diff --git a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py index 9564d5c71da17e364e08af02794a35eef592c5d6..555368ab7e9874f09a1e0672390360863f5d7211 100755 --- a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +++ b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _optimizer_8535e80_dirty -ops = torch.ops._optimizer_8535e80_dirty +from . import _optimizer_bdd2678_dirty +ops = torch.ops._optimizer_bdd2678_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_optimizer_8535e80_dirty::{op_name}" \ No newline at end of file + return f"_optimizer_bdd2678_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so deleted file mode 100755 index fe455a191950613d486ea79c18f82b0fbbad6f3a..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cb02d3818a89c819a5a12d066ce56da0ebc4f3da491cb045ae380c5b9319e592 -size 1824256 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so deleted file mode 100755 index 4262df9177c924c5e5443a5af699a21f8b072776..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d4aa09c22745d5efe1ef0669c4ca05615f67595dc90cabeee6e878301fa9bd22 -size 1824256 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so deleted file mode 100755 index da27633d09f7236707f64a4ddc048aea95bc0ee8..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b425a7fd854402508da5af17fa88f305753a09474686d6ec7afe540b3c5c082e -size 1824256 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..ee858184e51c541e929f3918056ed87e6cd3ed4d --- /dev/null +++ b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a35c1c4d46f677f0fe35fec9023a866b9bd0f4245624b4e71a9812a1864c01e6 +size 1824256 diff --git a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py index af6dc3feaccabb9f86fa3bbb31e4274651caa53b..1bd67e98878b841610dd6eaf377d8b29d457a42c 100755 --- a/build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +++ b/build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py @@ -5,10 +5,12 @@ import torch import torch.distributed as dist from torch.distributed._tensor import DTensor - # TODO leave original url and consider LICENSE # This code snippet is a modified version adapted from the following GitHub repository: # https://github.com/KellerJordan/Muon/blob/master/muon.py + + +@torch.no_grad() def _zeropower_via_newtonschulz5(G, steps): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a @@ -52,6 +54,7 @@ class _muon_state: compute_event: torch.cuda.Event | None = None +@torch.no_grad() def _gather(p, state, rank, comm_stream): g = p.grad mesh = g.device_mesh @@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream): state.gather_event = None +@torch.no_grad() def _compute_u(state, steps, rank, compute_stream): with torch.cuda.stream(compute_stream): if rank == state.worker_rank: @@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream): state.compute_event = None +@torch.no_grad() def _scatter(p, state, lr, wd, rank, comm_stream): u = state.computed_u mesh = p.device_mesh diff --git a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py index 9564d5c71da17e364e08af02794a35eef592c5d6..555368ab7e9874f09a1e0672390360863f5d7211 100755 --- a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +++ b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _optimizer_8535e80_dirty -ops = torch.ops._optimizer_8535e80_dirty +from . import _optimizer_bdd2678_dirty +ops = torch.ops._optimizer_bdd2678_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_optimizer_8535e80_dirty::{op_name}" \ No newline at end of file + return f"_optimizer_bdd2678_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so deleted file mode 100755 index 79c295dcc99970e4a8d8e08ad782238ac619ebd6..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0f4bb02fd9fc62e272efc673aa7cb96f363e6c1d617515c93ae6708db3feaa8e -size 1883352 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so deleted file mode 100755 index 4554b8fa175dfa280343c05903edac728c863a58..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b4baf569b70749c4657062fb0f56943fc486adb0c482e50c7aa8e31ddf5cc870 -size 1883352 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so deleted file mode 100755 index c0dda129458a8c19caef554baa11de9106a6d370..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1150b2d74d708ef2f7d8a24049c49b9ba6e2d8f5d5ce5ae88a611e4d555fe659 -size 1883352 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..edd42a5f48e5af377e2ff4216d3ef5867b40fbdb --- /dev/null +++ b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b27a334f5b3c1dd922468fc93662f90cc95b4213f3f96a212e34ea8e4f3bf03 +size 1883352 diff --git a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py index af6dc3feaccabb9f86fa3bbb31e4274651caa53b..1bd67e98878b841610dd6eaf377d8b29d457a42c 100755 --- a/build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py +++ b/build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py @@ -5,10 +5,12 @@ import torch import torch.distributed as dist from torch.distributed._tensor import DTensor - # TODO leave original url and consider LICENSE # This code snippet is a modified version adapted from the following GitHub repository: # https://github.com/KellerJordan/Muon/blob/master/muon.py + + +@torch.no_grad() def _zeropower_via_newtonschulz5(G, steps): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a @@ -52,6 +54,7 @@ class _muon_state: compute_event: torch.cuda.Event | None = None +@torch.no_grad() def _gather(p, state, rank, comm_stream): g = p.grad mesh = g.device_mesh @@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream): state.gather_event = None +@torch.no_grad() def _compute_u(state, steps, rank, compute_stream): with torch.cuda.stream(compute_stream): if rank == state.worker_rank: @@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream): state.compute_event = None +@torch.no_grad() def _scatter(p, state, lr, wd, rank, comm_stream): u = state.computed_u mesh = p.device_mesh diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-310.pyc b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index ceb67b0a077b991ccc42361dfb7c2b3dabf00170..0000000000000000000000000000000000000000 Binary files a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-310.pyc b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-310.pyc deleted file mode 100644 index daa8983f731e73b9903af8821b810ed3eadcf716..0000000000000000000000000000000000000000 Binary files a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/__pycache__/muon.cpython-310.pyc and /dev/null differ diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py index 9564d5c71da17e364e08af02794a35eef592c5d6..555368ab7e9874f09a1e0672390360863f5d7211 100755 --- a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +++ b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py @@ -1,9 +1,9 @@ import torch -from . import _optimizer_8535e80_dirty -ops = torch.ops._optimizer_8535e80_dirty +from . import _optimizer_bdd2678_dirty +ops = torch.ops._optimizer_bdd2678_dirty def add_op_namespace_prefix(op_name: str): """ Prefix op by namespace. """ - return f"_optimizer_8535e80_dirty::{op_name}" \ No newline at end of file + return f"_optimizer_bdd2678_dirty::{op_name}" \ No newline at end of file diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_20250614121529.abi3.so b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_20250614121529.abi3.so deleted file mode 100755 index 52f817670f67373218bd36e98856a9192e1c59f9..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_20250614121529.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0796f06f2de4e26247141c21c1e8dafc3d268073a3eb2c8f2ef810cf588c2746 -size 1749688 diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_20250614123843.abi3.so b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_20250614123843.abi3.so deleted file mode 100755 index c3262a8b5b00ca39b8a37f1a1ca51a955d90f1f5..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_20250614123843.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:57144d037f5db2441940be53c25ff198c5b3ec11bc5edac809bb208434e8d53d -size 1749688 diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so deleted file mode 100755 index 24c19f92b157c4cb10fd6723e5e13488cee571ab..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e362d290566b6187aedf433241e853cf60f311b69e49b35d9b8f70892fbb57f6 -size 1749688 diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so deleted file mode 100755 index 41c7b238e8d7628e65ccd89813577f84c9b72bb8..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8566c9bc05e13c9394572f9f9c6bac24c31932548be485f49eb49fb249880832 -size 1749648 diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so deleted file mode 100755 index 01410e2d256e65f553c8b5522c011367253547a5..0000000000000000000000000000000000000000 --- a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c3489677edaccff3afade9c51427a89ac1edf283d092cdd3bc39e06d75c231f1 -size 1749648 diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so new file mode 100755 index 0000000000000000000000000000000000000000..8b1491a66725323f155352b7de1cb6d3424cd2b5 --- /dev/null +++ b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ae60aac17486a756b1926d38e1c20933f57444688e15ba849da3153adcf434e +size 1749648 diff --git a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py index af6dc3feaccabb9f86fa3bbb31e4274651caa53b..1bd67e98878b841610dd6eaf377d8b29d457a42c 100755 --- a/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py +++ b/build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py @@ -5,10 +5,12 @@ import torch import torch.distributed as dist from torch.distributed._tensor import DTensor - # TODO leave original url and consider LICENSE # This code snippet is a modified version adapted from the following GitHub repository: # https://github.com/KellerJordan/Muon/blob/master/muon.py + + +@torch.no_grad() def _zeropower_via_newtonschulz5(G, steps): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a @@ -52,6 +54,7 @@ class _muon_state: compute_event: torch.cuda.Event | None = None +@torch.no_grad() def _gather(p, state, rank, comm_stream): g = p.grad mesh = g.device_mesh @@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream): state.gather_event = None +@torch.no_grad() def _compute_u(state, steps, rank, compute_stream): with torch.cuda.stream(compute_stream): if rank == state.worker_rank: @@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream): state.compute_event = None +@torch.no_grad() def _scatter(p, state, lr, wd, rank, comm_stream): u = state.computed_u mesh = p.device_mesh diff --git a/docs/muon/main.tex b/docs/muon/main.tex index bd9a86c98f424fd742a0a7068e799ed37eaedc3b..41ad35f7b47a0bcfd9cc5bcc879b5fa1bf56c6f4 100644 --- a/docs/muon/main.tex +++ b/docs/muon/main.tex @@ -46,7 +46,7 @@ While a distributed version of Muon is available, it has the drawback of redunda \begin{algorithm} \caption{Parallel Muon} -\textbf{Require:} DP partitioned gradients $\mathbf{gG}$, DP partitioned Momentum $\mathbf{m}$, DP partitioned parameters $\mathbf{p}$, momentum $\mu$, local rank $\mathbf{r}$ +\textbf{Require:} DP partitioned gradient $\mathbf{g}$, DP partitioned Momentum $\mathbf{m}$, DP partitioned parameter $\mathbf{p}$, momentum $\mu$, local rank $\mathbf{r}$ \begin{algorithmic}[1] \State \texttt{// Apply momentum to $\mathbf{g}$ using local partitioned momentum $\mathbf{m}$} \State $\mathbf{g'} \gets \text{update\_with\_momentum}(\mathbf{g}, \mathbf{m}, \mu)$ @@ -125,15 +125,15 @@ To enable concurrent computation and communication, we use separate compute and Thanks to the simplicity of \texttt{torch.DTensor} and \texttt{torch.distributed}, the implementation remains straightforward and low in complexity. \section*{Evaluation} -We evaluated the performance using \href{https://huggingface.co/Motif-Technologies/Motif-2.6B}{Motif 2.6B}, achieving 151 TFLOPS per GPU during the optimizer step. +We evaluated the performance using 10B model currently in development, achieving 151 TFLOPS per GPU during the optimizer step. \begin{table}[H] \centering \begin{tabular}{@{}lllll@{}} \toprule - Model & TFLOPs for Muon & GPUs & Elapsed time & TFLOPS/GPU \\ + Model Size & TFLOPs for Muon & GPUs & Elapsed time & TFLOPS/GPU \\ \midrule - Motif 2.6B & 847.45 & 4xMI250 (8 devices) & 1.4 s & 151 \\ + 10B & 847.45 & 4xMI250 (8 devices) & 1.4 s & 151 \\ \bottomrule \end{tabular} \end{table} diff --git a/docs/muon/parallel_muon.pdf b/docs/muon/parallel_muon.pdf index 31397f765a5356ac426213ca512e5ddacbf3f524..8321c572edfae32e963a013d69187d58971fc27e 100644 --- a/docs/muon/parallel_muon.pdf +++ b/docs/muon/parallel_muon.pdf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0a27499300118f9aa736ab8bde61796640a779afcfa0e885b55b29b833a4272 -size 654653 +oid sha256:c1a88537a50ecc3db52d6e148d3513b31e2c9810c09df0da8f6aff03fa652fe5 +size 654538 diff --git a/docs/muon/parallel_muon.png b/docs/muon/parallel_muon.png deleted file mode 100644 index 002032fee3e7115c210061d6feabd98dbd7eeff8..0000000000000000000000000000000000000000 --- a/docs/muon/parallel_muon.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:89572b94f05e3cd0915e01e56f0c8ac8070a6ac53f3ac3791447daa48325f9b0 -size 126934 diff --git a/torch-ext/optimizer/muon.py b/torch-ext/optimizer/muon.py index af6dc3feaccabb9f86fa3bbb31e4274651caa53b..1bd67e98878b841610dd6eaf377d8b29d457a42c 100644 --- a/torch-ext/optimizer/muon.py +++ b/torch-ext/optimizer/muon.py @@ -5,10 +5,12 @@ import torch import torch.distributed as dist from torch.distributed._tensor import DTensor - # TODO leave original url and consider LICENSE # This code snippet is a modified version adapted from the following GitHub repository: # https://github.com/KellerJordan/Muon/blob/master/muon.py + + +@torch.no_grad() def _zeropower_via_newtonschulz5(G, steps): """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a @@ -52,6 +54,7 @@ class _muon_state: compute_event: torch.cuda.Event | None = None +@torch.no_grad() def _gather(p, state, rank, comm_stream): g = p.grad mesh = g.device_mesh @@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream): state.gather_event = None +@torch.no_grad() def _compute_u(state, steps, rank, compute_stream): with torch.cuda.stream(compute_stream): if rank == state.worker_rank: @@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream): state.compute_event = None +@torch.no_grad() def _scatter(p, state, lr, wd, rank, comm_stream): u = state.computed_u mesh = p.device_mesh