iamwyldecat commited on
Commit
febdf5b
·
1 Parent(s): bdd2678

chore(muon): clean build and update doc

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  2. build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +0 -3
  3. build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
  4. build/torch26-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_20250614125054.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
  5. build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py +6 -1
  6. build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py +3 -3
  7. build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
  8. build/torch26-cxx11-cu124-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
  9. build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py +6 -1
  10. build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  11. build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
  12. build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +0 -3
  13. build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
  14. build/{torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so → torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so} +1 -1
  15. build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py +6 -1
  16. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py +3 -3
  17. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614121529.abi3.so +0 -3
  18. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614123843.abi3.so +0 -3
  19. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
  20. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
  21. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
  22. build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py +6 -1
  23. build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  24. build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
  25. build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
  26. build/torch26-cxx98-cu118-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
  27. build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py +6 -1
  28. build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py +3 -3
  29. build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
  30. build/torch26-cxx98-cu124-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
  31. build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py +6 -1
  32. build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  33. build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
  34. build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +0 -3
  35. build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
  36. build/{torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so → torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so} +1 -1
  37. build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py +6 -1
  38. build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
  39. build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
  40. build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
  41. build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
  42. build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +6 -1
  43. build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
  44. build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +0 -3
  45. build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
  46. build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_20250614125054.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
  47. build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +6 -1
  48. build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
  49. build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +0 -3
  50. build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a46d9e65efcfa82522950d9ebf2b2b4594d9ed5abc28704352a1f7de2dae707a
3
- size 1787272
 
 
 
 
build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8325d12959ef4f31b6c6340eca29176f5077abeaa10f3a6651db55ccf3c634f
3
- size 1787272
 
 
 
 
build/torch26-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_20250614125054.abi3.so → _optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:66ca698639fff584999fe65f8f10cc4436c197829e936be2741bf53db685caa0
3
  size 1787272
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9119d3a6d99c07a17d110d2ccf6042f199d00c839f5efa74008c1642d21e48b0
3
  size 1787272
build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e89cd7d514bfe92598684ae3cfc2d35ac2d021340846e09c0b6c880c3d55bfa0
3
- size 1820136
 
 
 
 
build/torch26-cxx11-cu124-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d351a600884b7378f546a345afe65c176e1399bb42fb7dfe4333b0e90975803b
3
  size 1824224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91b76cd5be429f99840e26e8ba55b61f9fdcae19301bd7c082b2e9746a276501
3
  size 1824224
build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f5dce62d3038e879e688fffa9bbc70f3e82db20b2e7ae3ba09040e0319acb71
3
- size 1820136
 
 
 
 
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c0843f38cee494b7a5939eb62d27039d76dc3f69401d411efbacaa25cb0d67a
3
- size 1824224
 
 
 
 
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:58162f994df84868dbf62ae70e39d3c14e3390fc827f152eece83dfae7f51503
3
- size 1824224
 
 
 
 
build/{torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so → torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9cbffc2cf8039069831a57afb8e2f64fa684f1a44bec79bb4b72dbb57d9ac607
3
  size 1824224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:807d59aca5b0403206395a1f4c770b8d644294c17f6af866207c36ac617f0a7d
3
  size 1824224
build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614121529.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2f60369ba2bd0a0f84e053d857d37496137ff476dc21561f211b1fa39651990
3
- size 1749784
 
 
 
 
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614123843.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4d790535f99b7b362a966e802a547654f31749f5f28a0207493870927f1d8d2
3
- size 1749784
 
 
 
 
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b440dd9a60711a498010068e91d0ad013cd0b8ac732c16b5d1d17e5d4ec0f9b4
3
- size 1749784
 
 
 
 
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f50ea9cab62a5bd06d886516d3917e4490e65aa9addd1cbb84fc81c6f9a9d5b1
3
- size 1749744
 
 
 
 
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:acdba99ce95532a9ca6a8987a7ab61a257657872f2cc672c91e8e5fe809aa24e
3
  size 1749744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d2bdd755079fa06a27401b8a26ac425d35514d196f9df4ce1be5c52ebcc9a64
3
  size 1749744
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8f8e7d78ed9a095b882cf764fd9c80a0b0810fb961ba9e8545656fc4cb0b0d7
3
- size 1787200
 
 
 
 
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:002dab6441bcad54ab4e7c064b5806acfd45170eb33cfa059745ba6e0c349607
3
- size 1787192
 
 
 
 
build/torch26-cxx98-cu118-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7d5e76c002507f66f2a227d02c2b11aa3fdc3f07a2a0b82faaa34133adb77ef
3
  size 1787192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4ca177074d4c04630ffaa2e49e41e1451bf198c44c4cc544a664be88475a3b9
3
  size 1787192
build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab2379d932e40d10bee55f032bd16d2e4d9c1920bc5500628006f8a0eb8abd39
3
- size 1824192
 
 
 
 
build/torch26-cxx98-cu124-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:becccd250f38a84803350cfb5fac3a6682b1e594968a714642724cbc71246b4a
3
  size 1824184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3511c3a46297462166d7b773dc2bd8b16f43b7004eee1e4b31d468113051fb55
3
  size 1824184
build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c3282a321487a6faa532afe43bc1298731983c50e2a1acdff5480ff6e4df34e
3
- size 1824192
 
 
 
 
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:34215ecc274ef516967962c8457dad214e9bbf618bf5eee8f467371f4f620284
3
- size 1824184
 
 
 
 
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5b49ed642e1c320da3932377033ad90031124f4ec24b2d1c95fd976ff28346c
3
- size 1824184
 
 
 
 
build/{torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so → torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f499350bb19eca6c3da1bb72e46023834b8411ce00730854273b588b2cd9206
3
  size 1824184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fa1dad3b3c1d94e7613a35e42afb8c7974d7bf6ce25cd2766590ba65b129f07
3
  size 1824184
build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:de82486a39ded94bfe7eeaa862459944a93e284fd0d919329979bb67db3c367f
3
- size 1787376
 
 
 
 
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ac9027c4a93801e9f19f1e9e94a9ed33b27e92c72797053c3de55e2a6fbb41d
3
- size 1787368
 
 
 
 
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c23a3adbe4dc1a64b4851a9f8e4aed0e3e1eeeded27322c54f5b942282a2a332
3
  size 1787368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe5761d07ed965bf94d00d8a8e6753a7fb571271e73773de9021511e0e6ae2c7
3
  size 1787368
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4aa09c22745d5efe1ef0669c4ca05615f67595dc90cabeee6e878301fa9bd22
3
- size 1824256
 
 
 
 
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b425a7fd854402508da5af17fa88f305753a09474686d6ec7afe540b3c5c082e
3
- size 1824256
 
 
 
 
build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_20250614125054.abi3.so → _optimizer_bdd2678_dirty.abi3.so} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb02d3818a89c819a5a12d066ce56da0ebc4f3da491cb045ae380c5b9319e592
3
  size 1824256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a35c1c4d46f677f0fe35fec9023a866b9bd0f4245624b4e71a9812a1864c01e6
3
  size 1824256
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED
@@ -5,10 +5,12 @@ import torch
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
8
-
9
  # TODO leave original url and consider LICENSE
10
  # This code snippet is a modified version adapted from the following GitHub repository:
11
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
 
 
 
12
  def _zeropower_via_newtonschulz5(G, steps):
13
  """
14
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
52
  compute_event: torch.cuda.Event | None = None
53
 
54
 
 
55
  def _gather(p, state, rank, comm_stream):
56
  g = p.grad
57
  mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
82
  state.gather_event = None
83
 
84
 
 
85
  def _compute_u(state, steps, rank, compute_stream):
86
  with torch.cuda.stream(compute_stream):
87
  if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
99
  state.compute_event = None
100
 
101
 
 
102
  def _scatter(p, state, lr, wd, rank, comm_stream):
103
  u = state.computed_u
104
  mesh = p.device_mesh
 
5
  import torch.distributed as dist
6
  from torch.distributed._tensor import DTensor
7
 
 
8
  # TODO leave original url and consider LICENSE
9
  # This code snippet is a modified version adapted from the following GitHub repository:
10
  # https://github.com/KellerJordan/Muon/blob/master/muon.py
11
+
12
+
13
+ @torch.no_grad()
14
  def _zeropower_via_newtonschulz5(G, steps):
15
  """
16
  Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
 
54
  compute_event: torch.cuda.Event | None = None
55
 
56
 
57
+ @torch.no_grad()
58
  def _gather(p, state, rank, comm_stream):
59
  g = p.grad
60
  mesh = g.device_mesh
 
85
  state.gather_event = None
86
 
87
 
88
+ @torch.no_grad()
89
  def _compute_u(state, steps, rank, compute_stream):
90
  with torch.cuda.stream(compute_stream):
91
  if rank == state.worker_rank:
 
103
  state.compute_event = None
104
 
105
 
106
+ @torch.no_grad()
107
  def _scatter(p, state, lr, wd, rank, comm_stream):
108
  u = state.computed_u
109
  mesh = p.device_mesh
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED
@@ -1,9 +1,9 @@
1
  import torch
2
- from . import _optimizer_8535e80_dirty
3
- ops = torch.ops._optimizer_8535e80_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
- return f"_optimizer_8535e80_dirty::{op_name}"
 
1
  import torch
2
+ from . import _optimizer_bdd2678_dirty
3
+ ops = torch.ops._optimizer_bdd2678_dirty
4
 
5
  def add_op_namespace_prefix(op_name: str):
6
  """
7
  Prefix op by namespace.
8
  """
9
+ return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4baf569b70749c4657062fb0f56943fc486adb0c482e50c7aa8e31ddf5cc870
3
- size 1883352
 
 
 
 
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1150b2d74d708ef2f7d8a24049c49b9ba6e2d8f5d5ce5ae88a611e4d555fe659
3
- size 1883352