Commit febdf5b
Parent(s): bdd2678

chore(muon): clean build and update doc

(This view is limited to 50 files because the commit contains too many changes.)
- build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +0 -3
- build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
- build/torch26-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_20250614125054.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
- build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py +6 -1
- build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
- build/torch26-cxx11-cu124-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
- build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py +6 -1
- build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
- build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +0 -3
- build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
- build/{torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so → torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so} +1 -1
- build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py +6 -1
- build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614121529.abi3.so +0 -3
- build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614123843.abi3.so +0 -3
- build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
- build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
- build/torch26-cxx11-rocm62-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
- build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py +6 -1
- build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
- build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
- build/torch26-cxx98-cu118-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
- build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py +6 -1
- build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
- build/torch26-cxx98-cu124-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
- build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py +6 -1
- build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
- build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +0 -3
- build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
- build/{torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so → torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so} +1 -1
- build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py +6 -1
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so +0 -3
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
- build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +6 -1
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +0 -3
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_20250614125054.abi3.so → _optimizer_bdd2678_dirty.abi3.so} +1 -1
- build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +6 -1
- build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
- build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so +0 -3
- build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so +0 -3
build/torch26-cxx11-cu118-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _optimizer_bdd2678_dirty
+ops = torch.ops._optimizer_bdd2678_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_optimizer_bdd2678_dirty::{op_name}"
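The `_ops.py` rewrite above is identical across every build variant: the generated module rebinds `torch.ops` to the namespace of the freshly built native extension, which is named after the commit it was built from (`bdd2678`, with `_dirty` marking an unclean working tree). A minimal consumer-side sketch, assuming the package is importable as `optimizer` and that its native library registers a custom op; the op name `some_op` is hypothetical, for illustration only:

```python
# Hedged sketch: dispatching through the versioned op namespace.
# Assumptions: the package imports as "optimizer", and its native extension
# registered a custom op; "some_op" is a made-up name for illustration.
from optimizer import _ops

# After the change above, _ops.ops is torch.ops._optimizer_bdd2678_dirty,
# so call sites never hard-code the build hash:
# result = _ops.ops.some_op(tensor_args)

# The helper builds fully qualified names for torch.library-style lookups:
qualified = _ops.add_op_namespace_prefix("some_op")
assert qualified == "_optimizer_bdd2678_dirty::some_op"
```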
build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a46d9e65efcfa82522950d9ebf2b2b4594d9ed5abc28704352a1f7de2dae707a
-size 1787272
build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f8325d12959ef4f31b6c6340eca29176f5077abeaa10f3a6651db55ccf3c634f
-size 1787272
build/torch26-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_20250614125054.abi3.so → _optimizer_bdd2678_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9119d3a6d99c07a17d110d2ccf6042f199d00c839f5efa74008c1642d21e48b0
 size 1787272
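The `.abi3.so` entries above and below are Git LFS pointer files, which is why a binary delete or rename shows up as a three-line text diff: only the `oid` line changes, while `version` and `size` carry the rest of the pointer metadata. A small sketch of reading those fields back, with a hypothetical path:

```python
# Minimal sketch of what the three-line Git LFS pointer files contain.
from pathlib import Path


def parse_lfs_pointer(path: str) -> dict:
    """Parse a Git LFS pointer file into its key/value fields."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields


# Example (path is illustrative):
# info = parse_lfs_pointer(
#     "build/torch26-cxx11-cu118-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so"
# )
# info["oid"]  -> "sha256:9119d3a6...", info["size"] -> "1787272"
```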
build/torch26-cxx11-cu118-x86_64-linux/optimizer/muon.py
CHANGED
@@ -5,10 +5,12 @@ import torch
 import torch.distributed as dist
 from torch.distributed._tensor import DTensor
 
-
 # TODO leave original url and consider LICENSE
 # This code snippet is a modified version adapted from the following GitHub repository:
 # https://github.com/KellerJordan/Muon/blob/master/muon.py
+
+
+@torch.no_grad()
 def _zeropower_via_newtonschulz5(G, steps):
     """
     Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
 
 
+@torch.no_grad()
 def _gather(p, state, rank, comm_stream):
     g = p.grad
     mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
     state.gather_event = None
 
 
+@torch.no_grad()
 def _compute_u(state, steps, rank, compute_stream):
     with torch.cuda.stream(compute_stream):
         if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
     state.compute_event = None
 
 
+@torch.no_grad()
 def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
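The only substantive source change, repeated for each build variant below, is decorating `_zeropower_via_newtonschulz5`, `_gather`, `_compute_u`, and `_scatter` with `@torch.no_grad()`. These helpers run inside the optimizer step, reading `.grad` and mutating parameters, so recording autograd history for them would waste memory and can make in-place parameter updates illegal. A self-contained toy illustration (not the repository's actual helper):

```python
import torch


@torch.no_grad()
def apply_update(p: torch.Tensor, u: torch.Tensor, lr: float, wd: float) -> None:
    """Toy in-place update in the style of an optimizer step helper."""
    p.mul_(1 - lr * wd)  # in-place on a leaf that requires grad: legal under no_grad
    p.add_(u, alpha=-lr)


p = torch.randn(3, 3, requires_grad=True)
u = torch.randn(3, 3)
apply_update(p, u, lr=0.1, wd=0.01)
assert p.grad_fn is None  # no autograd graph was recorded
assert p.requires_grad    # the parameter still tracks grads for the next forward
```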
build/torch26-cxx11-cu124-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _optimizer_bdd2678_dirty
+ops = torch.ops._optimizer_bdd2678_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e89cd7d514bfe92598684ae3cfc2d35ac2d021340846e09c0b6c880c3d55bfa0
-size 1820136
build/torch26-cxx11-cu124-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:91b76cd5be429f99840e26e8ba55b61f9fdcae19301bd7c082b2e9746a276501
 size 1824224
build/torch26-cxx11-cu124-x86_64-linux/optimizer/muon.py
CHANGED
@@ -5,10 +5,12 @@ import torch
 import torch.distributed as dist
 from torch.distributed._tensor import DTensor
 
-
 # TODO leave original url and consider LICENSE
 # This code snippet is a modified version adapted from the following GitHub repository:
 # https://github.com/KellerJordan/Muon/blob/master/muon.py
+
+
+@torch.no_grad()
 def _zeropower_via_newtonschulz5(G, steps):
     """
     Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
 
 
+@torch.no_grad()
 def _gather(p, state, rank, comm_stream):
     g = p.grad
     mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
     state.gather_event = None
 
 
+@torch.no_grad()
 def _compute_u(state, steps, rank, compute_stream):
     with torch.cuda.stream(compute_stream):
         if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
     state.compute_event = None
 
 
+@torch.no_grad()
 def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _optimizer_bdd2678_dirty
+ops = torch.ops._optimizer_bdd2678_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7f5dce62d3038e879e688fffa9bbc70f3e82db20b2e7ae3ba09040e0319acb71
-size 1820136
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2c0843f38cee494b7a5939eb62d27039d76dc3f69401d411efbacaa25cb0d67a
-size 1824224
build/torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:58162f994df84868dbf62ae70e39d3c14e3390fc827f152eece83dfae7f51503
-size 1824224
build/{torch26-cxx11-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so → torch26-cxx11-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:807d59aca5b0403206395a1f4c770b8d644294c17f6af866207c36ac617f0a7d
 size 1824224
build/torch26-cxx11-cu126-x86_64-linux/optimizer/muon.py
CHANGED
@@ -5,10 +5,12 @@ import torch
 import torch.distributed as dist
 from torch.distributed._tensor import DTensor
 
-
 # TODO leave original url and consider LICENSE
 # This code snippet is a modified version adapted from the following GitHub repository:
 # https://github.com/KellerJordan/Muon/blob/master/muon.py
+
+
+@torch.no_grad()
 def _zeropower_via_newtonschulz5(G, steps):
     """
     Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
 
 
+@torch.no_grad()
 def _gather(p, state, rank, comm_stream):
     g = p.grad
     mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
     state.gather_event = None
 
 
+@torch.no_grad()
 def _compute_u(state, steps, rank, compute_stream):
     with torch.cuda.stream(compute_stream):
         if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
     state.compute_event = None
 
 
+@torch.no_grad()
 def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _optimizer_bdd2678_dirty
+ops = torch.ops._optimizer_bdd2678_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614121529.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d2f60369ba2bd0a0f84e053d857d37496137ff476dc21561f211b1fa39651990
-size 1749784
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614123843.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f4d790535f99b7b362a966e802a547654f31749f5f28a0207493870927f1d8d2
-size 1749784
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b440dd9a60711a498010068e91d0ad013cd0b8ac732c16b5d1d17e5d4ec0f9b4
-size 1749784
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f50ea9cab62a5bd06d886516d3917e4490e65aa9addd1cbb84fc81c6f9a9d5b1
-size 1749744
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:3d2bdd755079fa06a27401b8a26ac425d35514d196f9df4ce1be5c52ebcc9a64
 size 1749744
build/torch26-cxx11-rocm62-x86_64-linux/optimizer/muon.py
CHANGED
@@ -5,10 +5,12 @@ import torch
 import torch.distributed as dist
 from torch.distributed._tensor import DTensor
 
-
 # TODO leave original url and consider LICENSE
 # This code snippet is a modified version adapted from the following GitHub repository:
 # https://github.com/KellerJordan/Muon/blob/master/muon.py
+
+
+@torch.no_grad()
 def _zeropower_via_newtonschulz5(G, steps):
     """
     Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
 
 
+@torch.no_grad()
 def _gather(p, state, rank, comm_stream):
     g = p.grad
     mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
     state.gather_event = None
 
 
+@torch.no_grad()
 def _compute_u(state, steps, rank, compute_stream):
     with torch.cuda.stream(compute_stream):
         if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
     state.compute_event = None
 
 
+@torch.no_grad()
 def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _optimizer_bdd2678_dirty
+ops = torch.ops._optimizer_bdd2678_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a8f8e7d78ed9a095b882cf764fd9c80a0b0810fb961ba9e8545656fc4cb0b0d7
-size 1787200
build/torch26-cxx98-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:002dab6441bcad54ab4e7c064b5806acfd45170eb33cfa059745ba6e0c349607
-size 1787192
build/torch26-cxx98-cu118-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e4ca177074d4c04630ffaa2e49e41e1451bf198c44c4cc544a664be88475a3b9
 size 1787192
build/torch26-cxx98-cu118-x86_64-linux/optimizer/muon.py
CHANGED
@@ -5,10 +5,12 @@ import torch
 import torch.distributed as dist
 from torch.distributed._tensor import DTensor
 
-
 # TODO leave original url and consider LICENSE
 # This code snippet is a modified version adapted from the following GitHub repository:
 # https://github.com/KellerJordan/Muon/blob/master/muon.py
+
+
+@torch.no_grad()
 def _zeropower_via_newtonschulz5(G, steps):
     """
     Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
 
 
+@torch.no_grad()
 def _gather(p, state, rank, comm_stream):
     g = p.grad
     mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
     state.gather_event = None
 
 
+@torch.no_grad()
 def _compute_u(state, steps, rank, compute_stream):
     with torch.cuda.stream(compute_stream):
         if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
     state.compute_event = None
 
 
+@torch.no_grad()
 def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _optimizer_bdd2678_dirty
+ops = torch.ops._optimizer_bdd2678_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ab2379d932e40d10bee55f032bd16d2e4d9c1920bc5500628006f8a0eb8abd39
-size 1824192
build/torch26-cxx98-cu124-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:3511c3a46297462166d7b773dc2bd8b16f43b7004eee1e4b31d468113051fb55
 size 1824184
build/torch26-cxx98-cu124-x86_64-linux/optimizer/muon.py
CHANGED
@@ -5,10 +5,12 @@ import torch
 import torch.distributed as dist
 from torch.distributed._tensor import DTensor
 
-
 # TODO leave original url and consider LICENSE
 # This code snippet is a modified version adapted from the following GitHub repository:
 # https://github.com/KellerJordan/Muon/blob/master/muon.py
+
+
+@torch.no_grad()
 def _zeropower_via_newtonschulz5(G, steps):
     """
     Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
 
 
+@torch.no_grad()
 def _gather(p, state, rank, comm_stream):
     g = p.grad
     mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
     state.gather_event = None
 
 
+@torch.no_grad()
 def _compute_u(state, steps, rank, compute_stream):
     with torch.cuda.stream(compute_stream):
         if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
     state.compute_event = None
 
 
+@torch.no_grad()
 def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _optimizer_bdd2678_dirty
+ops = torch.ops._optimizer_bdd2678_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3c3282a321487a6faa532afe43bc1298731983c50e2a1acdff5480ff6e4df34e
-size 1824192
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:34215ecc274ef516967962c8457dad214e9bbf618bf5eee8f467371f4f620284
-size 1824184
build/torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a5b49ed642e1c320da3932377033ad90031124f4ec24b2d1c95fd976ff28346c
-size 1824184
build/{torch26-cxx98-cu124-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so → torch26-cxx98-cu126-x86_64-linux/optimizer/_optimizer_bdd2678_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2fa1dad3b3c1d94e7613a35e42afb8c7974d7bf6ce25cd2766590ba65b129f07
 size 1824184
build/torch26-cxx98-cu126-x86_64-linux/optimizer/muon.py
CHANGED
@@ -5,10 +5,12 @@ import torch
 import torch.distributed as dist
 from torch.distributed._tensor import DTensor
 
-
 # TODO leave original url and consider LICENSE
 # This code snippet is a modified version adapted from the following GitHub repository:
 # https://github.com/KellerJordan/Muon/blob/master/muon.py
+
+
+@torch.no_grad()
 def _zeropower_via_newtonschulz5(G, steps):
     """
     Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
 
 
+@torch.no_grad()
 def _gather(p, state, rank, comm_stream):
     g = p.grad
     mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
     state.gather_event = None
 
 
+@torch.no_grad()
 def _compute_u(state, steps, rank, compute_stream):
     with torch.cuda.stream(compute_stream):
         if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
     state.compute_event = None
 
 
+@torch.no_grad()
 def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _optimizer_bdd2678_dirty
+ops = torch.ops._optimizer_bdd2678_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_20250614125054.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:de82486a39ded94bfe7eeaa862459944a93e284fd0d919329979bb67db3c367f
-size 1787376
build/torch27-cxx11-cu118-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8ac9027c4a93801e9f19f1e9e94a9ed33b27e92c72797053c3de55e2a6fbb41d
-size 1787368
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_8535e80_dirty.abi3.so → _optimizer_bdd2678_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:fe5761d07ed965bf94d00d8a8e6753a7fb571271e73773de9021511e0e6ae2c7
 size 1787368
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py
CHANGED
@@ -5,10 +5,12 @@ import torch
 import torch.distributed as dist
 from torch.distributed._tensor import DTensor
 
-
 # TODO leave original url and consider LICENSE
 # This code snippet is a modified version adapted from the following GitHub repository:
 # https://github.com/KellerJordan/Muon/blob/master/muon.py
+
+
+@torch.no_grad()
 def _zeropower_via_newtonschulz5(G, steps):
     """
     Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
 
 
+@torch.no_grad()
 def _gather(p, state, rank, comm_stream):
     g = p.grad
     mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
     state.gather_event = None
 
 
+@torch.no_grad()
 def _compute_u(state, steps, rank, compute_stream):
     with torch.cuda.stream(compute_stream):
         if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
     state.compute_event = None
 
 
+@torch.no_grad()
 def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _optimizer_bdd2678_dirty
+ops = torch.ops._optimizer_bdd2678_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d4aa09c22745d5efe1ef0669c4ca05615f67595dc90cabeee6e878301fa9bd22
-size 1824256
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b425a7fd854402508da5af17fa88f305753a09474686d6ec7afe540b3c5c082e
-size 1824256
build/torch27-cxx11-cu126-x86_64-linux/optimizer/{_optimizer_20250614125054.abi3.so → _optimizer_bdd2678_dirty.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:a35c1c4d46f677f0fe35fec9023a866b9bd0f4245624b4e71a9812a1864c01e6
 size 1824256
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py
CHANGED
@@ -5,10 +5,12 @@ import torch
 import torch.distributed as dist
 from torch.distributed._tensor import DTensor
 
-
 # TODO leave original url and consider LICENSE
 # This code snippet is a modified version adapted from the following GitHub repository:
 # https://github.com/KellerJordan/Muon/blob/master/muon.py
+
+
+@torch.no_grad()
 def _zeropower_via_newtonschulz5(G, steps):
     """
     Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -52,6 +54,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
 
 
+@torch.no_grad()
 def _gather(p, state, rank, comm_stream):
     g = p.grad
     mesh = g.device_mesh
@@ -82,6 +85,7 @@ def _gather(p, state, rank, comm_stream):
     state.gather_event = None
 
 
+@torch.no_grad()
 def _compute_u(state, steps, rank, compute_stream):
     with torch.cuda.stream(compute_stream):
         if rank == state.worker_rank:
@@ -99,6 +103,7 @@ def _compute_u(state, steps, rank, compute_stream):
     state.compute_event = None
 
 
+@torch.no_grad()
 def _scatter(p, state, lr, wd, rank, comm_stream):
     u = state.computed_u
     mesh = p.device_mesh
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _optimizer_bdd2678_dirty
+ops = torch.ops._optimizer_bdd2678_dirty
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_optimizer_bdd2678_dirty::{op_name}"
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_8535e80_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b4baf569b70749c4657062fb0f56943fc486adb0c482e50c7aa8e31ddf5cc870
-size 1883352
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_b4b3752_dirty.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1150b2d74d708ef2f7d8a24049c49b9ba6e2d8f5d5ce5ae88a611e4d555fe659
-size 1883352