Add files using upload-large-folder tool
Browse files- .gitattributes +1 -0
- block_config.py +118 -0
- config.json +1485 -0
- configuration_decilm.py +65 -0
- generation_config.json +11 -0
- hf_quant_config.json +13 -0
- model-00001-of-00011.safetensors +3 -0
- model-00002-of-00011.safetensors +3 -0
- model-00003-of-00011.safetensors +3 -0
- model-00004-of-00011.safetensors +3 -0
- model-00005-of-00011.safetensors +3 -0
- model-00006-of-00011.safetensors +3 -0
- model-00007-of-00011.safetensors +3 -0
- model-00008-of-00011.safetensors +3 -0
- model-00009-of-00011.safetensors +3 -0
- model-00010-of-00011.safetensors +3 -0
- model-00011-of-00011.safetensors +3 -0
- model.safetensors.index.json +0 -0
- special_tokens_map.json +23 -0
- tokenizer.json +3 -0
- tokenizer_config.json +2065 -0
- transformers_4_44_2__configuration_llama.py +203 -0
- transformers_4_44_2__modeling_rope_utils.py +559 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
block_config.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import dataclasses
|
2 |
+
import json
|
3 |
+
import warnings
|
4 |
+
from dataclasses import dataclass, MISSING
|
5 |
+
from functools import partial
|
6 |
+
from typing import Optional, Any
|
7 |
+
|
8 |
+
|
9 |
+
@partial(dataclass, frozen=True, kw_only=True)
|
10 |
+
class JsonComparable:
|
11 |
+
def to_json(self) -> str:
|
12 |
+
return json.dumps(dataclasses.asdict(self))
|
13 |
+
|
14 |
+
def __eq__(self, other: "JsonComparable") -> bool:
|
15 |
+
return self.to_json() == other.to_json()
|
16 |
+
|
17 |
+
def __hash__(self) -> int:
|
18 |
+
return hash(self.to_json())
|
19 |
+
|
20 |
+
def __lt__(self, other: "JsonComparable") -> bool:
|
21 |
+
return self.to_json() < other.to_json()
|
22 |
+
|
23 |
+
|
24 |
+
@partial(dataclass, frozen=True, kw_only=True)
class SubblockConfig(JsonComparable):
    """Fields and validation shared by the subblock configuration classes.

    ``no_op`` and ``replace_with_linear`` are mutually exclusive.
    """

    no_op: bool = False
    replace_with_linear: bool = False
    sparsify: Optional[list[str]] = None

    def __post_init__(self):
        """Reject configurations that set both ``no_op`` and ``replace_with_linear``.

        Raises:
            AssertionError: if both flags are true.
        """
        # Raised explicitly instead of via ``assert`` so the check still runs
        # under ``python -O``; the exception type is unchanged.
        if self.no_op and self.replace_with_linear:
            raise AssertionError("no_op and replace_with_linear are mutually exclusive")

    def _force_setattr(self, name: str, value: Any) -> None:
        """
        Set an attribute even in frozen dataclasses.
        Use only inside __post_init__!
        """
        # Calls object.__setattr__ directly, bypassing the frozen dataclass's own __setattr__.
        object.__setattr__(self, name, value)
|
39 |
+
|
40 |
+
|
41 |
+
@partial(dataclass, frozen=True, kw_only=True)
class AttentionConfig(SubblockConfig):
    """Configuration of an attention subblock.

    When the subblock is ``no_op`` or ``replace_with_linear``, the fields
    ``n_heads_in_group``, ``window_length`` and ``num_sink_tokens`` are reset
    to None; otherwise ``n_heads_in_group`` is required.
    """

    n_heads_in_group: Optional[int] = None
    window_length: Optional[int] = None
    num_sink_tokens: Optional[int] = None
    use_prefill_window_in_sink_attention: bool = False
    unshifted_sink: bool = False

    def __post_init__(self):
        """Normalize irrelevant fields and validate the remaining ones.

        Raises:
            AssertionError: if ``n_heads_in_group`` is missing on an active
                attention subblock, or if the sink-attention options are
                inconsistent.
        """
        # The parent hook already rejects no_op combined with
        # replace_with_linear, so that check is not repeated here.
        super().__post_init__()

        if self.no_op or self.replace_with_linear:
            for irrelevant_att in ("n_heads_in_group", "window_length", "num_sink_tokens"):
                self._force_setattr(irrelevant_att, None)
        elif self.n_heads_in_group is None:
            # Explicit raise (not ``assert``) so validation survives ``python -O``.
            raise AssertionError(
                "n_heads_in_group is required unless no_op or replace_with_linear is set"
            )

        if self.is_sink:
            if self.unshifted_sink and self.use_prefill_window_in_sink_attention:
                raise AssertionError(
                    "Unshifted sink uses its own kind of explicit masking, not standard window. "
                    "Set use_prefill_window_in_sink_attention to False."
                )
            if self.num_sink_tokens == 0 and not self.unshifted_sink:
                raise AssertionError(
                    "Fake sink attention with 0 sink tokens is only supported with unshifted_sink=True"
                )

    @property
    def prefill_sliding_window(self) -> Optional[int]:
        """Return ``window_length`` when it applies as a prefill window, else None.

        The window applies when it is set and the config is either not a sink
        config or has ``use_prefill_window_in_sink_attention`` enabled.
        """
        if self.window_length is None:
            return None
        if self.is_sink and not self.use_prefill_window_in_sink_attention:
            return None
        return self.window_length

    @property
    def is_sliding(self) -> bool:
        """Return True when ``prefill_sliding_window`` is not None."""
        return self.prefill_sliding_window is not None

    @property
    def is_sink(self) -> bool:
        """Return True when both ``window_length`` and ``num_sink_tokens`` are set."""
        return self.window_length is not None and self.num_sink_tokens is not None
|
84 |
+
|
85 |
+
|
86 |
+
@partial(dataclass, frozen=True, kw_only=True)
class FFNConfig(SubblockConfig):
    """Configuration of a feed-forward (FFN) subblock.

    ``ffn_mult`` is reset to None when the subblock is ``no_op`` or
    ``replace_with_linear``; otherwise it is required and stored rounded to
    six decimal places.
    """

    ffn_mult: Optional[float] = None

    def __post_init__(self):
        """Normalize ``ffn_mult`` after the shared subblock validation.

        Raises:
            AssertionError: if ``ffn_mult`` is missing on an active FFN subblock.
        """
        super().__post_init__()
        if self.no_op or self.replace_with_linear:
            self._force_setattr("ffn_mult", None)
            return
        if self.ffn_mult is None:
            # Explicit raise (not ``assert``) so validation survives ``python -O``.
            raise AssertionError(
                "ffn_mult is required unless no_op or replace_with_linear is set"
            )
        self._force_setattr("ffn_mult", round(self.ffn_mult, 6))
|
97 |
+
|
98 |
+
|
99 |
+
@partial(dataclass, frozen=True, kw_only=True)
class BlockConfig(JsonComparable):
    """Groups the ``attention`` and ``ffn`` subblock configurations of a block."""

    # NOTE(review): MISSING as the default presumably marks these fields as
    # required (no default) - confirm against the dataclasses documentation.
    attention: AttentionConfig = MISSING
    ffn: FFNConfig = MISSING

    def __post_init__(self):
        """
        Build subblock dataclasses from any fields that were passed as dicts.

        Keys that are not fields of the target dataclass are dropped with a warning.
        """
        for spec in dataclasses.fields(self):
            raw = getattr(self, spec.name)
            if not isinstance(raw, dict):
                # Already a config object (or some other value): leave untouched.
                continue

            subblock_cls = spec.type
            known_names = [item.name for item in dataclasses.fields(subblock_cls)]
            unsupported_fields = [key for key in raw if key not in known_names]
            if unsupported_fields:
                warnings.warn(f"Removed unsupported fields {unsupported_fields} from {subblock_cls.__name__}")

            kept = {key: value for key, value in raw.items() if key not in unsupported_fields}
            # object.__setattr__ is needed because the dataclass is frozen.
            object.__setattr__(self, spec.name, subblock_cls(**kept))
|
config.json
ADDED
@@ -0,0 +1,1485 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "llama_nemotron_super_v1_1",
|
3 |
+
"architectures": [
|
4 |
+
"DeciLMForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": false,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"auto_map": {
|
9 |
+
"AutoConfig": "configuration_decilm.DeciLMConfig",
|
10 |
+
"AutoModelForCausalLM": "modeling_decilm.DeciLMForCausalLM"
|
11 |
+
},
|
12 |
+
"block_configs": [
|
13 |
+
{
|
14 |
+
"attention": {
|
15 |
+
"n_heads_in_group": 8,
|
16 |
+
"no_op": false,
|
17 |
+
"num_sink_tokens": null,
|
18 |
+
"replace_with_linear": false,
|
19 |
+
"sparsify": null,
|
20 |
+
"unshifted_sink": false,
|
21 |
+
"use_prefill_window_in_sink_attention": false,
|
22 |
+
"window_length": null
|
23 |
+
},
|
24 |
+
"ffn": {
|
25 |
+
"ffn_mult": 2.625,
|
26 |
+
"no_op": false,
|
27 |
+
"replace_with_linear": false,
|
28 |
+
"sparsify": null
|
29 |
+
}
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"attention": {
|
33 |
+
"n_heads_in_group": 8,
|
34 |
+
"no_op": false,
|
35 |
+
"num_sink_tokens": null,
|
36 |
+
"replace_with_linear": false,
|
37 |
+
"sparsify": null,
|
38 |
+
"unshifted_sink": false,
|
39 |
+
"use_prefill_window_in_sink_attention": false,
|
40 |
+
"window_length": null
|
41 |
+
},
|
42 |
+
"ffn": {
|
43 |
+
"ffn_mult": 5.25,
|
44 |
+
"no_op": false,
|
45 |
+
"replace_with_linear": false,
|
46 |
+
"sparsify": null
|
47 |
+
}
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"attention": {
|
51 |
+
"n_heads_in_group": 8,
|
52 |
+
"no_op": false,
|
53 |
+
"num_sink_tokens": null,
|
54 |
+
"replace_with_linear": false,
|
55 |
+
"sparsify": null,
|
56 |
+
"unshifted_sink": false,
|
57 |
+
"use_prefill_window_in_sink_attention": false,
|
58 |
+
"window_length": null
|
59 |
+
},
|
60 |
+
"ffn": {
|
61 |
+
"ffn_mult": 5.25,
|
62 |
+
"no_op": false,
|
63 |
+
"replace_with_linear": false,
|
64 |
+
"sparsify": null
|
65 |
+
}
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"attention": {
|
69 |
+
"n_heads_in_group": 8,
|
70 |
+
"no_op": false,
|
71 |
+
"num_sink_tokens": null,
|
72 |
+
"replace_with_linear": false,
|
73 |
+
"sparsify": null,
|
74 |
+
"unshifted_sink": false,
|
75 |
+
"use_prefill_window_in_sink_attention": false,
|
76 |
+
"window_length": null
|
77 |
+
},
|
78 |
+
"ffn": {
|
79 |
+
"ffn_mult": 5.25,
|
80 |
+
"no_op": false,
|
81 |
+
"replace_with_linear": false,
|
82 |
+
"sparsify": null
|
83 |
+
}
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"attention": {
|
87 |
+
"n_heads_in_group": 8,
|
88 |
+
"no_op": false,
|
89 |
+
"num_sink_tokens": null,
|
90 |
+
"replace_with_linear": false,
|
91 |
+
"sparsify": null,
|
92 |
+
"unshifted_sink": false,
|
93 |
+
"use_prefill_window_in_sink_attention": false,
|
94 |
+
"window_length": null
|
95 |
+
},
|
96 |
+
"ffn": {
|
97 |
+
"ffn_mult": 5.25,
|
98 |
+
"no_op": false,
|
99 |
+
"replace_with_linear": false,
|
100 |
+
"sparsify": null
|
101 |
+
}
|
102 |
+
},
|
103 |
+
{
|
104 |
+
"attention": {
|
105 |
+
"n_heads_in_group": 8,
|
106 |
+
"no_op": false,
|
107 |
+
"num_sink_tokens": null,
|
108 |
+
"replace_with_linear": false,
|
109 |
+
"sparsify": null,
|
110 |
+
"unshifted_sink": false,
|
111 |
+
"use_prefill_window_in_sink_attention": false,
|
112 |
+
"window_length": null
|
113 |
+
},
|
114 |
+
"ffn": {
|
115 |
+
"ffn_mult": 5.25,
|
116 |
+
"no_op": false,
|
117 |
+
"replace_with_linear": false,
|
118 |
+
"sparsify": null
|
119 |
+
}
|
120 |
+
},
|
121 |
+
{
|
122 |
+
"attention": {
|
123 |
+
"n_heads_in_group": null,
|
124 |
+
"no_op": true,
|
125 |
+
"num_sink_tokens": null,
|
126 |
+
"replace_with_linear": false,
|
127 |
+
"sparsify": null,
|
128 |
+
"unshifted_sink": false,
|
129 |
+
"use_prefill_window_in_sink_attention": false,
|
130 |
+
"window_length": null
|
131 |
+
},
|
132 |
+
"ffn": {
|
133 |
+
"ffn_mult": 2.625,
|
134 |
+
"no_op": false,
|
135 |
+
"replace_with_linear": false,
|
136 |
+
"sparsify": null
|
137 |
+
}
|
138 |
+
},
|
139 |
+
{
|
140 |
+
"attention": {
|
141 |
+
"n_heads_in_group": null,
|
142 |
+
"no_op": true,
|
143 |
+
"num_sink_tokens": null,
|
144 |
+
"replace_with_linear": false,
|
145 |
+
"sparsify": null,
|
146 |
+
"unshifted_sink": false,
|
147 |
+
"use_prefill_window_in_sink_attention": false,
|
148 |
+
"window_length": null
|
149 |
+
},
|
150 |
+
"ffn": {
|
151 |
+
"ffn_mult": 2.625,
|
152 |
+
"no_op": false,
|
153 |
+
"replace_with_linear": false,
|
154 |
+
"sparsify": null
|
155 |
+
}
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"attention": {
|
159 |
+
"n_heads_in_group": 8,
|
160 |
+
"no_op": false,
|
161 |
+
"num_sink_tokens": null,
|
162 |
+
"replace_with_linear": false,
|
163 |
+
"sparsify": null,
|
164 |
+
"unshifted_sink": false,
|
165 |
+
"use_prefill_window_in_sink_attention": false,
|
166 |
+
"window_length": null
|
167 |
+
},
|
168 |
+
"ffn": {
|
169 |
+
"ffn_mult": 5.25,
|
170 |
+
"no_op": false,
|
171 |
+
"replace_with_linear": false,
|
172 |
+
"sparsify": null
|
173 |
+
}
|
174 |
+
},
|
175 |
+
{
|
176 |
+
"attention": {
|
177 |
+
"n_heads_in_group": 8,
|
178 |
+
"no_op": false,
|
179 |
+
"num_sink_tokens": null,
|
180 |
+
"replace_with_linear": false,
|
181 |
+
"sparsify": null,
|
182 |
+
"unshifted_sink": false,
|
183 |
+
"use_prefill_window_in_sink_attention": false,
|
184 |
+
"window_length": null
|
185 |
+
},
|
186 |
+
"ffn": {
|
187 |
+
"ffn_mult": 5.25,
|
188 |
+
"no_op": false,
|
189 |
+
"replace_with_linear": false,
|
190 |
+
"sparsify": null
|
191 |
+
}
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"attention": {
|
195 |
+
"n_heads_in_group": 8,
|
196 |
+
"no_op": false,
|
197 |
+
"num_sink_tokens": null,
|
198 |
+
"replace_with_linear": false,
|
199 |
+
"sparsify": null,
|
200 |
+
"unshifted_sink": false,
|
201 |
+
"use_prefill_window_in_sink_attention": false,
|
202 |
+
"window_length": null
|
203 |
+
},
|
204 |
+
"ffn": {
|
205 |
+
"ffn_mult": 5.25,
|
206 |
+
"no_op": false,
|
207 |
+
"replace_with_linear": false,
|
208 |
+
"sparsify": null
|
209 |
+
}
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"attention": {
|
213 |
+
"n_heads_in_group": null,
|
214 |
+
"no_op": true,
|
215 |
+
"num_sink_tokens": null,
|
216 |
+
"replace_with_linear": false,
|
217 |
+
"sparsify": null,
|
218 |
+
"unshifted_sink": false,
|
219 |
+
"use_prefill_window_in_sink_attention": false,
|
220 |
+
"window_length": null
|
221 |
+
},
|
222 |
+
"ffn": {
|
223 |
+
"ffn_mult": 3.28125,
|
224 |
+
"no_op": false,
|
225 |
+
"replace_with_linear": false,
|
226 |
+
"sparsify": null
|
227 |
+
}
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"attention": {
|
231 |
+
"n_heads_in_group": 8,
|
232 |
+
"no_op": false,
|
233 |
+
"num_sink_tokens": null,
|
234 |
+
"replace_with_linear": false,
|
235 |
+
"sparsify": null,
|
236 |
+
"unshifted_sink": false,
|
237 |
+
"use_prefill_window_in_sink_attention": false,
|
238 |
+
"window_length": null
|
239 |
+
},
|
240 |
+
"ffn": {
|
241 |
+
"ffn_mult": 5.25,
|
242 |
+
"no_op": false,
|
243 |
+
"replace_with_linear": false,
|
244 |
+
"sparsify": null
|
245 |
+
}
|
246 |
+
},
|
247 |
+
{
|
248 |
+
"attention": {
|
249 |
+
"n_heads_in_group": 8,
|
250 |
+
"no_op": false,
|
251 |
+
"num_sink_tokens": null,
|
252 |
+
"replace_with_linear": false,
|
253 |
+
"sparsify": null,
|
254 |
+
"unshifted_sink": false,
|
255 |
+
"use_prefill_window_in_sink_attention": false,
|
256 |
+
"window_length": null
|
257 |
+
},
|
258 |
+
"ffn": {
|
259 |
+
"ffn_mult": 5.25,
|
260 |
+
"no_op": false,
|
261 |
+
"replace_with_linear": false,
|
262 |
+
"sparsify": null
|
263 |
+
}
|
264 |
+
},
|
265 |
+
{
|
266 |
+
"attention": {
|
267 |
+
"n_heads_in_group": 8,
|
268 |
+
"no_op": false,
|
269 |
+
"num_sink_tokens": null,
|
270 |
+
"replace_with_linear": false,
|
271 |
+
"sparsify": null,
|
272 |
+
"unshifted_sink": false,
|
273 |
+
"use_prefill_window_in_sink_attention": false,
|
274 |
+
"window_length": null
|
275 |
+
},
|
276 |
+
"ffn": {
|
277 |
+
"ffn_mult": 5.25,
|
278 |
+
"no_op": false,
|
279 |
+
"replace_with_linear": false,
|
280 |
+
"sparsify": null
|
281 |
+
}
|
282 |
+
},
|
283 |
+
{
|
284 |
+
"attention": {
|
285 |
+
"n_heads_in_group": 8,
|
286 |
+
"no_op": false,
|
287 |
+
"num_sink_tokens": null,
|
288 |
+
"replace_with_linear": false,
|
289 |
+
"sparsify": null,
|
290 |
+
"unshifted_sink": false,
|
291 |
+
"use_prefill_window_in_sink_attention": false,
|
292 |
+
"window_length": null
|
293 |
+
},
|
294 |
+
"ffn": {
|
295 |
+
"ffn_mult": 5.25,
|
296 |
+
"no_op": false,
|
297 |
+
"replace_with_linear": false,
|
298 |
+
"sparsify": null
|
299 |
+
}
|
300 |
+
},
|
301 |
+
{
|
302 |
+
"attention": {
|
303 |
+
"n_heads_in_group": 8,
|
304 |
+
"no_op": false,
|
305 |
+
"num_sink_tokens": null,
|
306 |
+
"replace_with_linear": false,
|
307 |
+
"sparsify": null,
|
308 |
+
"unshifted_sink": false,
|
309 |
+
"use_prefill_window_in_sink_attention": false,
|
310 |
+
"window_length": null
|
311 |
+
},
|
312 |
+
"ffn": {
|
313 |
+
"ffn_mult": 5.25,
|
314 |
+
"no_op": false,
|
315 |
+
"replace_with_linear": false,
|
316 |
+
"sparsify": null
|
317 |
+
}
|
318 |
+
},
|
319 |
+
{
|
320 |
+
"attention": {
|
321 |
+
"n_heads_in_group": 8,
|
322 |
+
"no_op": false,
|
323 |
+
"num_sink_tokens": null,
|
324 |
+
"replace_with_linear": false,
|
325 |
+
"sparsify": null,
|
326 |
+
"unshifted_sink": false,
|
327 |
+
"use_prefill_window_in_sink_attention": false,
|
328 |
+
"window_length": null
|
329 |
+
},
|
330 |
+
"ffn": {
|
331 |
+
"ffn_mult": 5.25,
|
332 |
+
"no_op": false,
|
333 |
+
"replace_with_linear": false,
|
334 |
+
"sparsify": null
|
335 |
+
}
|
336 |
+
},
|
337 |
+
{
|
338 |
+
"attention": {
|
339 |
+
"n_heads_in_group": 8,
|
340 |
+
"no_op": false,
|
341 |
+
"num_sink_tokens": null,
|
342 |
+
"replace_with_linear": false,
|
343 |
+
"sparsify": null,
|
344 |
+
"unshifted_sink": false,
|
345 |
+
"use_prefill_window_in_sink_attention": false,
|
346 |
+
"window_length": null
|
347 |
+
},
|
348 |
+
"ffn": {
|
349 |
+
"ffn_mult": 5.25,
|
350 |
+
"no_op": false,
|
351 |
+
"replace_with_linear": false,
|
352 |
+
"sparsify": null
|
353 |
+
}
|
354 |
+
},
|
355 |
+
{
|
356 |
+
"attention": {
|
357 |
+
"n_heads_in_group": 8,
|
358 |
+
"no_op": false,
|
359 |
+
"num_sink_tokens": null,
|
360 |
+
"replace_with_linear": false,
|
361 |
+
"sparsify": null,
|
362 |
+
"unshifted_sink": false,
|
363 |
+
"use_prefill_window_in_sink_attention": false,
|
364 |
+
"window_length": null
|
365 |
+
},
|
366 |
+
"ffn": {
|
367 |
+
"ffn_mult": 5.25,
|
368 |
+
"no_op": false,
|
369 |
+
"replace_with_linear": false,
|
370 |
+
"sparsify": null
|
371 |
+
}
|
372 |
+
},
|
373 |
+
{
|
374 |
+
"attention": {
|
375 |
+
"n_heads_in_group": 8,
|
376 |
+
"no_op": false,
|
377 |
+
"num_sink_tokens": null,
|
378 |
+
"replace_with_linear": false,
|
379 |
+
"sparsify": null,
|
380 |
+
"unshifted_sink": false,
|
381 |
+
"use_prefill_window_in_sink_attention": false,
|
382 |
+
"window_length": null
|
383 |
+
},
|
384 |
+
"ffn": {
|
385 |
+
"ffn_mult": 5.25,
|
386 |
+
"no_op": false,
|
387 |
+
"replace_with_linear": false,
|
388 |
+
"sparsify": null
|
389 |
+
}
|
390 |
+
},
|
391 |
+
{
|
392 |
+
"attention": {
|
393 |
+
"n_heads_in_group": 8,
|
394 |
+
"no_op": false,
|
395 |
+
"num_sink_tokens": null,
|
396 |
+
"replace_with_linear": false,
|
397 |
+
"sparsify": null,
|
398 |
+
"unshifted_sink": false,
|
399 |
+
"use_prefill_window_in_sink_attention": false,
|
400 |
+
"window_length": null
|
401 |
+
},
|
402 |
+
"ffn": {
|
403 |
+
"ffn_mult": 5.25,
|
404 |
+
"no_op": false,
|
405 |
+
"replace_with_linear": false,
|
406 |
+
"sparsify": null
|
407 |
+
}
|
408 |
+
},
|
409 |
+
{
|
410 |
+
"attention": {
|
411 |
+
"n_heads_in_group": 8,
|
412 |
+
"no_op": false,
|
413 |
+
"num_sink_tokens": null,
|
414 |
+
"replace_with_linear": false,
|
415 |
+
"sparsify": null,
|
416 |
+
"unshifted_sink": false,
|
417 |
+
"use_prefill_window_in_sink_attention": false,
|
418 |
+
"window_length": null
|
419 |
+
},
|
420 |
+
"ffn": {
|
421 |
+
"ffn_mult": 5.25,
|
422 |
+
"no_op": false,
|
423 |
+
"replace_with_linear": false,
|
424 |
+
"sparsify": null
|
425 |
+
}
|
426 |
+
},
|
427 |
+
{
|
428 |
+
"attention": {
|
429 |
+
"n_heads_in_group": 8,
|
430 |
+
"no_op": false,
|
431 |
+
"num_sink_tokens": null,
|
432 |
+
"replace_with_linear": false,
|
433 |
+
"sparsify": null,
|
434 |
+
"unshifted_sink": false,
|
435 |
+
"use_prefill_window_in_sink_attention": false,
|
436 |
+
"window_length": null
|
437 |
+
},
|
438 |
+
"ffn": {
|
439 |
+
"ffn_mult": 5.25,
|
440 |
+
"no_op": false,
|
441 |
+
"replace_with_linear": false,
|
442 |
+
"sparsify": null
|
443 |
+
}
|
444 |
+
},
|
445 |
+
{
|
446 |
+
"attention": {
|
447 |
+
"n_heads_in_group": 8,
|
448 |
+
"no_op": false,
|
449 |
+
"num_sink_tokens": null,
|
450 |
+
"replace_with_linear": false,
|
451 |
+
"sparsify": null,
|
452 |
+
"unshifted_sink": false,
|
453 |
+
"use_prefill_window_in_sink_attention": false,
|
454 |
+
"window_length": null
|
455 |
+
},
|
456 |
+
"ffn": {
|
457 |
+
"ffn_mult": 5.25,
|
458 |
+
"no_op": false,
|
459 |
+
"replace_with_linear": false,
|
460 |
+
"sparsify": null
|
461 |
+
}
|
462 |
+
},
|
463 |
+
{
|
464 |
+
"attention": {
|
465 |
+
"n_heads_in_group": 8,
|
466 |
+
"no_op": false,
|
467 |
+
"num_sink_tokens": null,
|
468 |
+
"replace_with_linear": false,
|
469 |
+
"sparsify": null,
|
470 |
+
"unshifted_sink": false,
|
471 |
+
"use_prefill_window_in_sink_attention": false,
|
472 |
+
"window_length": null
|
473 |
+
},
|
474 |
+
"ffn": {
|
475 |
+
"ffn_mult": 5.25,
|
476 |
+
"no_op": false,
|
477 |
+
"replace_with_linear": false,
|
478 |
+
"sparsify": null
|
479 |
+
}
|
480 |
+
},
|
481 |
+
{
|
482 |
+
"attention": {
|
483 |
+
"n_heads_in_group": 8,
|
484 |
+
"no_op": false,
|
485 |
+
"num_sink_tokens": null,
|
486 |
+
"replace_with_linear": false,
|
487 |
+
"sparsify": null,
|
488 |
+
"unshifted_sink": false,
|
489 |
+
"use_prefill_window_in_sink_attention": false,
|
490 |
+
"window_length": null
|
491 |
+
},
|
492 |
+
"ffn": {
|
493 |
+
"ffn_mult": 5.25,
|
494 |
+
"no_op": false,
|
495 |
+
"replace_with_linear": false,
|
496 |
+
"sparsify": null
|
497 |
+
}
|
498 |
+
},
|
499 |
+
{
|
500 |
+
"attention": {
|
501 |
+
"n_heads_in_group": 8,
|
502 |
+
"no_op": false,
|
503 |
+
"num_sink_tokens": null,
|
504 |
+
"replace_with_linear": false,
|
505 |
+
"sparsify": null,
|
506 |
+
"unshifted_sink": false,
|
507 |
+
"use_prefill_window_in_sink_attention": false,
|
508 |
+
"window_length": null
|
509 |
+
},
|
510 |
+
"ffn": {
|
511 |
+
"ffn_mult": 5.25,
|
512 |
+
"no_op": false,
|
513 |
+
"replace_with_linear": false,
|
514 |
+
"sparsify": null
|
515 |
+
}
|
516 |
+
},
|
517 |
+
{
|
518 |
+
"attention": {
|
519 |
+
"n_heads_in_group": 8,
|
520 |
+
"no_op": false,
|
521 |
+
"num_sink_tokens": null,
|
522 |
+
"replace_with_linear": false,
|
523 |
+
"sparsify": null,
|
524 |
+
"unshifted_sink": false,
|
525 |
+
"use_prefill_window_in_sink_attention": false,
|
526 |
+
"window_length": null
|
527 |
+
},
|
528 |
+
"ffn": {
|
529 |
+
"ffn_mult": 5.25,
|
530 |
+
"no_op": false,
|
531 |
+
"replace_with_linear": false,
|
532 |
+
"sparsify": null
|
533 |
+
}
|
534 |
+
},
|
535 |
+
{
|
536 |
+
"attention": {
|
537 |
+
"n_heads_in_group": 8,
|
538 |
+
"no_op": false,
|
539 |
+
"num_sink_tokens": null,
|
540 |
+
"replace_with_linear": false,
|
541 |
+
"sparsify": null,
|
542 |
+
"unshifted_sink": false,
|
543 |
+
"use_prefill_window_in_sink_attention": false,
|
544 |
+
"window_length": null
|
545 |
+
},
|
546 |
+
"ffn": {
|
547 |
+
"ffn_mult": 5.25,
|
548 |
+
"no_op": false,
|
549 |
+
"replace_with_linear": false,
|
550 |
+
"sparsify": null
|
551 |
+
}
|
552 |
+
},
|
553 |
+
{
|
554 |
+
"attention": {
|
555 |
+
"n_heads_in_group": 8,
|
556 |
+
"no_op": false,
|
557 |
+
"num_sink_tokens": null,
|
558 |
+
"replace_with_linear": false,
|
559 |
+
"sparsify": null,
|
560 |
+
"unshifted_sink": false,
|
561 |
+
"use_prefill_window_in_sink_attention": false,
|
562 |
+
"window_length": null
|
563 |
+
},
|
564 |
+
"ffn": {
|
565 |
+
"ffn_mult": 5.25,
|
566 |
+
"no_op": false,
|
567 |
+
"replace_with_linear": false,
|
568 |
+
"sparsify": null
|
569 |
+
}
|
570 |
+
},
|
571 |
+
{
|
572 |
+
"attention": {
|
573 |
+
"n_heads_in_group": 8,
|
574 |
+
"no_op": false,
|
575 |
+
"num_sink_tokens": null,
|
576 |
+
"replace_with_linear": false,
|
577 |
+
"sparsify": null,
|
578 |
+
"unshifted_sink": false,
|
579 |
+
"use_prefill_window_in_sink_attention": false,
|
580 |
+
"window_length": null
|
581 |
+
},
|
582 |
+
"ffn": {
|
583 |
+
"ffn_mult": 5.25,
|
584 |
+
"no_op": false,
|
585 |
+
"replace_with_linear": false,
|
586 |
+
"sparsify": null
|
587 |
+
}
|
588 |
+
},
|
589 |
+
{
|
590 |
+
"attention": {
|
591 |
+
"n_heads_in_group": 8,
|
592 |
+
"no_op": false,
|
593 |
+
"num_sink_tokens": null,
|
594 |
+
"replace_with_linear": false,
|
595 |
+
"sparsify": null,
|
596 |
+
"unshifted_sink": false,
|
597 |
+
"use_prefill_window_in_sink_attention": false,
|
598 |
+
"window_length": null
|
599 |
+
},
|
600 |
+
"ffn": {
|
601 |
+
"ffn_mult": 5.25,
|
602 |
+
"no_op": false,
|
603 |
+
"replace_with_linear": false,
|
604 |
+
"sparsify": null
|
605 |
+
}
|
606 |
+
},
|
607 |
+
{
|
608 |
+
"attention": {
|
609 |
+
"n_heads_in_group": 8,
|
610 |
+
"no_op": false,
|
611 |
+
"num_sink_tokens": null,
|
612 |
+
"replace_with_linear": false,
|
613 |
+
"sparsify": null,
|
614 |
+
"unshifted_sink": false,
|
615 |
+
"use_prefill_window_in_sink_attention": false,
|
616 |
+
"window_length": null
|
617 |
+
},
|
618 |
+
"ffn": {
|
619 |
+
"ffn_mult": 5.25,
|
620 |
+
"no_op": false,
|
621 |
+
"replace_with_linear": false,
|
622 |
+
"sparsify": null
|
623 |
+
}
|
624 |
+
},
|
625 |
+
{
|
626 |
+
"attention": {
|
627 |
+
"n_heads_in_group": 8,
|
628 |
+
"no_op": false,
|
629 |
+
"num_sink_tokens": null,
|
630 |
+
"replace_with_linear": false,
|
631 |
+
"sparsify": null,
|
632 |
+
"unshifted_sink": false,
|
633 |
+
"use_prefill_window_in_sink_attention": false,
|
634 |
+
"window_length": null
|
635 |
+
},
|
636 |
+
"ffn": {
|
637 |
+
"ffn_mult": 5.25,
|
638 |
+
"no_op": false,
|
639 |
+
"replace_with_linear": false,
|
640 |
+
"sparsify": null
|
641 |
+
}
|
642 |
+
},
|
643 |
+
{
|
644 |
+
"attention": {
|
645 |
+
"n_heads_in_group": 8,
|
646 |
+
"no_op": false,
|
647 |
+
"num_sink_tokens": null,
|
648 |
+
"replace_with_linear": false,
|
649 |
+
"sparsify": null,
|
650 |
+
"unshifted_sink": false,
|
651 |
+
"use_prefill_window_in_sink_attention": false,
|
652 |
+
"window_length": null
|
653 |
+
},
|
654 |
+
"ffn": {
|
655 |
+
"ffn_mult": 5.25,
|
656 |
+
"no_op": false,
|
657 |
+
"replace_with_linear": false,
|
658 |
+
"sparsify": null
|
659 |
+
}
|
660 |
+
},
|
661 |
+
{
|
662 |
+
"attention": {
|
663 |
+
"n_heads_in_group": 8,
|
664 |
+
"no_op": false,
|
665 |
+
"num_sink_tokens": null,
|
666 |
+
"replace_with_linear": false,
|
667 |
+
"sparsify": null,
|
668 |
+
"unshifted_sink": false,
|
669 |
+
"use_prefill_window_in_sink_attention": false,
|
670 |
+
"window_length": null
|
671 |
+
},
|
672 |
+
"ffn": {
|
673 |
+
"ffn_mult": 5.25,
|
674 |
+
"no_op": false,
|
675 |
+
"replace_with_linear": false,
|
676 |
+
"sparsify": null
|
677 |
+
}
|
678 |
+
},
|
679 |
+
{
|
680 |
+
"attention": {
|
681 |
+
"n_heads_in_group": 8,
|
682 |
+
"no_op": false,
|
683 |
+
"num_sink_tokens": null,
|
684 |
+
"replace_with_linear": false,
|
685 |
+
"sparsify": null,
|
686 |
+
"unshifted_sink": false,
|
687 |
+
"use_prefill_window_in_sink_attention": false,
|
688 |
+
"window_length": null
|
689 |
+
},
|
690 |
+
"ffn": {
|
691 |
+
"ffn_mult": 5.25,
|
692 |
+
"no_op": false,
|
693 |
+
"replace_with_linear": false,
|
694 |
+
"sparsify": null
|
695 |
+
}
|
696 |
+
},
|
697 |
+
{
|
698 |
+
"attention": {
|
699 |
+
"n_heads_in_group": 8,
|
700 |
+
"no_op": false,
|
701 |
+
"num_sink_tokens": null,
|
702 |
+
"replace_with_linear": false,
|
703 |
+
"sparsify": null,
|
704 |
+
"unshifted_sink": false,
|
705 |
+
"use_prefill_window_in_sink_attention": false,
|
706 |
+
"window_length": null
|
707 |
+
},
|
708 |
+
"ffn": {
|
709 |
+
"ffn_mult": 5.25,
|
710 |
+
"no_op": false,
|
711 |
+
"replace_with_linear": false,
|
712 |
+
"sparsify": null
|
713 |
+
}
|
714 |
+
},
|
715 |
+
{
|
716 |
+
"attention": {
|
717 |
+
"n_heads_in_group": 8,
|
718 |
+
"no_op": false,
|
719 |
+
"num_sink_tokens": null,
|
720 |
+
"replace_with_linear": false,
|
721 |
+
"sparsify": null,
|
722 |
+
"unshifted_sink": false,
|
723 |
+
"use_prefill_window_in_sink_attention": false,
|
724 |
+
"window_length": null
|
725 |
+
},
|
726 |
+
"ffn": {
|
727 |
+
"ffn_mult": 5.25,
|
728 |
+
"no_op": false,
|
729 |
+
"replace_with_linear": false,
|
730 |
+
"sparsify": null
|
731 |
+
}
|
732 |
+
},
|
733 |
+
{
|
734 |
+
"attention": {
|
735 |
+
"n_heads_in_group": 8,
|
736 |
+
"no_op": false,
|
737 |
+
"num_sink_tokens": null,
|
738 |
+
"replace_with_linear": false,
|
739 |
+
"sparsify": null,
|
740 |
+
"unshifted_sink": false,
|
741 |
+
"use_prefill_window_in_sink_attention": false,
|
742 |
+
"window_length": null
|
743 |
+
},
|
744 |
+
"ffn": {
|
745 |
+
"ffn_mult": 5.25,
|
746 |
+
"no_op": false,
|
747 |
+
"replace_with_linear": false,
|
748 |
+
"sparsify": null
|
749 |
+
}
|
750 |
+
},
|
751 |
+
{
|
752 |
+
"attention": {
|
753 |
+
"n_heads_in_group": 8,
|
754 |
+
"no_op": false,
|
755 |
+
"num_sink_tokens": null,
|
756 |
+
"replace_with_linear": false,
|
757 |
+
"sparsify": null,
|
758 |
+
"unshifted_sink": false,
|
759 |
+
"use_prefill_window_in_sink_attention": false,
|
760 |
+
"window_length": null
|
761 |
+
},
|
762 |
+
"ffn": {
|
763 |
+
"ffn_mult": 5.25,
|
764 |
+
"no_op": false,
|
765 |
+
"replace_with_linear": false,
|
766 |
+
"sparsify": null
|
767 |
+
}
|
768 |
+
},
|
769 |
+
{
|
770 |
+
"attention": {
|
771 |
+
"n_heads_in_group": null,
|
772 |
+
"no_op": true,
|
773 |
+
"num_sink_tokens": null,
|
774 |
+
"replace_with_linear": false,
|
775 |
+
"sparsify": null,
|
776 |
+
"unshifted_sink": false,
|
777 |
+
"use_prefill_window_in_sink_attention": false,
|
778 |
+
"window_length": null
|
779 |
+
},
|
780 |
+
"ffn": {
|
781 |
+
"ffn_mult": 1.3125,
|
782 |
+
"no_op": false,
|
783 |
+
"replace_with_linear": false,
|
784 |
+
"sparsify": null
|
785 |
+
}
|
786 |
+
},
|
787 |
+
{
|
788 |
+
"attention": {
|
789 |
+
"n_heads_in_group": null,
|
790 |
+
"no_op": true,
|
791 |
+
"num_sink_tokens": null,
|
792 |
+
"replace_with_linear": false,
|
793 |
+
"sparsify": null,
|
794 |
+
"unshifted_sink": false,
|
795 |
+
"use_prefill_window_in_sink_attention": false,
|
796 |
+
"window_length": null
|
797 |
+
},
|
798 |
+
"ffn": {
|
799 |
+
"ffn_mult": 2.625,
|
800 |
+
"no_op": false,
|
801 |
+
"replace_with_linear": false,
|
802 |
+
"sparsify": null
|
803 |
+
}
|
804 |
+
},
|
805 |
+
{
|
806 |
+
"attention": {
|
807 |
+
"n_heads_in_group": null,
|
808 |
+
"no_op": true,
|
809 |
+
"num_sink_tokens": null,
|
810 |
+
"replace_with_linear": false,
|
811 |
+
"sparsify": null,
|
812 |
+
"unshifted_sink": false,
|
813 |
+
"use_prefill_window_in_sink_attention": false,
|
814 |
+
"window_length": null
|
815 |
+
},
|
816 |
+
"ffn": {
|
817 |
+
"ffn_mult": 2.625,
|
818 |
+
"no_op": false,
|
819 |
+
"replace_with_linear": false,
|
820 |
+
"sparsify": null
|
821 |
+
}
|
822 |
+
},
|
823 |
+
{
|
824 |
+
"attention": {
|
825 |
+
"n_heads_in_group": null,
|
826 |
+
"no_op": true,
|
827 |
+
"num_sink_tokens": null,
|
828 |
+
"replace_with_linear": false,
|
829 |
+
"sparsify": null,
|
830 |
+
"unshifted_sink": false,
|
831 |
+
"use_prefill_window_in_sink_attention": false,
|
832 |
+
"window_length": null
|
833 |
+
},
|
834 |
+
"ffn": {
|
835 |
+
"ffn_mult": 1.3125,
|
836 |
+
"no_op": false,
|
837 |
+
"replace_with_linear": false,
|
838 |
+
"sparsify": null
|
839 |
+
}
|
840 |
+
},
|
841 |
+
{
|
842 |
+
"attention": {
|
843 |
+
"n_heads_in_group": null,
|
844 |
+
"no_op": true,
|
845 |
+
"num_sink_tokens": null,
|
846 |
+
"replace_with_linear": false,
|
847 |
+
"sparsify": null,
|
848 |
+
"unshifted_sink": false,
|
849 |
+
"use_prefill_window_in_sink_attention": false,
|
850 |
+
"window_length": null
|
851 |
+
},
|
852 |
+
"ffn": {
|
853 |
+
"ffn_mult": 5.25,
|
854 |
+
"no_op": false,
|
855 |
+
"replace_with_linear": false,
|
856 |
+
"sparsify": null
|
857 |
+
}
|
858 |
+
},
|
859 |
+
{
|
860 |
+
"attention": {
|
861 |
+
"n_heads_in_group": null,
|
862 |
+
"no_op": true,
|
863 |
+
"num_sink_tokens": null,
|
864 |
+
"replace_with_linear": false,
|
865 |
+
"sparsify": null,
|
866 |
+
"unshifted_sink": false,
|
867 |
+
"use_prefill_window_in_sink_attention": false,
|
868 |
+
"window_length": null
|
869 |
+
},
|
870 |
+
"ffn": {
|
871 |
+
"ffn_mult": 1.3125,
|
872 |
+
"no_op": false,
|
873 |
+
"replace_with_linear": false,
|
874 |
+
"sparsify": null
|
875 |
+
}
|
876 |
+
},
|
877 |
+
{
|
878 |
+
"attention": {
|
879 |
+
"n_heads_in_group": null,
|
880 |
+
"no_op": true,
|
881 |
+
"num_sink_tokens": null,
|
882 |
+
"replace_with_linear": false,
|
883 |
+
"sparsify": null,
|
884 |
+
"unshifted_sink": false,
|
885 |
+
"use_prefill_window_in_sink_attention": false,
|
886 |
+
"window_length": null
|
887 |
+
},
|
888 |
+
"ffn": {
|
889 |
+
"ffn_mult": 2.625,
|
890 |
+
"no_op": false,
|
891 |
+
"replace_with_linear": false,
|
892 |
+
"sparsify": null
|
893 |
+
}
|
894 |
+
},
|
895 |
+
{
|
896 |
+
"attention": {
|
897 |
+
"n_heads_in_group": null,
|
898 |
+
"no_op": true,
|
899 |
+
"num_sink_tokens": null,
|
900 |
+
"replace_with_linear": false,
|
901 |
+
"sparsify": null,
|
902 |
+
"unshifted_sink": false,
|
903 |
+
"use_prefill_window_in_sink_attention": false,
|
904 |
+
"window_length": null
|
905 |
+
},
|
906 |
+
"ffn": {
|
907 |
+
"ffn_mult": 1.3125,
|
908 |
+
"no_op": false,
|
909 |
+
"replace_with_linear": false,
|
910 |
+
"sparsify": null
|
911 |
+
}
|
912 |
+
},
|
913 |
+
{
|
914 |
+
"attention": {
|
915 |
+
"n_heads_in_group": null,
|
916 |
+
"no_op": true,
|
917 |
+
"num_sink_tokens": null,
|
918 |
+
"replace_with_linear": false,
|
919 |
+
"sparsify": null,
|
920 |
+
"unshifted_sink": false,
|
921 |
+
"use_prefill_window_in_sink_attention": false,
|
922 |
+
"window_length": null
|
923 |
+
},
|
924 |
+
"ffn": {
|
925 |
+
"ffn_mult": 1.3125,
|
926 |
+
"no_op": false,
|
927 |
+
"replace_with_linear": false,
|
928 |
+
"sparsify": null
|
929 |
+
}
|
930 |
+
},
|
931 |
+
{
|
932 |
+
"attention": {
|
933 |
+
"n_heads_in_group": null,
|
934 |
+
"no_op": true,
|
935 |
+
"num_sink_tokens": null,
|
936 |
+
"replace_with_linear": false,
|
937 |
+
"sparsify": null,
|
938 |
+
"unshifted_sink": false,
|
939 |
+
"use_prefill_window_in_sink_attention": false,
|
940 |
+
"window_length": null
|
941 |
+
},
|
942 |
+
"ffn": {
|
943 |
+
"ffn_mult": 1.3125,
|
944 |
+
"no_op": false,
|
945 |
+
"replace_with_linear": false,
|
946 |
+
"sparsify": null
|
947 |
+
}
|
948 |
+
},
|
949 |
+
{
|
950 |
+
"attention": {
|
951 |
+
"n_heads_in_group": 8,
|
952 |
+
"no_op": false,
|
953 |
+
"num_sink_tokens": null,
|
954 |
+
"replace_with_linear": false,
|
955 |
+
"sparsify": null,
|
956 |
+
"unshifted_sink": false,
|
957 |
+
"use_prefill_window_in_sink_attention": false,
|
958 |
+
"window_length": null
|
959 |
+
},
|
960 |
+
"ffn": {
|
961 |
+
"ffn_mult": 5.25,
|
962 |
+
"no_op": false,
|
963 |
+
"replace_with_linear": false,
|
964 |
+
"sparsify": null
|
965 |
+
}
|
966 |
+
},
|
967 |
+
{
|
968 |
+
"attention": {
|
969 |
+
"n_heads_in_group": null,
|
970 |
+
"no_op": true,
|
971 |
+
"num_sink_tokens": null,
|
972 |
+
"replace_with_linear": false,
|
973 |
+
"sparsify": null,
|
974 |
+
"unshifted_sink": false,
|
975 |
+
"use_prefill_window_in_sink_attention": false,
|
976 |
+
"window_length": null
|
977 |
+
},
|
978 |
+
"ffn": {
|
979 |
+
"ffn_mult": 1.3125,
|
980 |
+
"no_op": false,
|
981 |
+
"replace_with_linear": false,
|
982 |
+
"sparsify": null
|
983 |
+
}
|
984 |
+
},
|
985 |
+
{
|
986 |
+
"attention": {
|
987 |
+
"n_heads_in_group": null,
|
988 |
+
"no_op": true,
|
989 |
+
"num_sink_tokens": null,
|
990 |
+
"replace_with_linear": false,
|
991 |
+
"sparsify": null,
|
992 |
+
"unshifted_sink": false,
|
993 |
+
"use_prefill_window_in_sink_attention": false,
|
994 |
+
"window_length": null
|
995 |
+
},
|
996 |
+
"ffn": {
|
997 |
+
"ffn_mult": 1.0,
|
998 |
+
"no_op": false,
|
999 |
+
"replace_with_linear": false,
|
1000 |
+
"sparsify": null
|
1001 |
+
}
|
1002 |
+
},
|
1003 |
+
{
|
1004 |
+
"attention": {
|
1005 |
+
"n_heads_in_group": null,
|
1006 |
+
"no_op": true,
|
1007 |
+
"num_sink_tokens": null,
|
1008 |
+
"replace_with_linear": false,
|
1009 |
+
"sparsify": null,
|
1010 |
+
"unshifted_sink": false,
|
1011 |
+
"use_prefill_window_in_sink_attention": false,
|
1012 |
+
"window_length": null
|
1013 |
+
},
|
1014 |
+
"ffn": {
|
1015 |
+
"ffn_mult": 1.0,
|
1016 |
+
"no_op": false,
|
1017 |
+
"replace_with_linear": false,
|
1018 |
+
"sparsify": null
|
1019 |
+
}
|
1020 |
+
},
|
1021 |
+
{
|
1022 |
+
"attention": {
|
1023 |
+
"n_heads_in_group": null,
|
1024 |
+
"no_op": true,
|
1025 |
+
"num_sink_tokens": null,
|
1026 |
+
"replace_with_linear": false,
|
1027 |
+
"sparsify": null,
|
1028 |
+
"unshifted_sink": false,
|
1029 |
+
"use_prefill_window_in_sink_attention": false,
|
1030 |
+
"window_length": null
|
1031 |
+
},
|
1032 |
+
"ffn": {
|
1033 |
+
"ffn_mult": 1.3125,
|
1034 |
+
"no_op": false,
|
1035 |
+
"replace_with_linear": false,
|
1036 |
+
"sparsify": null
|
1037 |
+
}
|
1038 |
+
},
|
1039 |
+
{
|
1040 |
+
"attention": {
|
1041 |
+
"n_heads_in_group": null,
|
1042 |
+
"no_op": true,
|
1043 |
+
"num_sink_tokens": null,
|
1044 |
+
"replace_with_linear": false,
|
1045 |
+
"sparsify": null,
|
1046 |
+
"unshifted_sink": false,
|
1047 |
+
"use_prefill_window_in_sink_attention": false,
|
1048 |
+
"window_length": null
|
1049 |
+
},
|
1050 |
+
"ffn": {
|
1051 |
+
"ffn_mult": 1.0,
|
1052 |
+
"no_op": false,
|
1053 |
+
"replace_with_linear": false,
|
1054 |
+
"sparsify": null
|
1055 |
+
}
|
1056 |
+
},
|
1057 |
+
{
|
1058 |
+
"attention": {
|
1059 |
+
"n_heads_in_group": null,
|
1060 |
+
"no_op": true,
|
1061 |
+
"num_sink_tokens": null,
|
1062 |
+
"replace_with_linear": false,
|
1063 |
+
"sparsify": null,
|
1064 |
+
"unshifted_sink": false,
|
1065 |
+
"use_prefill_window_in_sink_attention": false,
|
1066 |
+
"window_length": null
|
1067 |
+
},
|
1068 |
+
"ffn": {
|
1069 |
+
"ffn_mult": 1.0,
|
1070 |
+
"no_op": false,
|
1071 |
+
"replace_with_linear": false,
|
1072 |
+
"sparsify": null
|
1073 |
+
}
|
1074 |
+
},
|
1075 |
+
{
|
1076 |
+
"attention": {
|
1077 |
+
"n_heads_in_group": null,
|
1078 |
+
"no_op": true,
|
1079 |
+
"num_sink_tokens": null,
|
1080 |
+
"replace_with_linear": false,
|
1081 |
+
"sparsify": null,
|
1082 |
+
"unshifted_sink": false,
|
1083 |
+
"use_prefill_window_in_sink_attention": false,
|
1084 |
+
"window_length": null
|
1085 |
+
},
|
1086 |
+
"ffn": {
|
1087 |
+
"ffn_mult": 1.0,
|
1088 |
+
"no_op": false,
|
1089 |
+
"replace_with_linear": false,
|
1090 |
+
"sparsify": null
|
1091 |
+
}
|
1092 |
+
},
|
1093 |
+
{
|
1094 |
+
"attention": {
|
1095 |
+
"n_heads_in_group": null,
|
1096 |
+
"no_op": true,
|
1097 |
+
"num_sink_tokens": null,
|
1098 |
+
"replace_with_linear": false,
|
1099 |
+
"sparsify": null,
|
1100 |
+
"unshifted_sink": false,
|
1101 |
+
"use_prefill_window_in_sink_attention": false,
|
1102 |
+
"window_length": null
|
1103 |
+
},
|
1104 |
+
"ffn": {
|
1105 |
+
"ffn_mult": 1.3125,
|
1106 |
+
"no_op": false,
|
1107 |
+
"replace_with_linear": false,
|
1108 |
+
"sparsify": null
|
1109 |
+
}
|
1110 |
+
},
|
1111 |
+
{
|
1112 |
+
"attention": {
|
1113 |
+
"n_heads_in_group": null,
|
1114 |
+
"no_op": true,
|
1115 |
+
"num_sink_tokens": null,
|
1116 |
+
"replace_with_linear": false,
|
1117 |
+
"sparsify": null,
|
1118 |
+
"unshifted_sink": false,
|
1119 |
+
"use_prefill_window_in_sink_attention": false,
|
1120 |
+
"window_length": null
|
1121 |
+
},
|
1122 |
+
"ffn": {
|
1123 |
+
"ffn_mult": 1.3125,
|
1124 |
+
"no_op": false,
|
1125 |
+
"replace_with_linear": false,
|
1126 |
+
"sparsify": null
|
1127 |
+
}
|
1128 |
+
},
|
1129 |
+
{
|
1130 |
+
"attention": {
|
1131 |
+
"n_heads_in_group": null,
|
1132 |
+
"no_op": true,
|
1133 |
+
"num_sink_tokens": null,
|
1134 |
+
"replace_with_linear": false,
|
1135 |
+
"sparsify": null,
|
1136 |
+
"unshifted_sink": false,
|
1137 |
+
"use_prefill_window_in_sink_attention": false,
|
1138 |
+
"window_length": null
|
1139 |
+
},
|
1140 |
+
"ffn": {
|
1141 |
+
"ffn_mult": 0.5,
|
1142 |
+
"no_op": false,
|
1143 |
+
"replace_with_linear": false,
|
1144 |
+
"sparsify": null
|
1145 |
+
}
|
1146 |
+
},
|
1147 |
+
{
|
1148 |
+
"attention": {
|
1149 |
+
"n_heads_in_group": null,
|
1150 |
+
"no_op": true,
|
1151 |
+
"num_sink_tokens": null,
|
1152 |
+
"replace_with_linear": false,
|
1153 |
+
"sparsify": null,
|
1154 |
+
"unshifted_sink": false,
|
1155 |
+
"use_prefill_window_in_sink_attention": false,
|
1156 |
+
"window_length": null
|
1157 |
+
},
|
1158 |
+
"ffn": {
|
1159 |
+
"ffn_mult": 0.5,
|
1160 |
+
"no_op": false,
|
1161 |
+
"replace_with_linear": false,
|
1162 |
+
"sparsify": null
|
1163 |
+
}
|
1164 |
+
},
|
1165 |
+
{
|
1166 |
+
"attention": {
|
1167 |
+
"n_heads_in_group": null,
|
1168 |
+
"no_op": true,
|
1169 |
+
"num_sink_tokens": null,
|
1170 |
+
"replace_with_linear": false,
|
1171 |
+
"sparsify": null,
|
1172 |
+
"unshifted_sink": false,
|
1173 |
+
"use_prefill_window_in_sink_attention": false,
|
1174 |
+
"window_length": null
|
1175 |
+
},
|
1176 |
+
"ffn": {
|
1177 |
+
"ffn_mult": 1.0,
|
1178 |
+
"no_op": false,
|
1179 |
+
"replace_with_linear": false,
|
1180 |
+
"sparsify": null
|
1181 |
+
}
|
1182 |
+
},
|
1183 |
+
{
|
1184 |
+
"attention": {
|
1185 |
+
"n_heads_in_group": null,
|
1186 |
+
"no_op": true,
|
1187 |
+
"num_sink_tokens": null,
|
1188 |
+
"replace_with_linear": false,
|
1189 |
+
"sparsify": null,
|
1190 |
+
"unshifted_sink": false,
|
1191 |
+
"use_prefill_window_in_sink_attention": false,
|
1192 |
+
"window_length": null
|
1193 |
+
},
|
1194 |
+
"ffn": {
|
1195 |
+
"ffn_mult": 1.0,
|
1196 |
+
"no_op": false,
|
1197 |
+
"replace_with_linear": false,
|
1198 |
+
"sparsify": null
|
1199 |
+
}
|
1200 |
+
},
|
1201 |
+
{
|
1202 |
+
"attention": {
|
1203 |
+
"n_heads_in_group": null,
|
1204 |
+
"no_op": true,
|
1205 |
+
"num_sink_tokens": null,
|
1206 |
+
"replace_with_linear": false,
|
1207 |
+
"sparsify": null,
|
1208 |
+
"unshifted_sink": false,
|
1209 |
+
"use_prefill_window_in_sink_attention": false,
|
1210 |
+
"window_length": null
|
1211 |
+
},
|
1212 |
+
"ffn": {
|
1213 |
+
"ffn_mult": 0.5,
|
1214 |
+
"no_op": false,
|
1215 |
+
"replace_with_linear": false,
|
1216 |
+
"sparsify": null
|
1217 |
+
}
|
1218 |
+
},
|
1219 |
+
{
|
1220 |
+
"attention": {
|
1221 |
+
"n_heads_in_group": null,
|
1222 |
+
"no_op": true,
|
1223 |
+
"num_sink_tokens": null,
|
1224 |
+
"replace_with_linear": false,
|
1225 |
+
"sparsify": null,
|
1226 |
+
"unshifted_sink": false,
|
1227 |
+
"use_prefill_window_in_sink_attention": false,
|
1228 |
+
"window_length": null
|
1229 |
+
},
|
1230 |
+
"ffn": {
|
1231 |
+
"ffn_mult": 0.5,
|
1232 |
+
"no_op": false,
|
1233 |
+
"replace_with_linear": false,
|
1234 |
+
"sparsify": null
|
1235 |
+
}
|
1236 |
+
},
|
1237 |
+
{
|
1238 |
+
"attention": {
|
1239 |
+
"n_heads_in_group": null,
|
1240 |
+
"no_op": true,
|
1241 |
+
"num_sink_tokens": null,
|
1242 |
+
"replace_with_linear": false,
|
1243 |
+
"sparsify": null,
|
1244 |
+
"unshifted_sink": false,
|
1245 |
+
"use_prefill_window_in_sink_attention": false,
|
1246 |
+
"window_length": null
|
1247 |
+
},
|
1248 |
+
"ffn": {
|
1249 |
+
"ffn_mult": 1.0,
|
1250 |
+
"no_op": false,
|
1251 |
+
"replace_with_linear": false,
|
1252 |
+
"sparsify": null
|
1253 |
+
}
|
1254 |
+
},
|
1255 |
+
{
|
1256 |
+
"attention": {
|
1257 |
+
"n_heads_in_group": null,
|
1258 |
+
"no_op": true,
|
1259 |
+
"num_sink_tokens": null,
|
1260 |
+
"replace_with_linear": false,
|
1261 |
+
"sparsify": null,
|
1262 |
+
"unshifted_sink": false,
|
1263 |
+
"use_prefill_window_in_sink_attention": false,
|
1264 |
+
"window_length": null
|
1265 |
+
},
|
1266 |
+
"ffn": {
|
1267 |
+
"ffn_mult": 0.5,
|
1268 |
+
"no_op": false,
|
1269 |
+
"replace_with_linear": false,
|
1270 |
+
"sparsify": null
|
1271 |
+
}
|
1272 |
+
},
|
1273 |
+
{
|
1274 |
+
"attention": {
|
1275 |
+
"n_heads_in_group": null,
|
1276 |
+
"no_op": true,
|
1277 |
+
"num_sink_tokens": null,
|
1278 |
+
"replace_with_linear": false,
|
1279 |
+
"sparsify": null,
|
1280 |
+
"unshifted_sink": false,
|
1281 |
+
"use_prefill_window_in_sink_attention": false,
|
1282 |
+
"window_length": null
|
1283 |
+
},
|
1284 |
+
"ffn": {
|
1285 |
+
"ffn_mult": 0.5,
|
1286 |
+
"no_op": false,
|
1287 |
+
"replace_with_linear": false,
|
1288 |
+
"sparsify": null
|
1289 |
+
}
|
1290 |
+
},
|
1291 |
+
{
|
1292 |
+
"attention": {
|
1293 |
+
"n_heads_in_group": 8,
|
1294 |
+
"no_op": false,
|
1295 |
+
"num_sink_tokens": null,
|
1296 |
+
"replace_with_linear": false,
|
1297 |
+
"sparsify": null,
|
1298 |
+
"unshifted_sink": false,
|
1299 |
+
"use_prefill_window_in_sink_attention": false,
|
1300 |
+
"window_length": null
|
1301 |
+
},
|
1302 |
+
"ffn": {
|
1303 |
+
"ffn_mult": 5.25,
|
1304 |
+
"no_op": false,
|
1305 |
+
"replace_with_linear": false,
|
1306 |
+
"sparsify": null
|
1307 |
+
}
|
1308 |
+
},
|
1309 |
+
{
|
1310 |
+
"attention": {
|
1311 |
+
"n_heads_in_group": 8,
|
1312 |
+
"no_op": false,
|
1313 |
+
"num_sink_tokens": null,
|
1314 |
+
"replace_with_linear": false,
|
1315 |
+
"sparsify": null,
|
1316 |
+
"unshifted_sink": false,
|
1317 |
+
"use_prefill_window_in_sink_attention": false,
|
1318 |
+
"window_length": null
|
1319 |
+
},
|
1320 |
+
"ffn": {
|
1321 |
+
"ffn_mult": 5.25,
|
1322 |
+
"no_op": false,
|
1323 |
+
"replace_with_linear": false,
|
1324 |
+
"sparsify": null
|
1325 |
+
}
|
1326 |
+
},
|
1327 |
+
{
|
1328 |
+
"attention": {
|
1329 |
+
"n_heads_in_group": 8,
|
1330 |
+
"no_op": false,
|
1331 |
+
"num_sink_tokens": null,
|
1332 |
+
"replace_with_linear": false,
|
1333 |
+
"sparsify": null,
|
1334 |
+
"unshifted_sink": false,
|
1335 |
+
"use_prefill_window_in_sink_attention": false,
|
1336 |
+
"window_length": null
|
1337 |
+
},
|
1338 |
+
"ffn": {
|
1339 |
+
"ffn_mult": 5.25,
|
1340 |
+
"no_op": false,
|
1341 |
+
"replace_with_linear": false,
|
1342 |
+
"sparsify": null
|
1343 |
+
}
|
1344 |
+
},
|
1345 |
+
{
|
1346 |
+
"attention": {
|
1347 |
+
"n_heads_in_group": 8,
|
1348 |
+
"no_op": false,
|
1349 |
+
"num_sink_tokens": null,
|
1350 |
+
"replace_with_linear": false,
|
1351 |
+
"sparsify": null,
|
1352 |
+
"unshifted_sink": false,
|
1353 |
+
"use_prefill_window_in_sink_attention": false,
|
1354 |
+
"window_length": null
|
1355 |
+
},
|
1356 |
+
"ffn": {
|
1357 |
+
"ffn_mult": 5.25,
|
1358 |
+
"no_op": false,
|
1359 |
+
"replace_with_linear": false,
|
1360 |
+
"sparsify": null
|
1361 |
+
}
|
1362 |
+
},
|
1363 |
+
{
|
1364 |
+
"attention": {
|
1365 |
+
"n_heads_in_group": 8,
|
1366 |
+
"no_op": false,
|
1367 |
+
"num_sink_tokens": null,
|
1368 |
+
"replace_with_linear": false,
|
1369 |
+
"sparsify": null,
|
1370 |
+
"unshifted_sink": false,
|
1371 |
+
"use_prefill_window_in_sink_attention": false,
|
1372 |
+
"window_length": null
|
1373 |
+
},
|
1374 |
+
"ffn": {
|
1375 |
+
"ffn_mult": 5.25,
|
1376 |
+
"no_op": false,
|
1377 |
+
"replace_with_linear": false,
|
1378 |
+
"sparsify": null
|
1379 |
+
}
|
1380 |
+
},
|
1381 |
+
{
|
1382 |
+
"attention": {
|
1383 |
+
"n_heads_in_group": 8,
|
1384 |
+
"no_op": false,
|
1385 |
+
"num_sink_tokens": null,
|
1386 |
+
"replace_with_linear": false,
|
1387 |
+
"sparsify": null,
|
1388 |
+
"unshifted_sink": false,
|
1389 |
+
"use_prefill_window_in_sink_attention": false,
|
1390 |
+
"window_length": null
|
1391 |
+
},
|
1392 |
+
"ffn": {
|
1393 |
+
"ffn_mult": 5.25,
|
1394 |
+
"no_op": false,
|
1395 |
+
"replace_with_linear": false,
|
1396 |
+
"sparsify": null
|
1397 |
+
}
|
1398 |
+
},
|
1399 |
+
{
|
1400 |
+
"attention": {
|
1401 |
+
"n_heads_in_group": 8,
|
1402 |
+
"no_op": false,
|
1403 |
+
"num_sink_tokens": null,
|
1404 |
+
"replace_with_linear": false,
|
1405 |
+
"sparsify": null,
|
1406 |
+
"unshifted_sink": false,
|
1407 |
+
"use_prefill_window_in_sink_attention": false,
|
1408 |
+
"window_length": null
|
1409 |
+
},
|
1410 |
+
"ffn": {
|
1411 |
+
"ffn_mult": 5.25,
|
1412 |
+
"no_op": false,
|
1413 |
+
"replace_with_linear": false,
|
1414 |
+
"sparsify": null
|
1415 |
+
}
|
1416 |
+
},
|
1417 |
+
{
|
1418 |
+
"attention": {
|
1419 |
+
"n_heads_in_group": 8,
|
1420 |
+
"no_op": false,
|
1421 |
+
"num_sink_tokens": null,
|
1422 |
+
"replace_with_linear": false,
|
1423 |
+
"sparsify": null,
|
1424 |
+
"unshifted_sink": false,
|
1425 |
+
"use_prefill_window_in_sink_attention": false,
|
1426 |
+
"window_length": null
|
1427 |
+
},
|
1428 |
+
"ffn": {
|
1429 |
+
"ffn_mult": 5.25,
|
1430 |
+
"no_op": false,
|
1431 |
+
"replace_with_linear": false,
|
1432 |
+
"sparsify": null
|
1433 |
+
}
|
1434 |
+
},
|
1435 |
+
{
|
1436 |
+
"attention": {
|
1437 |
+
"n_heads_in_group": 8,
|
1438 |
+
"no_op": false,
|
1439 |
+
"num_sink_tokens": null,
|
1440 |
+
"replace_with_linear": false,
|
1441 |
+
"sparsify": null,
|
1442 |
+
"unshifted_sink": false,
|
1443 |
+
"use_prefill_window_in_sink_attention": false,
|
1444 |
+
"window_length": null
|
1445 |
+
},
|
1446 |
+
"ffn": {
|
1447 |
+
"ffn_mult": 5.25,
|
1448 |
+
"no_op": false,
|
1449 |
+
"replace_with_linear": false,
|
1450 |
+
"sparsify": null
|
1451 |
+
}
|
1452 |
+
}
|
1453 |
+
],
|
1454 |
+
"bos_token_id": 128000,
|
1455 |
+
"eos_token_id": [
|
1456 |
+
128001,
|
1457 |
+
128008,
|
1458 |
+
128009
|
1459 |
+
],
|
1460 |
+
"hidden_act": "silu",
|
1461 |
+
"hidden_size": 8192,
|
1462 |
+
"initializer_range": 0.02,
|
1463 |
+
"intermediate_size": null,
|
1464 |
+
"max_position_embeddings": 131072,
|
1465 |
+
"mlp_bias": false,
|
1466 |
+
"model_type": "nemotron-nas",
|
1467 |
+
"num_attention_heads": 64,
|
1468 |
+
"num_hidden_layers": 80,
|
1469 |
+
"num_key_value_heads": null,
|
1470 |
+
"pretraining_tp": 1,
|
1471 |
+
"rms_norm_eps": 1e-05,
|
1472 |
+
"rope_scaling": {
|
1473 |
+
"factor": 16.0,
|
1474 |
+
"high_freq_factor": 4.0,
|
1475 |
+
"low_freq_factor": 1.0,
|
1476 |
+
"original_max_position_embeddings": 8192,
|
1477 |
+
"rope_type": "llama3"
|
1478 |
+
},
|
1479 |
+
"rope_theta": 500000.0,
|
1480 |
+
"tie_word_embeddings": false,
|
1481 |
+
"torch_dtype": "bfloat16",
|
1482 |
+
"transformers_version": "4.48.3",
|
1483 |
+
"use_cache": true,
|
1484 |
+
"vocab_size": 128256
|
1485 |
+
}
|
configuration_decilm.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2024 Nvidia Corporation. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
|
16 |
+
import dataclasses
|
17 |
+
import warnings
|
18 |
+
from typing import Dict, Any
|
19 |
+
|
20 |
+
from transformers.utils import is_flash_attn_2_available
|
21 |
+
|
22 |
+
from .block_config import BlockConfig
|
23 |
+
from .transformers_4_44_2__configuration_llama import LlamaConfig
|
24 |
+
from .transformers_4_44_2__modeling_rope_utils import \
|
25 |
+
rope_config_validation # fake import to make AutoConfig infer the dependency
|
26 |
+
|
27 |
+
rope_config_validation # this line is here to make sure that auto-formatting doesn't remove the import
|
28 |
+
|
29 |
+
|
30 |
+
class DeciLMConfig(LlamaConfig):
    """Llama-derived configuration that also carries a per-layer block list.

    On top of whatever ``LlamaConfig`` accepts via ``**kwargs``, this config
    stores ``block_configs``: one ``BlockConfig`` per hidden layer. Because
    each block carries its own settings, the model-wide ``intermediate_size``
    and ``num_key_value_heads`` attributes are reset to ``None`` after the
    parent constructor runs.
    """

    # NOTE(review): the config.json uploaded alongside this file spells the
    # type "nemotron-nas" (hyphen) while this class uses an underscore —
    # confirm which spelling loaders are expected to match.
    model_type = "nemotron_nas"

    def __init__(
            self,
            block_configs: list[dict] | list[BlockConfig] | None = None,
            **kwargs,
    ):
        """Build the configuration.

        Args:
            block_configs: Optional sequence with one entry per hidden layer.
                Entries may be ``BlockConfig`` instances or plain dicts; each
                dict is expanded with ``BlockConfig(**entry)``.
            **kwargs: Forwarded to ``LlamaConfig.__init__``. An
                ``attn_implementation`` key is popped and handled here first.

        Raises:
            AssertionError: If ``block_configs`` is given and its length does
                not equal ``num_hidden_layers``.
        """
        attn_implementation = kwargs.pop("attn_implementation", None)
        # Default to flash-attention 2 only when the caller expressed no
        # preference and the library reports it as available.
        if attn_implementation is None and is_flash_attn_2_available():
            attn_implementation = "flash_attention_2"

        if block_configs is not None:
            # Convert entry by entry instead of peeking at block_configs[0]:
            # an empty list no longer raises IndexError and a mixed
            # dict/BlockConfig list is normalised consistently.
            block_configs = [
                BlockConfig(**conf) if isinstance(conf, dict) else conf
                for conf in block_configs
            ]

            using_unshifted_sink = any(
                block_config.attention.unshifted_sink for block_config in block_configs
            )
            if using_unshifted_sink and attn_implementation != "eager":
                warnings.warn("Forcing attn_implementation='eager' since some attention layers use unshifted sink")
                attn_implementation = "eager"

        super().__init__(attn_implementation=attn_implementation, **kwargs)

        # These model-wide values are cleared deliberately; the per-layer
        # entries in block_configs are stored instead.
        self.intermediate_size = None
        self.num_key_value_heads = None

        if block_configs is not None:
            # Kept as an assert so the exception type seen by existing callers
            # is unchanged; the message makes a mismatch diagnosable.
            assert len(block_configs) == self.num_hidden_layers, (
                f"block_configs has {len(block_configs)} entries but "
                f"num_hidden_layers is {self.num_hidden_layers}"
            )

        self.block_configs: list[BlockConfig] | None = block_configs

    def to_dict(self) -> Dict[str, Any]:
        """Return the parent's dict form with ``block_configs`` as plain dicts.

        When ``block_configs`` is set, each entry is converted with
        ``dataclasses.asdict`` and the resulting list overwrites the
        ``"block_configs"`` key of the dict produced by ``super().to_dict()``.
        """
        self_dict = super().to_dict()
        if self.block_configs is not None:
            self_dict["block_configs"] = [dataclasses.asdict(conf) for conf in self.block_configs]
        return self_dict
|
generation_config.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_from_model_config": true,
|
3 |
+
"bos_token_id": 128000,
|
4 |
+
"do_sample": true,
|
5 |
+
"eos_token_id": [
|
6 |
+
128001,
|
7 |
+
128008,
|
8 |
+
128009
|
9 |
+
],
|
10 |
+
"transformers_version": "4.48.3"
|
11 |
+
}
|
hf_quant_config.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"producer": {
|
3 |
+
"name": "modelopt",
|
4 |
+
"version": "0.28.1.dev55+gaf6d3e77.d20250724"
|
5 |
+
},
|
6 |
+
"quantization": {
|
7 |
+
"quant_algo": "FP8",
|
8 |
+
"kv_cache_quant_algo": "FP8",
|
9 |
+
"exclude_modules": [
|
10 |
+
"lm_head"
|
11 |
+
]
|
12 |
+
}
|
13 |
+
}
|
model-00001-of-00011.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ffe25ea458e224161f4a9bd8e001f959f742c6296048d18b6c9df1a286879ab6
|
3 |
+
size 4936837336
|
model-00002-of-00011.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c610f0fc06a29fd88d8330417f44342d3f6f9b20a215cbbf9126baff07b31fb5
|
3 |
+
size 4983045136
|
model-00003-of-00011.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d906e4f9bbca215cc501acf15e32ddaffca30fe16a0d30b406f19c88b31f0f68
|
3 |
+
size 4953684488
|
model-00004-of-00011.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:869ca1df98819efc1d945b86a517f5512ed27d299379798f31b6a5107fc9b566
|
3 |
+
size 4899142912
|
model-00005-of-00011.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d50c18b15da1148fb72854d8f2d66d31dc000c9fa54cbb8c7bb3cc438fc0989e
|
3 |
+
size 4899159416
|
model-00006-of-00011.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cf9db356f800685fb49313a75313cd0e1b3324014d0d744f1f7992babedfc7ca
|
3 |
+
size 4899159408
|
model-00007-of-00011.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5891e6f5c5a4fd706dda472543b29ee201e98d4b6f0ccf0ed4a358a783cf8524
|
3 |
+
size 4983027800
|
model-00008-of-00011.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2c811b6fbf8b571c368282786463f35db7b2883c5988949d8c159b26b06f1c86
|
3 |
+
size 4915987960
|
model-00009-of-00011.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1381bccbac295a85636a356212a0707161daaba8b5ee8685c08d2503734dfe96
|
3 |
+
size 4794509832
|
model-00010-of-00011.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:17ebb09fa889d962b46ea0ffa3d8855fcc870cde81e35ea9a9d4aac24773541a
|
3 |
+
size 4899159408
|
model-00011-of-00011.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7594d3fa04a233e9a81fd93910b24be0ae0a6cc18fbbb85a0db61c4868ddbbcc
|
3 |
+
size 2806006920
|
model.safetensors.index.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
special_tokens_map.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<|begin_of_text|>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"eos_token": {
|
10 |
+
"content": "<|eot_id|>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "<|eot_id|>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
}
|
23 |
+
}
|
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
|
3 |
+
size 17209920
|
tokenizer_config.json
ADDED
@@ -0,0 +1,2065 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"128000": {
|
4 |
+
"content": "<|begin_of_text|>",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"128001": {
|
12 |
+
"content": "<|end_of_text|>",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"128002": {
|
20 |
+
"content": "<|reserved_special_token_0|>",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"128003": {
|
28 |
+
"content": "<|reserved_special_token_1|>",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"128004": {
|
36 |
+
"content": "<|finetune_right_pad_id|>",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
},
|
43 |
+
"128005": {
|
44 |
+
"content": "<|reserved_special_token_2|>",
|
45 |
+
"lstrip": false,
|
46 |
+
"normalized": false,
|
47 |
+
"rstrip": false,
|
48 |
+
"single_word": false,
|
49 |
+
"special": true
|
50 |
+
},
|
51 |
+
"128006": {
|
52 |
+
"content": "<|start_header_id|>",
|
53 |
+
"lstrip": false,
|
54 |
+
"normalized": false,
|
55 |
+
"rstrip": false,
|
56 |
+
"single_word": false,
|
57 |
+
"special": true
|
58 |
+
},
|
59 |
+
"128007": {
|
60 |
+
"content": "<|end_header_id|>",
|
61 |
+
"lstrip": false,
|
62 |
+
"normalized": false,
|
63 |
+
"rstrip": false,
|
64 |
+
"single_word": false,
|
65 |
+
"special": true
|
66 |
+
},
|
67 |
+
"128008": {
|
68 |
+
"content": "<|eom_id|>",
|
69 |
+
"lstrip": false,
|
70 |
+
"normalized": false,
|
71 |
+
"rstrip": false,
|
72 |
+
"single_word": false,
|
73 |
+
"special": true
|
74 |
+
},
|
75 |
+
"128009": {
|
76 |
+
"content": "<|eot_id|>",
|
77 |
+
"lstrip": false,
|
78 |
+
"normalized": false,
|
79 |
+
"rstrip": false,
|
80 |
+
"single_word": false,
|
81 |
+
"special": true
|
82 |
+
},
|
83 |
+
"128010": {
|
84 |
+
"content": "<|python_tag|>",
|
85 |
+
"lstrip": false,
|
86 |
+
"normalized": false,
|
87 |
+
"rstrip": false,
|
88 |
+
"single_word": false,
|
89 |
+
"special": true
|
90 |
+
},
|
91 |
+
"128011": {
|
92 |
+
"content": "<|reserved_special_token_3|>",
|
93 |
+
"lstrip": false,
|
94 |
+
"normalized": false,
|
95 |
+
"rstrip": false,
|
96 |
+
"single_word": false,
|
97 |
+
"special": true
|
98 |
+
},
|
99 |
+
"128012": {
|
100 |
+
"content": "<|reserved_special_token_4|>",
|
101 |
+
"lstrip": false,
|
102 |
+
"normalized": false,
|
103 |
+
"rstrip": false,
|
104 |
+
"single_word": false,
|
105 |
+
"special": true
|
106 |
+
},
|
107 |
+
"128013": {
|
108 |
+
"content": "<|reserved_special_token_5|>",
|
109 |
+
"lstrip": false,
|
110 |
+
"normalized": false,
|
111 |
+
"rstrip": false,
|
112 |
+
"single_word": false,
|
113 |
+
"special": true
|
114 |
+
},
|
115 |
+
"128014": {
|
116 |
+
"content": "<|reserved_special_token_6|>",
|
117 |
+
"lstrip": false,
|
118 |
+
"normalized": false,
|
119 |
+
"rstrip": false,
|
120 |
+
"single_word": false,
|
121 |
+
"special": true
|
122 |
+
},
|
123 |
+
"128015": {
|
124 |
+
"content": "<|reserved_special_token_7|>",
|
125 |
+
"lstrip": false,
|
126 |
+
"normalized": false,
|
127 |
+
"rstrip": false,
|
128 |
+
"single_word": false,
|
129 |
+
"special": true
|
130 |
+
},
|
131 |
+
"128016": {
|
132 |
+
"content": "<|reserved_special_token_8|>",
|
133 |
+
"lstrip": false,
|
134 |
+
"normalized": false,
|
135 |
+
"rstrip": false,
|
136 |
+
"single_word": false,
|
137 |
+
"special": true
|
138 |
+
},
|
139 |
+
"128017": {
|
140 |
+
"content": "<|reserved_special_token_9|>",
|
141 |
+
"lstrip": false,
|
142 |
+
"normalized": false,
|
143 |
+
"rstrip": false,
|
144 |
+
"single_word": false,
|
145 |
+
"special": true
|
146 |
+
},
|
147 |
+
"128018": {
|
148 |
+
"content": "<|reserved_special_token_10|>",
|
149 |
+
"lstrip": false,
|
150 |
+
"normalized": false,
|
151 |
+
"rstrip": false,
|
152 |
+
"single_word": false,
|
153 |
+
"special": true
|
154 |
+
},
|
155 |
+
"128019": {
|
156 |
+
"content": "<|reserved_special_token_11|>",
|
157 |
+
"lstrip": false,
|
158 |
+
"normalized": false,
|
159 |
+
"rstrip": false,
|
160 |
+
"single_word": false,
|
161 |
+
"special": true
|
162 |
+
},
|
163 |
+
"128020": {
|
164 |
+
"content": "<|reserved_special_token_12|>",
|
165 |
+
"lstrip": false,
|
166 |
+
"normalized": false,
|
167 |
+
"rstrip": false,
|
168 |
+
"single_word": false,
|
169 |
+
"special": true
|
170 |
+
},
|
171 |
+
"128021": {
|
172 |
+
"content": "<|reserved_special_token_13|>",
|
173 |
+
"lstrip": false,
|
174 |
+
"normalized": false,
|
175 |
+
"rstrip": false,
|
176 |
+
"single_word": false,
|
177 |
+
"special": true
|
178 |
+
},
|
179 |
+
"128022": {
|
180 |
+
"content": "<|reserved_special_token_14|>",
|
181 |
+
"lstrip": false,
|
182 |
+
"normalized": false,
|
183 |
+
"rstrip": false,
|
184 |
+
"single_word": false,
|
185 |
+
"special": true
|
186 |
+
},
|
187 |
+
"128023": {
|
188 |
+
"content": "<|reserved_special_token_15|>",
|
189 |
+
"lstrip": false,
|
190 |
+
"normalized": false,
|
191 |
+
"rstrip": false,
|
192 |
+
"single_word": false,
|
193 |
+
"special": true
|
194 |
+
},
|
195 |
+
"128024": {
|
196 |
+
"content": "<|reserved_special_token_16|>",
|
197 |
+
"lstrip": false,
|
198 |
+
"normalized": false,
|
199 |
+
"rstrip": false,
|
200 |
+
"single_word": false,
|
201 |
+
"special": true
|
202 |
+
},
|
203 |
+
"128025": {
|
204 |
+
"content": "<|reserved_special_token_17|>",
|
205 |
+
"lstrip": false,
|
206 |
+
"normalized": false,
|
207 |
+
"rstrip": false,
|
208 |
+
"single_word": false,
|
209 |
+
"special": true
|
210 |
+
},
|
211 |
+
"128026": {
|
212 |
+
"content": "<|reserved_special_token_18|>",
|
213 |
+
"lstrip": false,
|
214 |
+
"normalized": false,
|
215 |
+
"rstrip": false,
|
216 |
+
"single_word": false,
|
217 |
+
"special": true
|
218 |
+
},
|
219 |
+
"128027": {
|
220 |
+
"content": "<|reserved_special_token_19|>",
|
221 |
+
"lstrip": false,
|
222 |
+
"normalized": false,
|
223 |
+
"rstrip": false,
|
224 |
+
"single_word": false,
|
225 |
+
"special": true
|
226 |
+
},
|
227 |
+
"128028": {
|
228 |
+
"content": "<|reserved_special_token_20|>",
|
229 |
+
"lstrip": false,
|
230 |
+
"normalized": false,
|
231 |
+
"rstrip": false,
|
232 |
+
"single_word": false,
|
233 |
+
"special": true
|
234 |
+
},
|
235 |
+
"128029": {
|
236 |
+
"content": "<|reserved_special_token_21|>",
|
237 |
+
"lstrip": false,
|
238 |
+
"normalized": false,
|
239 |
+
"rstrip": false,
|
240 |
+
"single_word": false,
|
241 |
+
"special": true
|
242 |
+
},
|
243 |
+
"128030": {
|
244 |
+
"content": "<|reserved_special_token_22|>",
|
245 |
+
"lstrip": false,
|
246 |
+
"normalized": false,
|
247 |
+
"rstrip": false,
|
248 |
+
"single_word": false,
|
249 |
+
"special": true
|
250 |
+
},
|
251 |
+
"128031": {
|
252 |
+
"content": "<|reserved_special_token_23|>",
|
253 |
+
"lstrip": false,
|
254 |
+
"normalized": false,
|
255 |
+
"rstrip": false,
|
256 |
+
"single_word": false,
|
257 |
+
"special": true
|
258 |
+
},
|
259 |
+
"128032": {
|
260 |
+
"content": "<|reserved_special_token_24|>",
|
261 |
+
"lstrip": false,
|
262 |
+
"normalized": false,
|
263 |
+
"rstrip": false,
|
264 |
+
"single_word": false,
|
265 |
+
"special": true
|
266 |
+
},
|
267 |
+
"128033": {
|
268 |
+
"content": "<|reserved_special_token_25|>",
|
269 |
+
"lstrip": false,
|
270 |
+
"normalized": false,
|
271 |
+
"rstrip": false,
|
272 |
+
"single_word": false,
|
273 |
+
"special": true
|
274 |
+
},
|
275 |
+
"128034": {
|
276 |
+
"content": "<|reserved_special_token_26|>",
|
277 |
+
"lstrip": false,
|
278 |
+
"normalized": false,
|
279 |
+
"rstrip": false,
|
280 |
+
"single_word": false,
|
281 |
+
"special": true
|
282 |
+
},
|
283 |
+
"128035": {
|
284 |
+
"content": "<|reserved_special_token_27|>",
|
285 |
+
"lstrip": false,
|
286 |
+
"normalized": false,
|
287 |
+
"rstrip": false,
|
288 |
+
"single_word": false,
|
289 |
+
"special": true
|
290 |
+
},
|
291 |
+
"128036": {
|
292 |
+
"content": "<|reserved_special_token_28|>",
|
293 |
+
"lstrip": false,
|
294 |
+
"normalized": false,
|
295 |
+
"rstrip": false,
|
296 |
+
"single_word": false,
|
297 |
+
"special": true
|
298 |
+
},
|
299 |
+
"128037": {
|
300 |
+
"content": "<|reserved_special_token_29|>",
|
301 |
+
"lstrip": false,
|
302 |
+
"normalized": false,
|
303 |
+
"rstrip": false,
|
304 |
+
"single_word": false,
|
305 |
+
"special": true
|
306 |
+
},
|
307 |
+
"128038": {
|
308 |
+
"content": "<|reserved_special_token_30|>",
|
309 |
+
"lstrip": false,
|
310 |
+
"normalized": false,
|
311 |
+
"rstrip": false,
|
312 |
+
"single_word": false,
|
313 |
+
"special": true
|
314 |
+
},
|
315 |
+
"128039": {
|
316 |
+
"content": "<|reserved_special_token_31|>",
|
317 |
+
"lstrip": false,
|
318 |
+
"normalized": false,
|
319 |
+
"rstrip": false,
|
320 |
+
"single_word": false,
|
321 |
+
"special": true
|
322 |
+
},
|
323 |
+
"128040": {
|
324 |
+
"content": "<|reserved_special_token_32|>",
|
325 |
+
"lstrip": false,
|
326 |
+
"normalized": false,
|
327 |
+
"rstrip": false,
|
328 |
+
"single_word": false,
|
329 |
+
"special": true
|
330 |
+
},
|
331 |
+
"128041": {
|
332 |
+
"content": "<|reserved_special_token_33|>",
|
333 |
+
"lstrip": false,
|
334 |
+
"normalized": false,
|
335 |
+
"rstrip": false,
|
336 |
+
"single_word": false,
|
337 |
+
"special": true
|
338 |
+
},
|
339 |
+
"128042": {
|
340 |
+
"content": "<|reserved_special_token_34|>",
|
341 |
+
"lstrip": false,
|
342 |
+
"normalized": false,
|
343 |
+
"rstrip": false,
|
344 |
+
"single_word": false,
|
345 |
+
"special": true
|
346 |
+
},
|
347 |
+
"128043": {
|
348 |
+
"content": "<|reserved_special_token_35|>",
|
349 |
+
"lstrip": false,
|
350 |
+
"normalized": false,
|
351 |
+
"rstrip": false,
|
352 |
+
"single_word": false,
|
353 |
+
"special": true
|
354 |
+
},
|
355 |
+
"128044": {
|
356 |
+
"content": "<|reserved_special_token_36|>",
|
357 |
+
"lstrip": false,
|
358 |
+
"normalized": false,
|
359 |
+
"rstrip": false,
|
360 |
+
"single_word": false,
|
361 |
+
"special": true
|
362 |
+
},
|
363 |
+
"128045": {
|
364 |
+
"content": "<|reserved_special_token_37|>",
|
365 |
+
"lstrip": false,
|
366 |
+
"normalized": false,
|
367 |
+
"rstrip": false,
|
368 |
+
"single_word": false,
|
369 |
+
"special": true
|
370 |
+
},
|
371 |
+
"128046": {
|
372 |
+
"content": "<|reserved_special_token_38|>",
|
373 |
+
"lstrip": false,
|
374 |
+
"normalized": false,
|
375 |
+
"rstrip": false,
|
376 |
+
"single_word": false,
|
377 |
+
"special": true
|
378 |
+
},
|
379 |
+
"128047": {
|
380 |
+
"content": "<|reserved_special_token_39|>",
|
381 |
+
"lstrip": false,
|
382 |
+
"normalized": false,
|
383 |
+
"rstrip": false,
|
384 |
+
"single_word": false,
|
385 |
+
"special": true
|
386 |
+
},
|
387 |
+
"128048": {
|
388 |
+
"content": "<|reserved_special_token_40|>",
|
389 |
+
"lstrip": false,
|
390 |
+
"normalized": false,
|
391 |
+
"rstrip": false,
|
392 |
+
"single_word": false,
|
393 |
+
"special": true
|
394 |
+
},
|
395 |
+
"128049": {
|
396 |
+
"content": "<|reserved_special_token_41|>",
|
397 |
+
"lstrip": false,
|
398 |
+
"normalized": false,
|
399 |
+
"rstrip": false,
|
400 |
+
"single_word": false,
|
401 |
+
"special": true
|
402 |
+
},
|
403 |
+
"128050": {
|
404 |
+
"content": "<|reserved_special_token_42|>",
|
405 |
+
"lstrip": false,
|
406 |
+
"normalized": false,
|
407 |
+
"rstrip": false,
|
408 |
+
"single_word": false,
|
409 |
+
"special": true
|
410 |
+
},
|
411 |
+
"128051": {
|
412 |
+
"content": "<|reserved_special_token_43|>",
|
413 |
+
"lstrip": false,
|
414 |
+
"normalized": false,
|
415 |
+
"rstrip": false,
|
416 |
+
"single_word": false,
|
417 |
+
"special": true
|
418 |
+
},
|
419 |
+
"128052": {
|
420 |
+
"content": "<|reserved_special_token_44|>",
|
421 |
+
"lstrip": false,
|
422 |
+
"normalized": false,
|
423 |
+
"rstrip": false,
|
424 |
+
"single_word": false,
|
425 |
+
"special": true
|
426 |
+
},
|
427 |
+
"128053": {
|
428 |
+
"content": "<|reserved_special_token_45|>",
|
429 |
+
"lstrip": false,
|
430 |
+
"normalized": false,
|
431 |
+
"rstrip": false,
|
432 |
+
"single_word": false,
|
433 |
+
"special": true
|
434 |
+
},
|
435 |
+
"128054": {
|
436 |
+
"content": "<|reserved_special_token_46|>",
|
437 |
+
"lstrip": false,
|
438 |
+
"normalized": false,
|
439 |
+
"rstrip": false,
|
440 |
+
"single_word": false,
|
441 |
+
"special": true
|
442 |
+
},
|
443 |
+
"128055": {
|
444 |
+
"content": "<|reserved_special_token_47|>",
|
445 |
+
"lstrip": false,
|
446 |
+
"normalized": false,
|
447 |
+
"rstrip": false,
|
448 |
+
"single_word": false,
|
449 |
+
"special": true
|
450 |
+
},
|
451 |
+
"128056": {
|
452 |
+
"content": "<|reserved_special_token_48|>",
|
453 |
+
"lstrip": false,
|
454 |
+
"normalized": false,
|
455 |
+
"rstrip": false,
|
456 |
+
"single_word": false,
|
457 |
+
"special": true
|
458 |
+
},
|
459 |
+
"128057": {
|
460 |
+
"content": "<|reserved_special_token_49|>",
|
461 |
+
"lstrip": false,
|
462 |
+
"normalized": false,
|
463 |
+
"rstrip": false,
|
464 |
+
"single_word": false,
|
465 |
+
"special": true
|
466 |
+
},
|
467 |
+
"128058": {
|
468 |
+
"content": "<|reserved_special_token_50|>",
|
469 |
+
"lstrip": false,
|
470 |
+
"normalized": false,
|
471 |
+
"rstrip": false,
|
472 |
+
"single_word": false,
|
473 |
+
"special": true
|
474 |
+
},
|
475 |
+
"128059": {
|
476 |
+
"content": "<|reserved_special_token_51|>",
|
477 |
+
"lstrip": false,
|
478 |
+
"normalized": false,
|
479 |
+
"rstrip": false,
|
480 |
+
"single_word": false,
|
481 |
+
"special": true
|
482 |
+
},
|
483 |
+
"128060": {
|
484 |
+
"content": "<|reserved_special_token_52|>",
|
485 |
+
"lstrip": false,
|
486 |
+
"normalized": false,
|
487 |
+
"rstrip": false,
|
488 |
+
"single_word": false,
|
489 |
+
"special": true
|
490 |
+
},
|
491 |
+
"128061": {
|
492 |
+
"content": "<|reserved_special_token_53|>",
|
493 |
+
"lstrip": false,
|
494 |
+
"normalized": false,
|
495 |
+
"rstrip": false,
|
496 |
+
"single_word": false,
|
497 |
+
"special": true
|
498 |
+
},
|
499 |
+
"128062": {
|
500 |
+
"content": "<|reserved_special_token_54|>",
|
501 |
+
"lstrip": false,
|
502 |
+
"normalized": false,
|
503 |
+
"rstrip": false,
|
504 |
+
"single_word": false,
|
505 |
+
"special": true
|
506 |
+
},
|
507 |
+
"128063": {
|
508 |
+
"content": "<|reserved_special_token_55|>",
|
509 |
+
"lstrip": false,
|
510 |
+
"normalized": false,
|
511 |
+
"rstrip": false,
|
512 |
+
"single_word": false,
|
513 |
+
"special": true
|
514 |
+
},
|
515 |
+
"128064": {
|
516 |
+
"content": "<|reserved_special_token_56|>",
|
517 |
+
"lstrip": false,
|
518 |
+
"normalized": false,
|
519 |
+
"rstrip": false,
|
520 |
+
"single_word": false,
|
521 |
+
"special": true
|
522 |
+
},
|
523 |
+
"128065": {
|
524 |
+
"content": "<|reserved_special_token_57|>",
|
525 |
+
"lstrip": false,
|
526 |
+
"normalized": false,
|
527 |
+
"rstrip": false,
|
528 |
+
"single_word": false,
|
529 |
+
"special": true
|
530 |
+
},
|
531 |
+
"128066": {
|
532 |
+
"content": "<|reserved_special_token_58|>",
|
533 |
+
"lstrip": false,
|
534 |
+
"normalized": false,
|
535 |
+
"rstrip": false,
|
536 |
+
"single_word": false,
|
537 |
+
"special": true
|
538 |
+
},
|
539 |
+
"128067": {
|
540 |
+
"content": "<|reserved_special_token_59|>",
|
541 |
+
"lstrip": false,
|
542 |
+
"normalized": false,
|
543 |
+
"rstrip": false,
|
544 |
+
"single_word": false,
|
545 |
+
"special": true
|
546 |
+
},
|
547 |
+
"128068": {
|
548 |
+
"content": "<|reserved_special_token_60|>",
|
549 |
+
"lstrip": false,
|
550 |
+
"normalized": false,
|
551 |
+
"rstrip": false,
|
552 |
+
"single_word": false,
|
553 |
+
"special": true
|
554 |
+
},
|
555 |
+
"128069": {
|
556 |
+
"content": "<|reserved_special_token_61|>",
|
557 |
+
"lstrip": false,
|
558 |
+
"normalized": false,
|
559 |
+
"rstrip": false,
|
560 |
+
"single_word": false,
|
561 |
+
"special": true
|
562 |
+
},
|
563 |
+
"128070": {
|
564 |
+
"content": "<|reserved_special_token_62|>",
|
565 |
+
"lstrip": false,
|
566 |
+
"normalized": false,
|
567 |
+
"rstrip": false,
|
568 |
+
"single_word": false,
|
569 |
+
"special": true
|
570 |
+
},
|
571 |
+
"128071": {
|
572 |
+
"content": "<|reserved_special_token_63|>",
|
573 |
+
"lstrip": false,
|
574 |
+
"normalized": false,
|
575 |
+
"rstrip": false,
|
576 |
+
"single_word": false,
|
577 |
+
"special": true
|
578 |
+
},
|
579 |
+
"128072": {
|
580 |
+
"content": "<|reserved_special_token_64|>",
|
581 |
+
"lstrip": false,
|
582 |
+
"normalized": false,
|
583 |
+
"rstrip": false,
|
584 |
+
"single_word": false,
|
585 |
+
"special": true
|
586 |
+
},
|
587 |
+
"128073": {
|
588 |
+
"content": "<|reserved_special_token_65|>",
|
589 |
+
"lstrip": false,
|
590 |
+
"normalized": false,
|
591 |
+
"rstrip": false,
|
592 |
+
"single_word": false,
|
593 |
+
"special": true
|
594 |
+
},
|
595 |
+
"128074": {
|
596 |
+
"content": "<|reserved_special_token_66|>",
|
597 |
+
"lstrip": false,
|
598 |
+
"normalized": false,
|
599 |
+
"rstrip": false,
|
600 |
+
"single_word": false,
|
601 |
+
"special": true
|
602 |
+
},
|
603 |
+
"128075": {
|
604 |
+
"content": "<|reserved_special_token_67|>",
|
605 |
+
"lstrip": false,
|
606 |
+
"normalized": false,
|
607 |
+
"rstrip": false,
|
608 |
+
"single_word": false,
|
609 |
+
"special": true
|
610 |
+
},
|
611 |
+
"128076": {
|
612 |
+
"content": "<|reserved_special_token_68|>",
|
613 |
+
"lstrip": false,
|
614 |
+
"normalized": false,
|
615 |
+
"rstrip": false,
|
616 |
+
"single_word": false,
|
617 |
+
"special": true
|
618 |
+
},
|
619 |
+
"128077": {
|
620 |
+
"content": "<|reserved_special_token_69|>",
|
621 |
+
"lstrip": false,
|
622 |
+
"normalized": false,
|
623 |
+
"rstrip": false,
|
624 |
+
"single_word": false,
|
625 |
+
"special": true
|
626 |
+
},
|
627 |
+
"128078": {
|
628 |
+
"content": "<|reserved_special_token_70|>",
|
629 |
+
"lstrip": false,
|
630 |
+
"normalized": false,
|
631 |
+
"rstrip": false,
|
632 |
+
"single_word": false,
|
633 |
+
"special": true
|
634 |
+
},
|
635 |
+
"128079": {
|
636 |
+
"content": "<|reserved_special_token_71|>",
|
637 |
+
"lstrip": false,
|
638 |
+
"normalized": false,
|
639 |
+
"rstrip": false,
|
640 |
+
"single_word": false,
|
641 |
+
"special": true
|
642 |
+
},
|
643 |
+
"128080": {
|
644 |
+
"content": "<|reserved_special_token_72|>",
|
645 |
+
"lstrip": false,
|
646 |
+
"normalized": false,
|
647 |
+
"rstrip": false,
|
648 |
+
"single_word": false,
|
649 |
+
"special": true
|
650 |
+
},
|
651 |
+
"128081": {
|
652 |
+
"content": "<|reserved_special_token_73|>",
|
653 |
+
"lstrip": false,
|
654 |
+
"normalized": false,
|
655 |
+
"rstrip": false,
|
656 |
+
"single_word": false,
|
657 |
+
"special": true
|
658 |
+
},
|
659 |
+
"128082": {
|
660 |
+
"content": "<|reserved_special_token_74|>",
|
661 |
+
"lstrip": false,
|
662 |
+
"normalized": false,
|
663 |
+
"rstrip": false,
|
664 |
+
"single_word": false,
|
665 |
+
"special": true
|
666 |
+
},
|
667 |
+
"128083": {
|
668 |
+
"content": "<|reserved_special_token_75|>",
|
669 |
+
"lstrip": false,
|
670 |
+
"normalized": false,
|
671 |
+
"rstrip": false,
|
672 |
+
"single_word": false,
|
673 |
+
"special": true
|
674 |
+
},
|
675 |
+
"128084": {
|
676 |
+
"content": "<|reserved_special_token_76|>",
|
677 |
+
"lstrip": false,
|
678 |
+
"normalized": false,
|
679 |
+
"rstrip": false,
|
680 |
+
"single_word": false,
|
681 |
+
"special": true
|
682 |
+
},
|
683 |
+
"128085": {
|
684 |
+
"content": "<|reserved_special_token_77|>",
|
685 |
+
"lstrip": false,
|
686 |
+
"normalized": false,
|
687 |
+
"rstrip": false,
|
688 |
+
"single_word": false,
|
689 |
+
"special": true
|
690 |
+
},
|
691 |
+
"128086": {
|
692 |
+
"content": "<|reserved_special_token_78|>",
|
693 |
+
"lstrip": false,
|
694 |
+
"normalized": false,
|
695 |
+
"rstrip": false,
|
696 |
+
"single_word": false,
|
697 |
+
"special": true
|
698 |
+
},
|
699 |
+
"128087": {
|
700 |
+
"content": "<|reserved_special_token_79|>",
|
701 |
+
"lstrip": false,
|
702 |
+
"normalized": false,
|
703 |
+
"rstrip": false,
|
704 |
+
"single_word": false,
|
705 |
+
"special": true
|
706 |
+
},
|
707 |
+
"128088": {
|
708 |
+
"content": "<|reserved_special_token_80|>",
|
709 |
+
"lstrip": false,
|
710 |
+
"normalized": false,
|
711 |
+
"rstrip": false,
|
712 |
+
"single_word": false,
|
713 |
+
"special": true
|
714 |
+
},
|
715 |
+
"128089": {
|
716 |
+
"content": "<|reserved_special_token_81|>",
|
717 |
+
"lstrip": false,
|
718 |
+
"normalized": false,
|
719 |
+
"rstrip": false,
|
720 |
+
"single_word": false,
|
721 |
+
"special": true
|
722 |
+
},
|
723 |
+
"128090": {
|
724 |
+
"content": "<|reserved_special_token_82|>",
|
725 |
+
"lstrip": false,
|
726 |
+
"normalized": false,
|
727 |
+
"rstrip": false,
|
728 |
+
"single_word": false,
|
729 |
+
"special": true
|
730 |
+
},
|
731 |
+
"128091": {
|
732 |
+
"content": "<|reserved_special_token_83|>",
|
733 |
+
"lstrip": false,
|
734 |
+
"normalized": false,
|
735 |
+
"rstrip": false,
|
736 |
+
"single_word": false,
|
737 |
+
"special": true
|
738 |
+
},
|
739 |
+
"128092": {
|
740 |
+
"content": "<|reserved_special_token_84|>",
|
741 |
+
"lstrip": false,
|
742 |
+
"normalized": false,
|
743 |
+
"rstrip": false,
|
744 |
+
"single_word": false,
|
745 |
+
"special": true
|
746 |
+
},
|
747 |
+
"128093": {
|
748 |
+
"content": "<|reserved_special_token_85|>",
|
749 |
+
"lstrip": false,
|
750 |
+
"normalized": false,
|
751 |
+
"rstrip": false,
|
752 |
+
"single_word": false,
|
753 |
+
"special": true
|
754 |
+
},
|
755 |
+
"128094": {
|
756 |
+
"content": "<|reserved_special_token_86|>",
|
757 |
+
"lstrip": false,
|
758 |
+
"normalized": false,
|
759 |
+
"rstrip": false,
|
760 |
+
"single_word": false,
|
761 |
+
"special": true
|
762 |
+
},
|
763 |
+
"128095": {
|
764 |
+
"content": "<|reserved_special_token_87|>",
|
765 |
+
"lstrip": false,
|
766 |
+
"normalized": false,
|
767 |
+
"rstrip": false,
|
768 |
+
"single_word": false,
|
769 |
+
"special": true
|
770 |
+
},
|
771 |
+
"128096": {
|
772 |
+
"content": "<|reserved_special_token_88|>",
|
773 |
+
"lstrip": false,
|
774 |
+
"normalized": false,
|
775 |
+
"rstrip": false,
|
776 |
+
"single_word": false,
|
777 |
+
"special": true
|
778 |
+
},
|
779 |
+
"128097": {
|
780 |
+
"content": "<|reserved_special_token_89|>",
|
781 |
+
"lstrip": false,
|
782 |
+
"normalized": false,
|
783 |
+
"rstrip": false,
|
784 |
+
"single_word": false,
|
785 |
+
"special": true
|
786 |
+
},
|
787 |
+
"128098": {
|
788 |
+
"content": "<|reserved_special_token_90|>",
|
789 |
+
"lstrip": false,
|
790 |
+
"normalized": false,
|
791 |
+
"rstrip": false,
|
792 |
+
"single_word": false,
|
793 |
+
"special": true
|
794 |
+
},
|
795 |
+
"128099": {
|
796 |
+
"content": "<|reserved_special_token_91|>",
|
797 |
+
"lstrip": false,
|
798 |
+
"normalized": false,
|
799 |
+
"rstrip": false,
|
800 |
+
"single_word": false,
|
801 |
+
"special": true
|
802 |
+
},
|
803 |
+
"128100": {
|
804 |
+
"content": "<|reserved_special_token_92|>",
|
805 |
+
"lstrip": false,
|
806 |
+
"normalized": false,
|
807 |
+
"rstrip": false,
|
808 |
+
"single_word": false,
|
809 |
+
"special": true
|
810 |
+
},
|
811 |
+
"128101": {
|
812 |
+
"content": "<|reserved_special_token_93|>",
|
813 |
+
"lstrip": false,
|
814 |
+
"normalized": false,
|
815 |
+
"rstrip": false,
|
816 |
+
"single_word": false,
|
817 |
+
"special": true
|
818 |
+
},
|
819 |
+
"128102": {
|
820 |
+
"content": "<|reserved_special_token_94|>",
|
821 |
+
"lstrip": false,
|
822 |
+
"normalized": false,
|
823 |
+
"rstrip": false,
|
824 |
+
"single_word": false,
|
825 |
+
"special": true
|
826 |
+
},
|
827 |
+
"128103": {
|
828 |
+
"content": "<|reserved_special_token_95|>",
|
829 |
+
"lstrip": false,
|
830 |
+
"normalized": false,
|
831 |
+
"rstrip": false,
|
832 |
+
"single_word": false,
|
833 |
+
"special": true
|
834 |
+
},
|
835 |
+
"128104": {
|
836 |
+
"content": "<|reserved_special_token_96|>",
|
837 |
+
"lstrip": false,
|
838 |
+
"normalized": false,
|
839 |
+
"rstrip": false,
|
840 |
+
"single_word": false,
|
841 |
+
"special": true
|
842 |
+
},
|
843 |
+
"128105": {
|
844 |
+
"content": "<|reserved_special_token_97|>",
|
845 |
+
"lstrip": false,
|
846 |
+
"normalized": false,
|
847 |
+
"rstrip": false,
|
848 |
+
"single_word": false,
|
849 |
+
"special": true
|
850 |
+
},
|
851 |
+
"128106": {
|
852 |
+
"content": "<|reserved_special_token_98|>",
|
853 |
+
"lstrip": false,
|
854 |
+
"normalized": false,
|
855 |
+
"rstrip": false,
|
856 |
+
"single_word": false,
|
857 |
+
"special": true
|
858 |
+
},
|
859 |
+
"128107": {
|
860 |
+
"content": "<|reserved_special_token_99|>",
|
861 |
+
"lstrip": false,
|
862 |
+
"normalized": false,
|
863 |
+
"rstrip": false,
|
864 |
+
"single_word": false,
|
865 |
+
"special": true
|
866 |
+
},
|
867 |
+
"128108": {
|
868 |
+
"content": "<|reserved_special_token_100|>",
|
869 |
+
"lstrip": false,
|
870 |
+
"normalized": false,
|
871 |
+
"rstrip": false,
|
872 |
+
"single_word": false,
|
873 |
+
"special": true
|
874 |
+
},
|
875 |
+
"128109": {
|
876 |
+
"content": "<|reserved_special_token_101|>",
|
877 |
+
"lstrip": false,
|
878 |
+
"normalized": false,
|
879 |
+
"rstrip": false,
|
880 |
+
"single_word": false,
|
881 |
+
"special": true
|
882 |
+
},
|
883 |
+
"128110": {
|
884 |
+
"content": "<|reserved_special_token_102|>",
|
885 |
+
"lstrip": false,
|
886 |
+
"normalized": false,
|
887 |
+
"rstrip": false,
|
888 |
+
"single_word": false,
|
889 |
+
"special": true
|
890 |
+
},
|
891 |
+
"128111": {
|
892 |
+
"content": "<|reserved_special_token_103|>",
|
893 |
+
"lstrip": false,
|
894 |
+
"normalized": false,
|
895 |
+
"rstrip": false,
|
896 |
+
"single_word": false,
|
897 |
+
"special": true
|
898 |
+
},
|
899 |
+
"128112": {
|
900 |
+
"content": "<|reserved_special_token_104|>",
|
901 |
+
"lstrip": false,
|
902 |
+
"normalized": false,
|
903 |
+
"rstrip": false,
|
904 |
+
"single_word": false,
|
905 |
+
"special": true
|
906 |
+
},
|
907 |
+
"128113": {
|
908 |
+
"content": "<|reserved_special_token_105|>",
|
909 |
+
"lstrip": false,
|
910 |
+
"normalized": false,
|
911 |
+
"rstrip": false,
|
912 |
+
"single_word": false,
|
913 |
+
"special": true
|
914 |
+
},
|
915 |
+
"128114": {
|
916 |
+
"content": "<|reserved_special_token_106|>",
|
917 |
+
"lstrip": false,
|
918 |
+
"normalized": false,
|
919 |
+
"rstrip": false,
|
920 |
+
"single_word": false,
|
921 |
+
"special": true
|
922 |
+
},
|
923 |
+
"128115": {
|
924 |
+
"content": "<|reserved_special_token_107|>",
|
925 |
+
"lstrip": false,
|
926 |
+
"normalized": false,
|
927 |
+
"rstrip": false,
|
928 |
+
"single_word": false,
|
929 |
+
"special": true
|
930 |
+
},
|
931 |
+
"128116": {
|
932 |
+
"content": "<|reserved_special_token_108|>",
|
933 |
+
"lstrip": false,
|
934 |
+
"normalized": false,
|
935 |
+
"rstrip": false,
|
936 |
+
"single_word": false,
|
937 |
+
"special": true
|
938 |
+
},
|
939 |
+
"128117": {
|
940 |
+
"content": "<|reserved_special_token_109|>",
|
941 |
+
"lstrip": false,
|
942 |
+
"normalized": false,
|
943 |
+
"rstrip": false,
|
944 |
+
"single_word": false,
|
945 |
+
"special": true
|
946 |
+
},
|
947 |
+
"128118": {
|
948 |
+
"content": "<|reserved_special_token_110|>",
|
949 |
+
"lstrip": false,
|
950 |
+
"normalized": false,
|
951 |
+
"rstrip": false,
|
952 |
+
"single_word": false,
|
953 |
+
"special": true
|
954 |
+
},
|
955 |
+
"128119": {
|
956 |
+
"content": "<|reserved_special_token_111|>",
|
957 |
+
"lstrip": false,
|
958 |
+
"normalized": false,
|
959 |
+
"rstrip": false,
|
960 |
+
"single_word": false,
|
961 |
+
"special": true
|
962 |
+
},
|
963 |
+
"128120": {
|
964 |
+
"content": "<|reserved_special_token_112|>",
|
965 |
+
"lstrip": false,
|
966 |
+
"normalized": false,
|
967 |
+
"rstrip": false,
|
968 |
+
"single_word": false,
|
969 |
+
"special": true
|
970 |
+
},
|
971 |
+
"128121": {
|
972 |
+
"content": "<|reserved_special_token_113|>",
|
973 |
+
"lstrip": false,
|
974 |
+
"normalized": false,
|
975 |
+
"rstrip": false,
|
976 |
+
"single_word": false,
|
977 |
+
"special": true
|
978 |
+
},
|
979 |
+
"128122": {
|
980 |
+
"content": "<|reserved_special_token_114|>",
|
981 |
+
"lstrip": false,
|
982 |
+
"normalized": false,
|
983 |
+
"rstrip": false,
|
984 |
+
"single_word": false,
|
985 |
+
"special": true
|
986 |
+
},
|
987 |
+
"128123": {
|
988 |
+
"content": "<|reserved_special_token_115|>",
|
989 |
+
"lstrip": false,
|
990 |
+
"normalized": false,
|
991 |
+
"rstrip": false,
|
992 |
+
"single_word": false,
|
993 |
+
"special": true
|
994 |
+
},
|
995 |
+
"128124": {
|
996 |
+
"content": "<|reserved_special_token_116|>",
|
997 |
+
"lstrip": false,
|
998 |
+
"normalized": false,
|
999 |
+
"rstrip": false,
|
1000 |
+
"single_word": false,
|
1001 |
+
"special": true
|
1002 |
+
},
|
1003 |
+
"128125": {
|
1004 |
+
"content": "<|reserved_special_token_117|>",
|
1005 |
+
"lstrip": false,
|
1006 |
+
"normalized": false,
|
1007 |
+
"rstrip": false,
|
1008 |
+
"single_word": false,
|
1009 |
+
"special": true
|
1010 |
+
},
|
1011 |
+
"128126": {
|
1012 |
+
"content": "<|reserved_special_token_118|>",
|
1013 |
+
"lstrip": false,
|
1014 |
+
"normalized": false,
|
1015 |
+
"rstrip": false,
|
1016 |
+
"single_word": false,
|
1017 |
+
"special": true
|
1018 |
+
},
|
1019 |
+
"128127": {
|
1020 |
+
"content": "<|reserved_special_token_119|>",
|
1021 |
+
"lstrip": false,
|
1022 |
+
"normalized": false,
|
1023 |
+
"rstrip": false,
|
1024 |
+
"single_word": false,
|
1025 |
+
"special": true
|
1026 |
+
},
|
1027 |
+
"128128": {
|
1028 |
+
"content": "<|reserved_special_token_120|>",
|
1029 |
+
"lstrip": false,
|
1030 |
+
"normalized": false,
|
1031 |
+
"rstrip": false,
|
1032 |
+
"single_word": false,
|
1033 |
+
"special": true
|
1034 |
+
},
|
1035 |
+
"128129": {
|
1036 |
+
"content": "<|reserved_special_token_121|>",
|
1037 |
+
"lstrip": false,
|
1038 |
+
"normalized": false,
|
1039 |
+
"rstrip": false,
|
1040 |
+
"single_word": false,
|
1041 |
+
"special": true
|
1042 |
+
},
|
1043 |
+
"128130": {
|
1044 |
+
"content": "<|reserved_special_token_122|>",
|
1045 |
+
"lstrip": false,
|
1046 |
+
"normalized": false,
|
1047 |
+
"rstrip": false,
|
1048 |
+
"single_word": false,
|
1049 |
+
"special": true
|
1050 |
+
},
|
1051 |
+
"128131": {
|
1052 |
+
"content": "<|reserved_special_token_123|>",
|
1053 |
+
"lstrip": false,
|
1054 |
+
"normalized": false,
|
1055 |
+
"rstrip": false,
|
1056 |
+
"single_word": false,
|
1057 |
+
"special": true
|
1058 |
+
},
|
1059 |
+
"128132": {
|
1060 |
+
"content": "<|reserved_special_token_124|>",
|
1061 |
+
"lstrip": false,
|
1062 |
+
"normalized": false,
|
1063 |
+
"rstrip": false,
|
1064 |
+
"single_word": false,
|
1065 |
+
"special": true
|
1066 |
+
},
|
1067 |
+
"128133": {
|
1068 |
+
"content": "<|reserved_special_token_125|>",
|
1069 |
+
"lstrip": false,
|
1070 |
+
"normalized": false,
|
1071 |
+
"rstrip": false,
|
1072 |
+
"single_word": false,
|
1073 |
+
"special": true
|
1074 |
+
},
|
1075 |
+
"128134": {
|
1076 |
+
"content": "<|reserved_special_token_126|>",
|
1077 |
+
"lstrip": false,
|
1078 |
+
"normalized": false,
|
1079 |
+
"rstrip": false,
|
1080 |
+
"single_word": false,
|
1081 |
+
"special": true
|
1082 |
+
},
|
1083 |
+
"128135": {
|
1084 |
+
"content": "<|reserved_special_token_127|>",
|
1085 |
+
"lstrip": false,
|
1086 |
+
"normalized": false,
|
1087 |
+
"rstrip": false,
|
1088 |
+
"single_word": false,
|
1089 |
+
"special": true
|
1090 |
+
},
|
1091 |
+
"128136": {
|
1092 |
+
"content": "<|reserved_special_token_128|>",
|
1093 |
+
"lstrip": false,
|
1094 |
+
"normalized": false,
|
1095 |
+
"rstrip": false,
|
1096 |
+
"single_word": false,
|
1097 |
+
"special": true
|
1098 |
+
},
|
1099 |
+
"128137": {
|
1100 |
+
"content": "<|reserved_special_token_129|>",
|
1101 |
+
"lstrip": false,
|
1102 |
+
"normalized": false,
|
1103 |
+
"rstrip": false,
|
1104 |
+
"single_word": false,
|
1105 |
+
"special": true
|
1106 |
+
},
|
1107 |
+
"128138": {
|
1108 |
+
"content": "<|reserved_special_token_130|>",
|
1109 |
+
"lstrip": false,
|
1110 |
+
"normalized": false,
|
1111 |
+
"rstrip": false,
|
1112 |
+
"single_word": false,
|
1113 |
+
"special": true
|
1114 |
+
},
|
1115 |
+
"128139": {
|
1116 |
+
"content": "<|reserved_special_token_131|>",
|
1117 |
+
"lstrip": false,
|
1118 |
+
"normalized": false,
|
1119 |
+
"rstrip": false,
|
1120 |
+
"single_word": false,
|
1121 |
+
"special": true
|
1122 |
+
},
|
1123 |
+
"128140": {
|
1124 |
+
"content": "<|reserved_special_token_132|>",
|
1125 |
+
"lstrip": false,
|
1126 |
+
"normalized": false,
|
1127 |
+
"rstrip": false,
|
1128 |
+
"single_word": false,
|
1129 |
+
"special": true
|
1130 |
+
},
|
1131 |
+
"128141": {
|
1132 |
+
"content": "<|reserved_special_token_133|>",
|
1133 |
+
"lstrip": false,
|
1134 |
+
"normalized": false,
|
1135 |
+
"rstrip": false,
|
1136 |
+
"single_word": false,
|
1137 |
+
"special": true
|
1138 |
+
},
|
1139 |
+
"128142": {
|
1140 |
+
"content": "<|reserved_special_token_134|>",
|
1141 |
+
"lstrip": false,
|
1142 |
+
"normalized": false,
|
1143 |
+
"rstrip": false,
|
1144 |
+
"single_word": false,
|
1145 |
+
"special": true
|
1146 |
+
},
|
1147 |
+
"128143": {
|
1148 |
+
"content": "<|reserved_special_token_135|>",
|
1149 |
+
"lstrip": false,
|
1150 |
+
"normalized": false,
|
1151 |
+
"rstrip": false,
|
1152 |
+
"single_word": false,
|
1153 |
+
"special": true
|
1154 |
+
},
|
1155 |
+
"128144": {
|
1156 |
+
"content": "<|reserved_special_token_136|>",
|
1157 |
+
"lstrip": false,
|
1158 |
+
"normalized": false,
|
1159 |
+
"rstrip": false,
|
1160 |
+
"single_word": false,
|
1161 |
+
"special": true
|
1162 |
+
},
|
1163 |
+
"128145": {
|
1164 |
+
"content": "<|reserved_special_token_137|>",
|
1165 |
+
"lstrip": false,
|
1166 |
+
"normalized": false,
|
1167 |
+
"rstrip": false,
|
1168 |
+
"single_word": false,
|
1169 |
+
"special": true
|
1170 |
+
},
|
1171 |
+
"128146": {
|
1172 |
+
"content": "<|reserved_special_token_138|>",
|
1173 |
+
"lstrip": false,
|
1174 |
+
"normalized": false,
|
1175 |
+
"rstrip": false,
|
1176 |
+
"single_word": false,
|
1177 |
+
"special": true
|
1178 |
+
},
|
1179 |
+
"128147": {
|
1180 |
+
"content": "<|reserved_special_token_139|>",
|
1181 |
+
"lstrip": false,
|
1182 |
+
"normalized": false,
|
1183 |
+
"rstrip": false,
|
1184 |
+
"single_word": false,
|
1185 |
+
"special": true
|
1186 |
+
},
|
1187 |
+
"128148": {
|
1188 |
+
"content": "<|reserved_special_token_140|>",
|
1189 |
+
"lstrip": false,
|
1190 |
+
"normalized": false,
|
1191 |
+
"rstrip": false,
|
1192 |
+
"single_word": false,
|
1193 |
+
"special": true
|
1194 |
+
},
|
1195 |
+
"128149": {
|
1196 |
+
"content": "<|reserved_special_token_141|>",
|
1197 |
+
"lstrip": false,
|
1198 |
+
"normalized": false,
|
1199 |
+
"rstrip": false,
|
1200 |
+
"single_word": false,
|
1201 |
+
"special": true
|
1202 |
+
},
|
1203 |
+
"128150": {
|
1204 |
+
"content": "<|reserved_special_token_142|>",
|
1205 |
+
"lstrip": false,
|
1206 |
+
"normalized": false,
|
1207 |
+
"rstrip": false,
|
1208 |
+
"single_word": false,
|
1209 |
+
"special": true
|
1210 |
+
},
|
1211 |
+
"128151": {
|
1212 |
+
"content": "<|reserved_special_token_143|>",
|
1213 |
+
"lstrip": false,
|
1214 |
+
"normalized": false,
|
1215 |
+
"rstrip": false,
|
1216 |
+
"single_word": false,
|
1217 |
+
"special": true
|
1218 |
+
},
|
1219 |
+
"128152": {
|
1220 |
+
"content": "<|reserved_special_token_144|>",
|
1221 |
+
"lstrip": false,
|
1222 |
+
"normalized": false,
|
1223 |
+
"rstrip": false,
|
1224 |
+
"single_word": false,
|
1225 |
+
"special": true
|
1226 |
+
},
|
1227 |
+
"128153": {
|
1228 |
+
"content": "<|reserved_special_token_145|>",
|
1229 |
+
"lstrip": false,
|
1230 |
+
"normalized": false,
|
1231 |
+
"rstrip": false,
|
1232 |
+
"single_word": false,
|
1233 |
+
"special": true
|
1234 |
+
},
|
1235 |
+
"128154": {
|
1236 |
+
"content": "<|reserved_special_token_146|>",
|
1237 |
+
"lstrip": false,
|
1238 |
+
"normalized": false,
|
1239 |
+
"rstrip": false,
|
1240 |
+
"single_word": false,
|
1241 |
+
"special": true
|
1242 |
+
},
|
1243 |
+
"128155": {
|
1244 |
+
"content": "<|reserved_special_token_147|>",
|
1245 |
+
"lstrip": false,
|
1246 |
+
"normalized": false,
|
1247 |
+
"rstrip": false,
|
1248 |
+
"single_word": false,
|
1249 |
+
"special": true
|
1250 |
+
},
|
1251 |
+
"128156": {
|
1252 |
+
"content": "<|reserved_special_token_148|>",
|
1253 |
+
"lstrip": false,
|
1254 |
+
"normalized": false,
|
1255 |
+
"rstrip": false,
|
1256 |
+
"single_word": false,
|
1257 |
+
"special": true
|
1258 |
+
},
|
1259 |
+
"128157": {
|
1260 |
+
"content": "<|reserved_special_token_149|>",
|
1261 |
+
"lstrip": false,
|
1262 |
+
"normalized": false,
|
1263 |
+
"rstrip": false,
|
1264 |
+
"single_word": false,
|
1265 |
+
"special": true
|
1266 |
+
},
|
1267 |
+
"128158": {
|
1268 |
+
"content": "<|reserved_special_token_150|>",
|
1269 |
+
"lstrip": false,
|
1270 |
+
"normalized": false,
|
1271 |
+
"rstrip": false,
|
1272 |
+
"single_word": false,
|
1273 |
+
"special": true
|
1274 |
+
},
|
1275 |
+
"128159": {
|
1276 |
+
"content": "<|reserved_special_token_151|>",
|
1277 |
+
"lstrip": false,
|
1278 |
+
"normalized": false,
|
1279 |
+
"rstrip": false,
|
1280 |
+
"single_word": false,
|
1281 |
+
"special": true
|
1282 |
+
},
|
1283 |
+
"128160": {
|
1284 |
+
"content": "<|reserved_special_token_152|>",
|
1285 |
+
"lstrip": false,
|
1286 |
+
"normalized": false,
|
1287 |
+
"rstrip": false,
|
1288 |
+
"single_word": false,
|
1289 |
+
"special": true
|
1290 |
+
},
|
1291 |
+
"128161": {
|
1292 |
+
"content": "<|reserved_special_token_153|>",
|
1293 |
+
"lstrip": false,
|
1294 |
+
"normalized": false,
|
1295 |
+
"rstrip": false,
|
1296 |
+
"single_word": false,
|
1297 |
+
"special": true
|
1298 |
+
},
|
1299 |
+
"128162": {
|
1300 |
+
"content": "<|reserved_special_token_154|>",
|
1301 |
+
"lstrip": false,
|
1302 |
+
"normalized": false,
|
1303 |
+
"rstrip": false,
|
1304 |
+
"single_word": false,
|
1305 |
+
"special": true
|
1306 |
+
},
|
1307 |
+
"128163": {
|
1308 |
+
"content": "<|reserved_special_token_155|>",
|
1309 |
+
"lstrip": false,
|
1310 |
+
"normalized": false,
|
1311 |
+
"rstrip": false,
|
1312 |
+
"single_word": false,
|
1313 |
+
"special": true
|
1314 |
+
},
|
1315 |
+
"128164": {
|
1316 |
+
"content": "<|reserved_special_token_156|>",
|
1317 |
+
"lstrip": false,
|
1318 |
+
"normalized": false,
|
1319 |
+
"rstrip": false,
|
1320 |
+
"single_word": false,
|
1321 |
+
"special": true
|
1322 |
+
},
|
1323 |
+
"128165": {
|
1324 |
+
"content": "<|reserved_special_token_157|>",
|
1325 |
+
"lstrip": false,
|
1326 |
+
"normalized": false,
|
1327 |
+
"rstrip": false,
|
1328 |
+
"single_word": false,
|
1329 |
+
"special": true
|
1330 |
+
},
|
1331 |
+
"128166": {
|
1332 |
+
"content": "<|reserved_special_token_158|>",
|
1333 |
+
"lstrip": false,
|
1334 |
+
"normalized": false,
|
1335 |
+
"rstrip": false,
|
1336 |
+
"single_word": false,
|
1337 |
+
"special": true
|
1338 |
+
},
|
1339 |
+
"128167": {
|
1340 |
+
"content": "<|reserved_special_token_159|>",
|
1341 |
+
"lstrip": false,
|
1342 |
+
"normalized": false,
|
1343 |
+
"rstrip": false,
|
1344 |
+
"single_word": false,
|
1345 |
+
"special": true
|
1346 |
+
},
|
1347 |
+
"128168": {
|
1348 |
+
"content": "<|reserved_special_token_160|>",
|
1349 |
+
"lstrip": false,
|
1350 |
+
"normalized": false,
|
1351 |
+
"rstrip": false,
|
1352 |
+
"single_word": false,
|
1353 |
+
"special": true
|
1354 |
+
},
|
1355 |
+
"128169": {
|
1356 |
+
"content": "<|reserved_special_token_161|>",
|
1357 |
+
"lstrip": false,
|
1358 |
+
"normalized": false,
|
1359 |
+
"rstrip": false,
|
1360 |
+
"single_word": false,
|
1361 |
+
"special": true
|
1362 |
+
},
|
1363 |
+
"128170": {
|
1364 |
+
"content": "<|reserved_special_token_162|>",
|
1365 |
+
"lstrip": false,
|
1366 |
+
"normalized": false,
|
1367 |
+
"rstrip": false,
|
1368 |
+
"single_word": false,
|
1369 |
+
"special": true
|
1370 |
+
},
|
1371 |
+
"128171": {
|
1372 |
+
"content": "<|reserved_special_token_163|>",
|
1373 |
+
"lstrip": false,
|
1374 |
+
"normalized": false,
|
1375 |
+
"rstrip": false,
|
1376 |
+
"single_word": false,
|
1377 |
+
"special": true
|
1378 |
+
},
|
1379 |
+
"128172": {
|
1380 |
+
"content": "<|reserved_special_token_164|>",
|
1381 |
+
"lstrip": false,
|
1382 |
+
"normalized": false,
|
1383 |
+
"rstrip": false,
|
1384 |
+
"single_word": false,
|
1385 |
+
"special": true
|
1386 |
+
},
|
1387 |
+
"128173": {
|
1388 |
+
"content": "<|reserved_special_token_165|>",
|
1389 |
+
"lstrip": false,
|
1390 |
+
"normalized": false,
|
1391 |
+
"rstrip": false,
|
1392 |
+
"single_word": false,
|
1393 |
+
"special": true
|
1394 |
+
},
|
1395 |
+
"128174": {
|
1396 |
+
"content": "<|reserved_special_token_166|>",
|
1397 |
+
"lstrip": false,
|
1398 |
+
"normalized": false,
|
1399 |
+
"rstrip": false,
|
1400 |
+
"single_word": false,
|
1401 |
+
"special": true
|
1402 |
+
},
|
1403 |
+
"128175": {
|
1404 |
+
"content": "<|reserved_special_token_167|>",
|
1405 |
+
"lstrip": false,
|
1406 |
+
"normalized": false,
|
1407 |
+
"rstrip": false,
|
1408 |
+
"single_word": false,
|
1409 |
+
"special": true
|
1410 |
+
},
|
1411 |
+
"128176": {
|
1412 |
+
"content": "<|reserved_special_token_168|>",
|
1413 |
+
"lstrip": false,
|
1414 |
+
"normalized": false,
|
1415 |
+
"rstrip": false,
|
1416 |
+
"single_word": false,
|
1417 |
+
"special": true
|
1418 |
+
},
|
1419 |
+
"128177": {
|
1420 |
+
"content": "<|reserved_special_token_169|>",
|
1421 |
+
"lstrip": false,
|
1422 |
+
"normalized": false,
|
1423 |
+
"rstrip": false,
|
1424 |
+
"single_word": false,
|
1425 |
+
"special": true
|
1426 |
+
},
|
1427 |
+
"128178": {
|
1428 |
+
"content": "<|reserved_special_token_170|>",
|
1429 |
+
"lstrip": false,
|
1430 |
+
"normalized": false,
|
1431 |
+
"rstrip": false,
|
1432 |
+
"single_word": false,
|
1433 |
+
"special": true
|
1434 |
+
},
|
1435 |
+
"128179": {
|
1436 |
+
"content": "<|reserved_special_token_171|>",
|
1437 |
+
"lstrip": false,
|
1438 |
+
"normalized": false,
|
1439 |
+
"rstrip": false,
|
1440 |
+
"single_word": false,
|
1441 |
+
"special": true
|
1442 |
+
},
|
1443 |
+
"128180": {
|
1444 |
+
"content": "<|reserved_special_token_172|>",
|
1445 |
+
"lstrip": false,
|
1446 |
+
"normalized": false,
|
1447 |
+
"rstrip": false,
|
1448 |
+
"single_word": false,
|
1449 |
+
"special": true
|
1450 |
+
},
|
1451 |
+
"128181": {
|
1452 |
+
"content": "<|reserved_special_token_173|>",
|
1453 |
+
"lstrip": false,
|
1454 |
+
"normalized": false,
|
1455 |
+
"rstrip": false,
|
1456 |
+
"single_word": false,
|
1457 |
+
"special": true
|
1458 |
+
},
|
1459 |
+
"128182": {
|
1460 |
+
"content": "<|reserved_special_token_174|>",
|
1461 |
+
"lstrip": false,
|
1462 |
+
"normalized": false,
|
1463 |
+
"rstrip": false,
|
1464 |
+
"single_word": false,
|
1465 |
+
"special": true
|
1466 |
+
},
|
1467 |
+
"128183": {
|
1468 |
+
"content": "<|reserved_special_token_175|>",
|
1469 |
+
"lstrip": false,
|
1470 |
+
"normalized": false,
|
1471 |
+
"rstrip": false,
|
1472 |
+
"single_word": false,
|
1473 |
+
"special": true
|
1474 |
+
},
|
1475 |
+
"128184": {
|
1476 |
+
"content": "<|reserved_special_token_176|>",
|
1477 |
+
"lstrip": false,
|
1478 |
+
"normalized": false,
|
1479 |
+
"rstrip": false,
|
1480 |
+
"single_word": false,
|
1481 |
+
"special": true
|
1482 |
+
},
|
1483 |
+
"128185": {
|
1484 |
+
"content": "<|reserved_special_token_177|>",
|
1485 |
+
"lstrip": false,
|
1486 |
+
"normalized": false,
|
1487 |
+
"rstrip": false,
|
1488 |
+
"single_word": false,
|
1489 |
+
"special": true
|
1490 |
+
},
|
1491 |
+
"128186": {
|
1492 |
+
"content": "<|reserved_special_token_178|>",
|
1493 |
+
"lstrip": false,
|
1494 |
+
"normalized": false,
|
1495 |
+
"rstrip": false,
|
1496 |
+
"single_word": false,
|
1497 |
+
"special": true
|
1498 |
+
},
|
1499 |
+
"128187": {
|
1500 |
+
"content": "<|reserved_special_token_179|>",
|
1501 |
+
"lstrip": false,
|
1502 |
+
"normalized": false,
|
1503 |
+
"rstrip": false,
|
1504 |
+
"single_word": false,
|
1505 |
+
"special": true
|
1506 |
+
},
|
1507 |
+
"128188": {
|
1508 |
+
"content": "<|reserved_special_token_180|>",
|
1509 |
+
"lstrip": false,
|
1510 |
+
"normalized": false,
|
1511 |
+
"rstrip": false,
|
1512 |
+
"single_word": false,
|
1513 |
+
"special": true
|
1514 |
+
},
|
1515 |
+
"128189": {
|
1516 |
+
"content": "<|reserved_special_token_181|>",
|
1517 |
+
"lstrip": false,
|
1518 |
+
"normalized": false,
|
1519 |
+
"rstrip": false,
|
1520 |
+
"single_word": false,
|
1521 |
+
"special": true
|
1522 |
+
},
|
1523 |
+
"128190": {
|
1524 |
+
"content": "<|reserved_special_token_182|>",
|
1525 |
+
"lstrip": false,
|
1526 |
+
"normalized": false,
|
1527 |
+
"rstrip": false,
|
1528 |
+
"single_word": false,
|
1529 |
+
"special": true
|
1530 |
+
},
|
1531 |
+
"128191": {
|
1532 |
+
"content": "<|reserved_special_token_183|>",
|
1533 |
+
"lstrip": false,
|
1534 |
+
"normalized": false,
|
1535 |
+
"rstrip": false,
|
1536 |
+
"single_word": false,
|
1537 |
+
"special": true
|
1538 |
+
},
|
1539 |
+
"128192": {
|
1540 |
+
"content": "<|reserved_special_token_184|>",
|
1541 |
+
"lstrip": false,
|
1542 |
+
"normalized": false,
|
1543 |
+
"rstrip": false,
|
1544 |
+
"single_word": false,
|
1545 |
+
"special": true
|
1546 |
+
},
|
1547 |
+
"128193": {
|
1548 |
+
"content": "<|reserved_special_token_185|>",
|
1549 |
+
"lstrip": false,
|
1550 |
+
"normalized": false,
|
1551 |
+
"rstrip": false,
|
1552 |
+
"single_word": false,
|
1553 |
+
"special": true
|
1554 |
+
},
|
1555 |
+
"128194": {
|
1556 |
+
"content": "<|reserved_special_token_186|>",
|
1557 |
+
"lstrip": false,
|
1558 |
+
"normalized": false,
|
1559 |
+
"rstrip": false,
|
1560 |
+
"single_word": false,
|
1561 |
+
"special": true
|
1562 |
+
},
|
1563 |
+
"128195": {
|
1564 |
+
"content": "<|reserved_special_token_187|>",
|
1565 |
+
"lstrip": false,
|
1566 |
+
"normalized": false,
|
1567 |
+
"rstrip": false,
|
1568 |
+
"single_word": false,
|
1569 |
+
"special": true
|
1570 |
+
},
|
1571 |
+
"128196": {
|
1572 |
+
"content": "<|reserved_special_token_188|>",
|
1573 |
+
"lstrip": false,
|
1574 |
+
"normalized": false,
|
1575 |
+
"rstrip": false,
|
1576 |
+
"single_word": false,
|
1577 |
+
"special": true
|
1578 |
+
},
|
1579 |
+
"128197": {
|
1580 |
+
"content": "<|reserved_special_token_189|>",
|
1581 |
+
"lstrip": false,
|
1582 |
+
"normalized": false,
|
1583 |
+
"rstrip": false,
|
1584 |
+
"single_word": false,
|
1585 |
+
"special": true
|
1586 |
+
},
|
1587 |
+
"128198": {
|
1588 |
+
"content": "<|reserved_special_token_190|>",
|
1589 |
+
"lstrip": false,
|
1590 |
+
"normalized": false,
|
1591 |
+
"rstrip": false,
|
1592 |
+
"single_word": false,
|
1593 |
+
"special": true
|
1594 |
+
},
|
1595 |
+
"128199": {
|
1596 |
+
"content": "<|reserved_special_token_191|>",
|
1597 |
+
"lstrip": false,
|
1598 |
+
"normalized": false,
|
1599 |
+
"rstrip": false,
|
1600 |
+
"single_word": false,
|
1601 |
+
"special": true
|
1602 |
+
},
|
1603 |
+
"128200": {
|
1604 |
+
"content": "<|reserved_special_token_192|>",
|
1605 |
+
"lstrip": false,
|
1606 |
+
"normalized": false,
|
1607 |
+
"rstrip": false,
|
1608 |
+
"single_word": false,
|
1609 |
+
"special": true
|
1610 |
+
},
|
1611 |
+
"128201": {
|
1612 |
+
"content": "<|reserved_special_token_193|>",
|
1613 |
+
"lstrip": false,
|
1614 |
+
"normalized": false,
|
1615 |
+
"rstrip": false,
|
1616 |
+
"single_word": false,
|
1617 |
+
"special": true
|
1618 |
+
},
|
1619 |
+
"128202": {
|
1620 |
+
"content": "<|reserved_special_token_194|>",
|
1621 |
+
"lstrip": false,
|
1622 |
+
"normalized": false,
|
1623 |
+
"rstrip": false,
|
1624 |
+
"single_word": false,
|
1625 |
+
"special": true
|
1626 |
+
},
|
1627 |
+
"128203": {
|
1628 |
+
"content": "<|reserved_special_token_195|>",
|
1629 |
+
"lstrip": false,
|
1630 |
+
"normalized": false,
|
1631 |
+
"rstrip": false,
|
1632 |
+
"single_word": false,
|
1633 |
+
"special": true
|
1634 |
+
},
|
1635 |
+
"128204": {
|
1636 |
+
"content": "<|reserved_special_token_196|>",
|
1637 |
+
"lstrip": false,
|
1638 |
+
"normalized": false,
|
1639 |
+
"rstrip": false,
|
1640 |
+
"single_word": false,
|
1641 |
+
"special": true
|
1642 |
+
},
|
1643 |
+
"128205": {
|
1644 |
+
"content": "<|reserved_special_token_197|>",
|
1645 |
+
"lstrip": false,
|
1646 |
+
"normalized": false,
|
1647 |
+
"rstrip": false,
|
1648 |
+
"single_word": false,
|
1649 |
+
"special": true
|
1650 |
+
},
|
1651 |
+
"128206": {
|
1652 |
+
"content": "<|reserved_special_token_198|>",
|
1653 |
+
"lstrip": false,
|
1654 |
+
"normalized": false,
|
1655 |
+
"rstrip": false,
|
1656 |
+
"single_word": false,
|
1657 |
+
"special": true
|
1658 |
+
},
|
1659 |
+
"128207": {
|
1660 |
+
"content": "<|reserved_special_token_199|>",
|
1661 |
+
"lstrip": false,
|
1662 |
+
"normalized": false,
|
1663 |
+
"rstrip": false,
|
1664 |
+
"single_word": false,
|
1665 |
+
"special": true
|
1666 |
+
},
|
1667 |
+
"128208": {
|
1668 |
+
"content": "<|reserved_special_token_200|>",
|
1669 |
+
"lstrip": false,
|
1670 |
+
"normalized": false,
|
1671 |
+
"rstrip": false,
|
1672 |
+
"single_word": false,
|
1673 |
+
"special": true
|
1674 |
+
},
|
1675 |
+
"128209": {
|
1676 |
+
"content": "<|reserved_special_token_201|>",
|
1677 |
+
"lstrip": false,
|
1678 |
+
"normalized": false,
|
1679 |
+
"rstrip": false,
|
1680 |
+
"single_word": false,
|
1681 |
+
"special": true
|
1682 |
+
},
|
1683 |
+
"128210": {
|
1684 |
+
"content": "<|reserved_special_token_202|>",
|
1685 |
+
"lstrip": false,
|
1686 |
+
"normalized": false,
|
1687 |
+
"rstrip": false,
|
1688 |
+
"single_word": false,
|
1689 |
+
"special": true
|
1690 |
+
},
|
1691 |
+
"128211": {
|
1692 |
+
"content": "<|reserved_special_token_203|>",
|
1693 |
+
"lstrip": false,
|
1694 |
+
"normalized": false,
|
1695 |
+
"rstrip": false,
|
1696 |
+
"single_word": false,
|
1697 |
+
"special": true
|
1698 |
+
},
|
1699 |
+
"128212": {
|
1700 |
+
"content": "<|reserved_special_token_204|>",
|
1701 |
+
"lstrip": false,
|
1702 |
+
"normalized": false,
|
1703 |
+
"rstrip": false,
|
1704 |
+
"single_word": false,
|
1705 |
+
"special": true
|
1706 |
+
},
|
1707 |
+
"128213": {
|
1708 |
+
"content": "<|reserved_special_token_205|>",
|
1709 |
+
"lstrip": false,
|
1710 |
+
"normalized": false,
|
1711 |
+
"rstrip": false,
|
1712 |
+
"single_word": false,
|
1713 |
+
"special": true
|
1714 |
+
},
|
1715 |
+
"128214": {
|
1716 |
+
"content": "<|reserved_special_token_206|>",
|
1717 |
+
"lstrip": false,
|
1718 |
+
"normalized": false,
|
1719 |
+
"rstrip": false,
|
1720 |
+
"single_word": false,
|
1721 |
+
"special": true
|
1722 |
+
},
|
1723 |
+
"128215": {
|
1724 |
+
"content": "<|reserved_special_token_207|>",
|
1725 |
+
"lstrip": false,
|
1726 |
+
"normalized": false,
|
1727 |
+
"rstrip": false,
|
1728 |
+
"single_word": false,
|
1729 |
+
"special": true
|
1730 |
+
},
|
1731 |
+
"128216": {
|
1732 |
+
"content": "<|reserved_special_token_208|>",
|
1733 |
+
"lstrip": false,
|
1734 |
+
"normalized": false,
|
1735 |
+
"rstrip": false,
|
1736 |
+
"single_word": false,
|
1737 |
+
"special": true
|
1738 |
+
},
|
1739 |
+
"128217": {
|
1740 |
+
"content": "<|reserved_special_token_209|>",
|
1741 |
+
"lstrip": false,
|
1742 |
+
"normalized": false,
|
1743 |
+
"rstrip": false,
|
1744 |
+
"single_word": false,
|
1745 |
+
"special": true
|
1746 |
+
},
|
1747 |
+
"128218": {
|
1748 |
+
"content": "<|reserved_special_token_210|>",
|
1749 |
+
"lstrip": false,
|
1750 |
+
"normalized": false,
|
1751 |
+
"rstrip": false,
|
1752 |
+
"single_word": false,
|
1753 |
+
"special": true
|
1754 |
+
},
|
1755 |
+
"128219": {
|
1756 |
+
"content": "<|reserved_special_token_211|>",
|
1757 |
+
"lstrip": false,
|
1758 |
+
"normalized": false,
|
1759 |
+
"rstrip": false,
|
1760 |
+
"single_word": false,
|
1761 |
+
"special": true
|
1762 |
+
},
|
1763 |
+
"128220": {
|
1764 |
+
"content": "<|reserved_special_token_212|>",
|
1765 |
+
"lstrip": false,
|
1766 |
+
"normalized": false,
|
1767 |
+
"rstrip": false,
|
1768 |
+
"single_word": false,
|
1769 |
+
"special": true
|
1770 |
+
},
|
1771 |
+
"128221": {
|
1772 |
+
"content": "<|reserved_special_token_213|>",
|
1773 |
+
"lstrip": false,
|
1774 |
+
"normalized": false,
|
1775 |
+
"rstrip": false,
|
1776 |
+
"single_word": false,
|
1777 |
+
"special": true
|
1778 |
+
},
|
1779 |
+
"128222": {
|
1780 |
+
"content": "<|reserved_special_token_214|>",
|
1781 |
+
"lstrip": false,
|
1782 |
+
"normalized": false,
|
1783 |
+
"rstrip": false,
|
1784 |
+
"single_word": false,
|
1785 |
+
"special": true
|
1786 |
+
},
|
1787 |
+
"128223": {
|
1788 |
+
"content": "<|reserved_special_token_215|>",
|
1789 |
+
"lstrip": false,
|
1790 |
+
"normalized": false,
|
1791 |
+
"rstrip": false,
|
1792 |
+
"single_word": false,
|
1793 |
+
"special": true
|
1794 |
+
},
|
1795 |
+
"128224": {
|
1796 |
+
"content": "<|reserved_special_token_216|>",
|
1797 |
+
"lstrip": false,
|
1798 |
+
"normalized": false,
|
1799 |
+
"rstrip": false,
|
1800 |
+
"single_word": false,
|
1801 |
+
"special": true
|
1802 |
+
},
|
1803 |
+
"128225": {
|
1804 |
+
"content": "<|reserved_special_token_217|>",
|
1805 |
+
"lstrip": false,
|
1806 |
+
"normalized": false,
|
1807 |
+
"rstrip": false,
|
1808 |
+
"single_word": false,
|
1809 |
+
"special": true
|
1810 |
+
},
|
1811 |
+
"128226": {
|
1812 |
+
"content": "<|reserved_special_token_218|>",
|
1813 |
+
"lstrip": false,
|
1814 |
+
"normalized": false,
|
1815 |
+
"rstrip": false,
|
1816 |
+
"single_word": false,
|
1817 |
+
"special": true
|
1818 |
+
},
|
1819 |
+
"128227": {
|
1820 |
+
"content": "<|reserved_special_token_219|>",
|
1821 |
+
"lstrip": false,
|
1822 |
+
"normalized": false,
|
1823 |
+
"rstrip": false,
|
1824 |
+
"single_word": false,
|
1825 |
+
"special": true
|
1826 |
+
},
|
1827 |
+
"128228": {
|
1828 |
+
"content": "<|reserved_special_token_220|>",
|
1829 |
+
"lstrip": false,
|
1830 |
+
"normalized": false,
|
1831 |
+
"rstrip": false,
|
1832 |
+
"single_word": false,
|
1833 |
+
"special": true
|
1834 |
+
},
|
1835 |
+
"128229": {
|
1836 |
+
"content": "<|reserved_special_token_221|>",
|
1837 |
+
"lstrip": false,
|
1838 |
+
"normalized": false,
|
1839 |
+
"rstrip": false,
|
1840 |
+
"single_word": false,
|
1841 |
+
"special": true
|
1842 |
+
},
|
1843 |
+
"128230": {
|
1844 |
+
"content": "<|reserved_special_token_222|>",
|
1845 |
+
"lstrip": false,
|
1846 |
+
"normalized": false,
|
1847 |
+
"rstrip": false,
|
1848 |
+
"single_word": false,
|
1849 |
+
"special": true
|
1850 |
+
},
|
1851 |
+
"128231": {
|
1852 |
+
"content": "<|reserved_special_token_223|>",
|
1853 |
+
"lstrip": false,
|
1854 |
+
"normalized": false,
|
1855 |
+
"rstrip": false,
|
1856 |
+
"single_word": false,
|
1857 |
+
"special": true
|
1858 |
+
},
|
1859 |
+
"128232": {
|
1860 |
+
"content": "<|reserved_special_token_224|>",
|
1861 |
+
"lstrip": false,
|
1862 |
+
"normalized": false,
|
1863 |
+
"rstrip": false,
|
1864 |
+
"single_word": false,
|
1865 |
+
"special": true
|
1866 |
+
},
|
1867 |
+
"128233": {
|
1868 |
+
"content": "<|reserved_special_token_225|>",
|
1869 |
+
"lstrip": false,
|
1870 |
+
"normalized": false,
|
1871 |
+
"rstrip": false,
|
1872 |
+
"single_word": false,
|
1873 |
+
"special": true
|
1874 |
+
},
|
1875 |
+
"128234": {
|
1876 |
+
"content": "<|reserved_special_token_226|>",
|
1877 |
+
"lstrip": false,
|
1878 |
+
"normalized": false,
|
1879 |
+
"rstrip": false,
|
1880 |
+
"single_word": false,
|
1881 |
+
"special": true
|
1882 |
+
},
|
1883 |
+
"128235": {
|
1884 |
+
"content": "<|reserved_special_token_227|>",
|
1885 |
+
"lstrip": false,
|
1886 |
+
"normalized": false,
|
1887 |
+
"rstrip": false,
|
1888 |
+
"single_word": false,
|
1889 |
+
"special": true
|
1890 |
+
},
|
1891 |
+
"128236": {
|
1892 |
+
"content": "<|reserved_special_token_228|>",
|
1893 |
+
"lstrip": false,
|
1894 |
+
"normalized": false,
|
1895 |
+
"rstrip": false,
|
1896 |
+
"single_word": false,
|
1897 |
+
"special": true
|
1898 |
+
},
|
1899 |
+
"128237": {
|
1900 |
+
"content": "<|reserved_special_token_229|>",
|
1901 |
+
"lstrip": false,
|
1902 |
+
"normalized": false,
|
1903 |
+
"rstrip": false,
|
1904 |
+
"single_word": false,
|
1905 |
+
"special": true
|
1906 |
+
},
|
1907 |
+
"128238": {
|
1908 |
+
"content": "<|reserved_special_token_230|>",
|
1909 |
+
"lstrip": false,
|
1910 |
+
"normalized": false,
|
1911 |
+
"rstrip": false,
|
1912 |
+
"single_word": false,
|
1913 |
+
"special": true
|
1914 |
+
},
|
1915 |
+
"128239": {
|
1916 |
+
"content": "<|reserved_special_token_231|>",
|
1917 |
+
"lstrip": false,
|
1918 |
+
"normalized": false,
|
1919 |
+
"rstrip": false,
|
1920 |
+
"single_word": false,
|
1921 |
+
"special": true
|
1922 |
+
},
|
1923 |
+
"128240": {
|
1924 |
+
"content": "<|reserved_special_token_232|>",
|
1925 |
+
"lstrip": false,
|
1926 |
+
"normalized": false,
|
1927 |
+
"rstrip": false,
|
1928 |
+
"single_word": false,
|
1929 |
+
"special": true
|
1930 |
+
},
|
1931 |
+
"128241": {
|
1932 |
+
"content": "<|reserved_special_token_233|>",
|
1933 |
+
"lstrip": false,
|
1934 |
+
"normalized": false,
|
1935 |
+
"rstrip": false,
|
1936 |
+
"single_word": false,
|
1937 |
+
"special": true
|
1938 |
+
},
|
1939 |
+
"128242": {
|
1940 |
+
"content": "<|reserved_special_token_234|>",
|
1941 |
+
"lstrip": false,
|
1942 |
+
"normalized": false,
|
1943 |
+
"rstrip": false,
|
1944 |
+
"single_word": false,
|
1945 |
+
"special": true
|
1946 |
+
},
|
1947 |
+
"128243": {
|
1948 |
+
"content": "<|reserved_special_token_235|>",
|
1949 |
+
"lstrip": false,
|
1950 |
+
"normalized": false,
|
1951 |
+
"rstrip": false,
|
1952 |
+
"single_word": false,
|
1953 |
+
"special": true
|
1954 |
+
},
|
1955 |
+
"128244": {
|
1956 |
+
"content": "<|reserved_special_token_236|>",
|
1957 |
+
"lstrip": false,
|
1958 |
+
"normalized": false,
|
1959 |
+
"rstrip": false,
|
1960 |
+
"single_word": false,
|
1961 |
+
"special": true
|
1962 |
+
},
|
1963 |
+
"128245": {
|
1964 |
+
"content": "<|reserved_special_token_237|>",
|
1965 |
+
"lstrip": false,
|
1966 |
+
"normalized": false,
|
1967 |
+
"rstrip": false,
|
1968 |
+
"single_word": false,
|
1969 |
+
"special": true
|
1970 |
+
},
|
1971 |
+
"128246": {
|
1972 |
+
"content": "<|reserved_special_token_238|>",
|
1973 |
+
"lstrip": false,
|
1974 |
+
"normalized": false,
|
1975 |
+
"rstrip": false,
|
1976 |
+
"single_word": false,
|
1977 |
+
"special": true
|
1978 |
+
},
|
1979 |
+
"128247": {
|
1980 |
+
"content": "<|reserved_special_token_239|>",
|
1981 |
+
"lstrip": false,
|
1982 |
+
"normalized": false,
|
1983 |
+
"rstrip": false,
|
1984 |
+
"single_word": false,
|
1985 |
+
"special": true
|
1986 |
+
},
|
1987 |
+
"128248": {
|
1988 |
+
"content": "<|reserved_special_token_240|>",
|
1989 |
+
"lstrip": false,
|
1990 |
+
"normalized": false,
|
1991 |
+
"rstrip": false,
|
1992 |
+
"single_word": false,
|
1993 |
+
"special": true
|
1994 |
+
},
|
1995 |
+
"128249": {
|
1996 |
+
"content": "<|reserved_special_token_241|>",
|
1997 |
+
"lstrip": false,
|
1998 |
+
"normalized": false,
|
1999 |
+
"rstrip": false,
|
2000 |
+
"single_word": false,
|
2001 |
+
"special": true
|
2002 |
+
},
|
2003 |
+
"128250": {
|
2004 |
+
"content": "<|reserved_special_token_242|>",
|
2005 |
+
"lstrip": false,
|
2006 |
+
"normalized": false,
|
2007 |
+
"rstrip": false,
|
2008 |
+
"single_word": false,
|
2009 |
+
"special": true
|
2010 |
+
},
|
2011 |
+
"128251": {
|
2012 |
+
"content": "<|reserved_special_token_243|>",
|
2013 |
+
"lstrip": false,
|
2014 |
+
"normalized": false,
|
2015 |
+
"rstrip": false,
|
2016 |
+
"single_word": false,
|
2017 |
+
"special": true
|
2018 |
+
},
|
2019 |
+
"128252": {
|
2020 |
+
"content": "<|reserved_special_token_244|>",
|
2021 |
+
"lstrip": false,
|
2022 |
+
"normalized": false,
|
2023 |
+
"rstrip": false,
|
2024 |
+
"single_word": false,
|
2025 |
+
"special": true
|
2026 |
+
},
|
2027 |
+
"128253": {
|
2028 |
+
"content": "<|reserved_special_token_245|>",
|
2029 |
+
"lstrip": false,
|
2030 |
+
"normalized": false,
|
2031 |
+
"rstrip": false,
|
2032 |
+
"single_word": false,
|
2033 |
+
"special": true
|
2034 |
+
},
|
2035 |
+
"128254": {
|
2036 |
+
"content": "<|reserved_special_token_246|>",
|
2037 |
+
"lstrip": false,
|
2038 |
+
"normalized": false,
|
2039 |
+
"rstrip": false,
|
2040 |
+
"single_word": false,
|
2041 |
+
"special": true
|
2042 |
+
},
|
2043 |
+
"128255": {
|
2044 |
+
"content": "<|reserved_special_token_247|>",
|
2045 |
+
"lstrip": false,
|
2046 |
+
"normalized": false,
|
2047 |
+
"rstrip": false,
|
2048 |
+
"single_word": false,
|
2049 |
+
"special": true
|
2050 |
+
}
|
2051 |
+
},
|
2052 |
+
"bos_token": "<|begin_of_text|>",
|
2053 |
+
"chat_template": "{% set bos = \"<|begin_of_text|>\" %}{%- set enable_thinking = true -%}{% set system_start_header = \"<|start_header_id|>\" %}{% set system_end_header = \"<|end_header_id|>\n\n\" %}{% set start_header = \"<|start_header_id|>\" %}{% set end_header = \"<|end_header_id|>\n\n\" %}{% set eot = \"<|eot_id|>\" %}{% set system_token = \"system\" %}{% set user_token = \"user\" %}{% set assistant_token = \"assistant\" %}{% set tool_token = \"tool\" %}{{- bos ~ system_start_header ~ system_token ~ system_end_header -}}{%- if messages[0].role == 'system' and messages[0].content != '' -%}{%- set system_content = messages[0].content -%}{%- if '/no_think' in system_content -%}{%- set system_content = system_content.replace('/no_think', '')|trim -%}{%- set enable_thinking = false -%}{%- elif '/think' in system_content -%}{%- set system_content = system_content.replace('/think', '')|trim -%}{%- set enable_thinking = true -%}{%- endif -%}{{- system_content + '\n\n' -}}{%- endif -%}{%- if tools -%}{{- 'You can use the following tools to assist the user if required:\n<AVAILABLE_TOOLS>[' -}}{%- for tool in tools -%}{{- (tool.function if tool.function is defined else tool) | tojson -}}{{- ', ' if not loop.last else '' -}}{%- endfor -%}{{- ']</AVAILABLE_TOOLS>\n\nIf you decide to call any tool(s), use the following format:\n<TOOLCALL>[{{\"name\": \"tool_name1\", \"arguments\": \"tool_args1\"}}, {{\"name\": \"tool_name2\", \"arguments\": \"tool_args2\"}}]</TOOLCALL>\n\nResponse from tool(s) will be returned in this format:\n<TOOL_RESPONSE>[{{\"response\": \"tool_response1\"}}, {{\"response\": \"tool_response2\"}}]</TOOL_RESPONSE>\n\nBased on the results returned by the tool(s), you can call additional tools if needed, correct tool calls if any errors are found, or just respond with the answer to the user.' 
-}}{%- endif -%}{{- eot -}}{%- for message in messages -%}{%- if message.role == user_token -%}{{- start_header ~ user_token ~ end_header -}}{{ message.content -}}{{ eot -}}{%- elif message.role == assistant_token -%}{%- if '</think>' in message.content -%}{%- set content = message.content.split('</think>')[-1].lstrip() -%}{%- else -%}{%- set content = message.content -%}{%- endif -%}{{- start_header ~ assistant_token ~ end_header -}}{{ content -}}{%- if message.tool_calls -%}{{- '<TOOLCALL>[' -}}{%- for call in message.tool_calls -%}{%- set fn = call.function if call.function is defined else call -%}{{- '{\"name\": \"' + fn.name + '\", \"arguments\": ' -}}{%- if fn.arguments is string -%}{{- fn.arguments -}}{%- else -%}{{- fn.arguments | tojson -}}{%- endif -%}{{- '}' + (', ' if not loop.last else '') -}}{%- endfor -%}{{- ']</TOOLCALL>' -}}{%- endif -%}{{- eot -}}{%- elif message.role == tool_token -%}{%- if loop.first or (messages[loop.index0 - 1].role != tool_token) -%}{{- start_header ~ tool_token ~ end_header -}}{{ '<TOOL_RESPONSE>[' -}}{%- endif -%}{{- message.content -}}{{- ', ' if not loop.last and (messages[loop.index0 + 1].role == tool_token) else '' -}}{%- if loop.last or (messages[loop.index0 + 1].role != tool_token) -%}{{- ']</TOOL_RESPONSE>' -}}{{ eot -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{- start_header ~ assistant_token ~ end_header -}}{%- if not enable_thinking -%}{{- '<think>\n\n</think>\n\n' -}}{%- endif -%}{%- endif -%}",
|
2054 |
+
"clean_up_tokenization_spaces": true,
|
2055 |
+
"eos_token": "<|eot_id|>",
|
2056 |
+
"extra_special_tokens": {},
|
2057 |
+
"model_input_names": [
|
2058 |
+
"input_ids",
|
2059 |
+
"attention_mask"
|
2060 |
+
],
|
2061 |
+
"model_max_length": 131072,
|
2062 |
+
"pad_token": "<|eot_id|>",
|
2063 |
+
"tokenizer_class": "PreTrainedTokenizerFast"
|
2064 |
+
}
|
2065 |
+
|
transformers_4_44_2__configuration_llama.py
ADDED
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
5 |
+
# and OPT implementations in this library. It has been modified from its
|
6 |
+
# original forms to accommodate minor architectural differences compared
|
7 |
+
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
8 |
+
#
|
9 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
10 |
+
# you may not use this file except in compliance with the License.
|
11 |
+
# You may obtain a copy of the License at
|
12 |
+
#
|
13 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
14 |
+
#
|
15 |
+
# Unless required by applicable law or agreed to in writing, software
|
16 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
17 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
18 |
+
# See the License for the specific language governing permissions and
|
19 |
+
# limitations under the License.
|
20 |
+
"""LLaMA model configuration"""
|
21 |
+
|
22 |
+
from transformers.configuration_utils import PretrainedConfig
|
23 |
+
from .transformers_4_44_2__modeling_rope_utils import rope_config_validation
|
24 |
+
|
25 |
+
|
26 |
+
class LlamaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the LLaMA-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`LlamaModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details check out [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
            Llama 2 up to 4096, CodeLlama up to 16384.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
            understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
            results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.

    ```python
    >>> from transformers import LlamaModel, LlamaConfig

    >>> # Initializing a LLaMA llama-7b style configuration
    >>> configuration = LlamaConfig()

    >>> # Initializing a model from the llama-7b style configuration
    >>> model = LlamaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "llama"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias

        # Validate the correctness of rotary position embeddings parameters
        # Backward compatibility: if there is a legacy 'type' field, mirror it to 'rope_type'.
        # A new dict is built instead of writing into `rope_scaling`, so the dictionary object
        # passed in by the caller is left unmodified.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling = {**self.rope_scaling, "rope_type": self.rope_scaling["type"]}
        rope_config_validation(self)

        # The token ids and weight tying flag are handled by the base class.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
transformers_4_44_2__modeling_rope_utils.py
ADDED
@@ -0,0 +1,559 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import math
|
16 |
+
from typing import Optional, Tuple
|
17 |
+
|
18 |
+
from transformers.configuration_utils import PretrainedConfig
|
19 |
+
from transformers.utils import is_torch_available, logging
|
20 |
+
|
21 |
+
|
22 |
+
logger = logging.get_logger(__name__)
|
23 |
+
|
24 |
+
|
25 |
+
if is_torch_available():
|
26 |
+
import torch
|
27 |
+
|
28 |
+
|
29 |
+
def _compute_default_rope_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies according to the original RoPE implementation.

    Exactly one of `config` and `**rope_kwargs` must be supplied.

    Args:
        config ([`~transformers.PretrainedConfig`], *optional*):
            The model configuration. `rope_theta`, `hidden_size` and `num_attention_heads` are read from it, as
            well as `head_dim` and `partial_rotary_factor` when those attributes exist.
        device (`torch.device`, *optional*):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            Backward compatibility with the previous RoPE class instantiation (the keys `base` and `dim` are
            read), will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    Raises:
        ValueError: If both `config` and `rope_kwargs` are given, or if neither is given.
    """
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
    elif config is not None:
        base = config.rope_theta
        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        dim = int(head_dim * partial_rotary_factor)
    else:
        # Without this branch `base`/`dim` would be unbound and the function would fail
        # further down with an opaque UnboundLocalError.
        raise ValueError(
            "Missing arguments: `_compute_default_rope_parameters` requires either `config` or "
            "`**rope_kwargs` (`base` and `dim`) to be set."
        )

    attention_factor = 1.0  # Unused in this type of RoPE

    # Compute the inverse frequencies: one value per pair of rotary dimensions (indices 0, 2, 4, ...)
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
    return inv_freq, attention_factor
|
69 |
+
|
70 |
+
|
71 |
+
def _compute_linear_scaling_rope_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev

    Exactly one of `config` and `**rope_kwargs` must be supplied.

    Args:
        config ([`~transformers.PretrainedConfig`], *optional*):
            The model configuration. The scaling factor is read from `config.rope_scaling["factor"]`.
        device (`torch.device`, *optional*):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            Backward compatibility with the previous RoPE class instantiation (the key `factor` is read here, the
            remaining keys are forwarded to `_compute_default_rope_parameters`), will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    Raises:
        ValueError: If both `config` and `rope_kwargs` are given, or if neither is given.
    """
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        factor = rope_kwargs["factor"]
    elif config is not None:
        factor = config.rope_scaling["factor"]
    else:
        # Fail early with a clear message rather than letting `factor` stay unbound.
        raise ValueError(
            "Missing arguments: `_compute_linear_scaling_rope_parameters` requires either `config` or "
            "`**rope_kwargs` (`base`, `dim` and `factor`) to be set."
        )

    # Gets the default RoPE parameters
    inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)

    # Then applies linear scaling to the frequencies.
    # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so
    # applying scaling to the inverse frequencies is equivalent.
    inv_freq /= factor
    return inv_freq, attention_factor
|
110 |
+
|
111 |
+
|
112 |
+
def _compute_dynamic_ntk_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

    Exactly one of `config` and `**rope_kwargs` must be supplied.

    Args:
        config ([`~transformers.PretrainedConfig`], *optional*):
            The model configuration. `rope_theta`, `hidden_size`, `num_attention_heads`,
            `max_position_embeddings` and `rope_scaling["factor"]` are read from it, as well as `head_dim` and
            `partial_rotary_factor` when those attributes exist.
        device (`torch.device`, *optional*):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time. Values that are
            `None` or not larger than `max_position_embeddings` are replaced by `max_position_embeddings`.
        rope_kwargs (`Dict`, *optional*):
            Backward compatibility with the previous RoPE class instantiation (the keys `base`, `dim`,
            `max_position_embeddings` and `factor` are read), will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    Raises:
        ValueError: If both `config` and `rope_kwargs` are given, or if neither is given.
    """
    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
        max_position_embeddings = rope_kwargs["max_position_embeddings"]
        factor = rope_kwargs["factor"]
    elif config is not None:
        base = config.rope_theta
        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        dim = int(head_dim * partial_rotary_factor)
        max_position_embeddings = config.max_position_embeddings
        factor = config.rope_scaling["factor"]
    else:
        # Without this branch the locals below would be unbound and the function would fail
        # with an opaque UnboundLocalError.
        raise ValueError(
            "Missing arguments: `_compute_dynamic_ntk_parameters` requires either `config` or "
            "`**rope_kwargs` (`base`, `dim`, `max_position_embeddings` and `factor`) to be set."
        )

    attention_factor = 1.0  # Unused in this type of RoPE

    # seq_len: default to max_position_embeddings, e.g. at init time
    seq_len = seq_len if seq_len is not None and seq_len > max_position_embeddings else max_position_embeddings

    # Compute the inverse frequencies. The base is rescaled by the ratio of the current sequence length
    # to the maximum length; when seq_len == max_position_embeddings the multiplier is exactly 1.
    base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
    return inv_freq, attention_factor
|
161 |
+
|
162 |
+
|
163 |
+
def _compute_yarn_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    """
    Build the YaRN-scaled inverse frequencies for RoPE. Please refer to the
    [original paper](https://arxiv.org/abs/2309.00071)

    Every rotary frequency is a blend of its unscaled value ("extrapolation") and that value divided by `factor`
    ("interpolation"). The blend weight is a linear ramp over the frequency index whose end points are derived from
    `beta_fast` and `beta_slow`.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. `rope_scaling` must contain `factor`; `attention_factor`, `beta_fast` and
            `beta_slow` are optional.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            Must be empty: YaRN has no legacy keyword-argument path.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    Raises:
        ValueError: if any `rope_kwargs` are supplied.
    """
    # No need to keep BC with yarn, unreleased when this new pattern was created.
    if rope_kwargs:
        raise ValueError(
            f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}"
        )

    base = config.rope_theta
    rotary_fraction = getattr(config, "partial_rotary_factor", 1.0)
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * rotary_fraction)
    max_position_embeddings = config.max_position_embeddings
    scaling = config.rope_scaling
    factor = scaling["factor"]

    # Attention factor: explicit value if given, otherwise the default suggested in the paper
    attention_factor = scaling.get("attention_factor")
    if attention_factor is None:
        attention_factor = 0.1 * math.log(factor) + 1.0

    # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
    beta_fast = scaling.get("beta_fast") or 32
    beta_slow = scaling.get("beta_slow") or 1

    def correction_dim(num_rotations):
        """Inverse dimension formula to find the dimension based on the number of rotations"""
        return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))

    # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
    # to expand the possible context length. In other words, interpolation = apply scaling factor.
    pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim)
    inv_freq_extrapolation = 1.0 / pos_freqs
    inv_freq_interpolation = 1.0 / (factor * pos_freqs)

    # Ramp end points, clipped to the valid index range
    ramp_start = max(math.floor(correction_dim(beta_fast)), 0)
    ramp_end = min(math.ceil(correction_dim(beta_slow)), dim - 1)
    if ramp_start == ramp_end:
        ramp_end += 0.001  # Prevent singularity

    # Linear ramp over the `dim // 2` frequency indices, clamped to [0, 1]
    ramp = torch.clamp(
        (torch.arange(dim // 2, dtype=torch.float32) - ramp_start) / (ramp_end - ramp_start), 0, 1
    )

    # Get n-dimensional rotational scaling corrected for extrapolation
    extrapolation_weight = 1 - ramp.float().to(device)
    inv_freq = (
        inv_freq_interpolation * (1 - extrapolation_weight)
        + inv_freq_extrapolation * extrapolation_weight
    )

    return inv_freq, attention_factor
|
240 |
+
|
241 |
+
|
242 |
+
def _compute_longrope_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    """
    Build the LongRoPE-scaled inverse frequencies for RoPE. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)

    The unscaled inverse frequencies are divided element-wise by either `long_factor` or `short_factor` from
    `rope_scaling`; `long_factor` is used when the expanded maximum length exceeds the base maximum length.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. `rope_scaling` must contain `long_factor` and `short_factor`; `factor` and
            `attention_factor` are optional.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            Must be empty: LongRoPE has no legacy keyword-argument path.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    Raises:
        ValueError: if any `rope_kwargs` are supplied.
    """
    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
    # No need to keep BC with longrope, unreleased when this new pattern was created.
    if rope_kwargs:
        raise ValueError(
            f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got {rope_kwargs}"
        )

    base = config.rope_theta
    rotary_fraction = getattr(config, "partial_rotary_factor", 1.0)
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * rotary_fraction)

    scaling = config.rope_scaling
    long_factor = scaling["long_factor"]
    short_factor = scaling["short_factor"]
    factor = scaling.get("factor")
    attention_factor = scaling.get("attention_factor")

    # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
    # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
    # values to compute the default attention scaling factor, instead of using `factor`.
    if hasattr(config, "original_max_position_embeddings"):
        max_position_embeddings = config.original_max_position_embeddings
        expanded_max_position_embeddings = config.max_position_embeddings
        factor = expanded_max_position_embeddings / max_position_embeddings
    else:
        max_position_embeddings = config.max_position_embeddings
        expanded_max_position_embeddings = max_position_embeddings * factor

    # Default attention factor, as suggested in the paper, when none is configured
    if attention_factor is None:
        attention_factor = (
            1.0 if factor <= 1.0 else math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings))
        )

    # Compute the inverse frequencies -- scaled based on the target sequence length
    chosen_factors = long_factor if expanded_max_position_embeddings > max_position_embeddings else short_factor
    ext_factors = torch.tensor(chosen_factors, dtype=torch.float32, device=device)
    exponents = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
    inv_freq = 1.0 / (ext_factors * base**exponents)

    return inv_freq, attention_factor
|
305 |
+
|
306 |
+
|
307 |
+
def _compute_llama3_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    """
    Build the inverse frequencies for llama 3.1.

    Starts from the default RoPE inverse frequencies and rescales them per wavelength band: short wavelengths are
    left untouched, long wavelengths are divided by `factor`, and the band in between is a smooth blend of the two.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. `rope_scaling` must contain `factor`, `low_freq_factor`, `high_freq_factor`
            and `original_max_position_embeddings`.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Only forwarded to the default RoPE computation.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. Only forwarded
            to the default RoPE computation.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    # Gets the default RoPE parameters
    inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)

    scaling = config.rope_scaling
    factor = scaling["factor"]  # `8` in the original implementation
    low_freq_factor = scaling["low_freq_factor"]  # `1` in the original implementation
    high_freq_factor = scaling["high_freq_factor"]  # `4` in the original implementation
    old_context_len = scaling["original_max_position_embeddings"]  # `8192` in the original implementation

    # Wavelength thresholds that delimit the three bands
    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor
    wavelen = 2 * math.pi / inv_freq

    # wavelen < high_freq_wavelen: do nothing
    # wavelen > low_freq_wavelen: divide by factor
    is_long_wavelen = wavelen > low_freq_wavelen
    inv_freq_llama = torch.where(is_long_wavelen, inv_freq / factor, inv_freq)

    # otherwise: interpolate between the two, using a smooth factor
    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
    is_medium_freq = ~(wavelen < high_freq_wavelen) & ~is_long_wavelen

    return torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama), attention_factor
|
348 |
+
|
349 |
+
|
350 |
+
# Registry keyed by the "rope_type" string field of the rope config. Each value is the function that computes the RoPE
# parameters from the model config for that type. You can append new {'rope_type': callable} pairs to this dictionary
# to enable custom RoPE parameterizations, as long as the callable has the same signature as the entries below.
ROPE_INIT_FUNCTIONS = {
    "default": _compute_default_rope_parameters,
    "linear": _compute_linear_scaling_rope_parameters,
    "dynamic": _compute_dynamic_ntk_parameters,
    "yarn": _compute_yarn_parameters,
    "longrope": _compute_longrope_parameters,
    "llama3": _compute_llama3_parameters,
}
|
361 |
+
|
362 |
+
|
363 |
+
def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, optional_keys: Optional[set] = None):
|
364 |
+
"""Compare the received keys in `config.rope_scaling` against the expected and optional keys"""
|
365 |
+
# BC: "rope_type" was originally "type" -- let's gracefully handle it
|
366 |
+
if "rope_type" not in received_keys and "type" in received_keys:
|
367 |
+
received_keys -= {"type"}
|
368 |
+
received_keys.add("rope_type")
|
369 |
+
|
370 |
+
missing_keys = required_keys - received_keys
|
371 |
+
if missing_keys:
|
372 |
+
raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}")
|
373 |
+
|
374 |
+
if optional_keys is not None:
|
375 |
+
unused_keys = received_keys - required_keys - optional_keys
|
376 |
+
else:
|
377 |
+
unused_keys = received_keys - required_keys
|
378 |
+
if unused_keys:
|
379 |
+
logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}")
|
380 |
+
|
381 |
+
|
382 |
+
def _validate_default_rope_parameters(config: PretrainedConfig):
    """Check the keys of `config.rope_scaling` for the default RoPE type: only `rope_type` is expected."""
    scaling = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = scaling.get("rope_type", scaling.get("type", None))
    _check_received_keys(rope_type, set(scaling.keys()), {"rope_type"})
|
388 |
+
|
389 |
+
|
390 |
+
def _validate_linear_scaling_rope_parameters(config: PretrainedConfig):
    """
    Check `config.rope_scaling` for linear scaling: `rope_type` and `factor` are required, and a warning is logged
    when `factor` is not a float >= 1.
    """
    scaling = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = scaling.get("rope_type", scaling.get("type", None))
    _check_received_keys(rope_type, set(scaling.keys()), {"rope_type", "factor"})

    factor = scaling["factor"]
    factor_is_valid = factor is not None and isinstance(factor, float) and not factor < 1.0
    if not factor_is_valid:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
|
400 |
+
|
401 |
+
|
402 |
+
def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig):
    """
    Check `config.rope_scaling` for dynamic scaling: `rope_type` and `factor` are required,
    `original_max_position_embeddings` is accepted, and a warning is logged when `factor` is not a float >= 1.
    """
    scaling = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = scaling.get("rope_type", scaling.get("type", None))
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    _check_received_keys(
        rope_type,
        set(scaling.keys()),
        {"rope_type", "factor"},
        {"original_max_position_embeddings"},
    )

    factor = scaling["factor"]
    factor_is_valid = factor is not None and isinstance(factor, float) and not factor < 1.0
    if not factor_is_valid:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
|
414 |
+
|
415 |
+
|
416 |
+
def _validate_yarn_parameters(config: PretrainedConfig):
    """
    Check `config.rope_scaling` for YaRN: `rope_type` and `factor` are required; `attention_factor`, `beta_fast`
    and `beta_slow` are optional. Problems with the values are logged as warnings.
    """
    scaling = config.rope_scaling
    # BC: "rope_type" was originally "type"
    rope_type = scaling.get("rope_type", scaling.get("type", None))
    _check_received_keys(
        rope_type,
        set(scaling.keys()),
        {"rope_type", "factor"},
        {"attention_factor", "beta_fast", "beta_slow"},
    )

    factor = scaling["factor"]
    factor_is_valid = factor is not None and isinstance(factor, float) and not factor < 1.0
    if not factor_is_valid:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

    attention_factor = scaling.get("attention_factor")
    if attention_factor is not None:
        if not isinstance(attention_factor, float) or attention_factor < 0:
            logger.warning(
                f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
            )

    # Both betas share the same type rule, so they are checked in one pass (fast first, then slow)
    betas = {name: scaling.get(name) for name in ("beta_fast", "beta_slow")}
    for name, value in betas.items():
        if value is not None and not isinstance(value, float):
            logger.warning(f"`rope_scaling`'s {name} field must be a float, got {value}")
    beta_fast = betas["beta_fast"]
    beta_slow = betas["beta_slow"]

    # Unset (or falsy) betas fall back to 32 and 1 before the ordering check
    if (beta_fast or 32) < (beta_slow or 1):
        logger.warning(
            f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} "
            f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)"
        )
|
445 |
+
|
446 |
+
|
447 |
+
def _validate_longrope_parameters(config: PretrainedConfig):
    """
    Check `config.rope_scaling` for LongRoPE.

    `rope_type`, `short_factor` and `long_factor` are required; `attention_factor`, `factor` and
    `original_max_position_embeddings` are optional. `short_factor` and `long_factor` are expected to be lists of
    numbers with one entry per rotary frequency (`dim // 2`). Problems with the values are logged as warnings;
    only missing required keys raise (through `_check_received_keys`).
    """
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "short_factor", "long_factor"}
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, optional_keys)

    partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * partial_rotary_factor)

    for field_name in ("short_factor", "long_factor"):
        values = rope_scaling.get(field_name)
        # The value must be a list AND every element must be a number. Checking the container type first also
        # keeps `all(...)` from iterating over a non-iterable value.
        is_number_list = isinstance(values, list) and all(isinstance(x, (int, float)) for x in values)
        if not is_number_list:
            logger.warning(f"`rope_scaling`'s {field_name} field must be a list of numbers, got {values}")
        # The length is only meaningful (and only safe to take) for an actual list
        if isinstance(values, list) and len(values) != dim // 2:
            logger.warning(f"`rope_scaling`'s {field_name} field must have length {dim // 2}, got {len(values)}")

    # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over
    # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is
    # unique to longrope (= undesirable)
    if hasattr(config, "original_max_position_embeddings"):
        logger.warning_once(
            "This model has set a `original_max_position_embeddings` field, to be used together with "
            "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling` "
            "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, "
            "as it is compatible with most model architectures."
        )
    else:
        factor = rope_scaling.get("factor")
        if factor is None:
            logger.warning("Missing required keys in `rope_scaling`: 'factor'")
        elif not isinstance(factor, float) or factor < 1.0:
            logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

        # NOTE(review): the flattened source does not show whether this check belongs inside the `else`; it is
        # kept here so the `original_max_position_embeddings` path only emits its single notice -- confirm.
        attention_factor = rope_scaling.get("attention_factor")
        # Parenthesised so that a missing `attention_factor` (None) is skipped instead of being compared with 0
        if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0):
            logger.warning(
                f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
            )
|
494 |
+
|
495 |
+
|
496 |
+
def _validate_llama3_parameters(config: PretrainedConfig):
    """
    Check `config.rope_scaling` for the llama3 RoPE type.

    `rope_type`, `factor`, `original_max_position_embeddings`, `low_freq_factor` and `high_freq_factor` are all
    required. Problems with the values are logged as warnings; only missing required keys raise (through
    `_check_received_keys`). Ordering checks between two values are skipped when either value is not a number,
    since that case has already been reported by the type checks.
    """
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys)

    def is_number(value):
        """Whether `value` can take part in the ordering comparisons below."""
        return isinstance(value, (int, float))

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

    low_freq_factor = rope_scaling["low_freq_factor"]
    high_freq_factor = rope_scaling["high_freq_factor"]
    if low_freq_factor is None or not isinstance(low_freq_factor, float):
        logger.warning(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}")
    if high_freq_factor is None or not isinstance(high_freq_factor, float):
        logger.warning(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}")
    # Comparing a None/str value would raise TypeError, turning a warning-only validator into a crash
    if is_number(high_freq_factor) and is_number(low_freq_factor) and high_freq_factor <= low_freq_factor:
        logger.warning(
            "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor="
            f"{high_freq_factor} and low_freq_factor={low_freq_factor}"
        )

    original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]
    if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int):
        logger.warning(
            "`rope_scaling`'s original_max_position_embeddings field must be an integer, got "
            f"{original_max_position_embeddings}"
        )
    # Same guard as above: only compare when both sides are numbers
    if (
        is_number(original_max_position_embeddings)
        and is_number(config.max_position_embeddings)
        and original_max_position_embeddings >= config.max_position_embeddings
    ):
        logger.warning(
            "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got "
            f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}"
        )
|
530 |
+
|
531 |
+
|
532 |
+
# Registry keyed by RoPE type; each value is the function that validates `config.rope_scaling` for that type.
# Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types.
ROPE_VALIDATION_FUNCTIONS = {
    "default": _validate_default_rope_parameters,
    "linear": _validate_linear_scaling_rope_parameters,
    "dynamic": _validate_dynamic_scaling_rope_parameters,
    "yarn": _validate_yarn_parameters,
    "longrope": _validate_longrope_parameters,
    "llama3": _validate_llama3_parameters,
}
|
541 |
+
|
542 |
+
|
543 |
+
def rope_config_validation(config: PretrainedConfig):
    """
    Validate the RoPE config arguments, given a `PretrainedConfig` object.

    Does nothing when the config has no `rope_scaling` (or it is `None`). Otherwise the validator registered in
    `ROPE_VALIDATION_FUNCTIONS` for the config's RoPE type is run; a warning is logged when no validator is
    registered for that type.
    """
    # `rope_scaling` is not a default parameter in `PretrainedConfig`, so it may be absent altogether
    rope_scaling = getattr(config, "rope_scaling", None)
    if rope_scaling is None:
        return

    # BC: "rope_type" was originally "type"
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default"))
    validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type)
    if validation_fn is None:
        logger.warning(
            f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'"
        )
        return

    validation_fn(config)
|