Update configuration_rwkv6qwen2.py
configuration_rwkv6qwen2.py  +16 -0
@@ -137,12 +137,15 @@ class RWKV6Qwen2Config(PretrainedConfig):
         num_hidden_layers=32,
         num_attention_heads=32,
         num_key_value_heads=32,
+        lora_rank_tokenshift=None,
+        lora_rank_decay=None,
         hidden_act="silu",
         max_position_embeddings=32768,
         initializer_range=0.02,
         rms_norm_eps=1e-6,
         use_cache=True,
         tie_word_embeddings=False,
+        use_rope=False,
         rope_theta=10000.0,
         rope_scaling=None,
         use_sliding_window=False,
@@ -151,6 +154,11 @@ class RWKV6Qwen2Config(PretrainedConfig):
         attention_dropout=0.0,
         attention_bias=True,
         attention_output_bias=False,
+        gate_rank_type=1,
+        lora_rank_gate=None,
+        balance_state=True,
+        groupnorm_att=False,
+        use_tokenshift=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -168,10 +176,13 @@ class RWKV6Qwen2Config(PretrainedConfig):
             num_key_value_heads = num_attention_heads
 
         self.num_key_value_heads = num_key_value_heads
+        self.lora_rank_tokenshift = lora_rank_tokenshift
+        self.lora_rank_decay = lora_rank_decay
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
+        self.use_rope = use_rope
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
         self.attention_dropout = attention_dropout
@@ -183,6 +194,11 @@ class RWKV6Qwen2Config(PretrainedConfig):
 
         self.attention_bias = attention_bias
         self.attention_output_bias = attention_output_bias
+        self.gate_rank_type = gate_rank_type
+        self.lora_rank_gate = lora_rank_gate
+        self.balance_state = balance_state
+        self.groupnorm_att = groupnorm_att
+        self.use_tokenshift = use_tokenshift
 
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,