Update configuration_quasrav4.py
configuration_quasrav4.py (CHANGED)
@@ -8,7 +8,6 @@ class InfinityFormerConfig(PretrainedConfig):
     model_type = "infinity_former"
 
     def __init__(self, **kwargs):
-        # Pop custom arguments from kwargs, using defaults from your config.json
         self.vocab_size = kwargs.pop("vocab_size", 151669)
         self.hidden_size = kwargs.pop("hidden_size", 768)
         self.num_hidden_layers = kwargs.pop("num_hidden_layers", 54)
@@ -16,7 +15,7 @@ class InfinityFormerConfig(PretrainedConfig):
         self.intermediate_size = kwargs.pop("intermediate_size", 3072)
         self.hidden_dropout_prob = kwargs.pop("hidden_dropout_prob", 0.1)
         self.attention_probs_dropout_prob = kwargs.pop("attention_probs_dropout_prob", 0.1)
-        self.max_position_embeddings = kwargs.pop("max_position_embeddings",
+        self.max_position_embeddings = kwargs.pop("max_position_embeddings", 8192)
         self.initializer_range = kwargs.pop("initializer_range", 0.02)
         self.layer_norm_eps = kwargs.pop("layer_norm_eps", 1e-5)
         self.use_rotary_embeddings = kwargs.pop("use_rotary_embeddings", True)
@@ -32,17 +31,10 @@ class InfinityFormerConfig(PretrainedConfig):
         self.use_memory_attention = kwargs.pop("use_memory_attention", False)
         self.use_gradient_checkpointing = kwargs.pop("use_gradient_checkpointing", False)
 
-        # The `use_return_dict` is a read-only property that depends on `return_dict`.
-        # We must pop it from kwargs before calling super().__init__ to avoid an error.
         use_return_dict = kwargs.pop("use_return_dict", True)
-
-        # Pass the rest of the arguments to the parent class.
         super().__init__(**kwargs)
-
-        # Now, set the underlying attribute that the `use_return_dict` property uses.
         self.return_dict = use_return_dict
 
-        # Validation logic
         if self.hidden_size % self.num_attention_heads != 0:
             raise ValueError(
                 f"`hidden_size` ({self.hidden_size}) must be a multiple of `num_attention_heads` "
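The comments removed in the last hunk explain the ordering around super().__init__: on recent transformers releases, PretrainedConfig.use_return_dict is a read-only property computed from return_dict, and the parent constructor setattr's any leftover kwargs, so an un-popped use_return_dict key would raise an AttributeError. A minimal sketch of that pattern, assuming only a standard transformers install (the SketchConfig name is illustrative, not part of this repo):

from transformers import PretrainedConfig

class SketchConfig(PretrainedConfig):
    model_type = "sketch"

    def __init__(self, **kwargs):
        # Pop the key first: `use_return_dict` is a property without a setter,
        # so letting it reach super().__init__ (which setattr's leftover kwargs)
        # would raise AttributeError.
        use_return_dict = kwargs.pop("use_return_dict", True)
        super().__init__(**kwargs)
        # Set the underlying attribute that the property reads.
        self.return_dict = use_return_dict

cfg = SketchConfig(use_return_dict=False)
print(cfg.use_return_dict)  # False: the property follows `return_dict`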
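And a quick instantiation sketch against the updated defaults, assuming configuration_quasrav4.py is on the Python path; num_attention_heads is passed explicitly here because its default is not visible in this diff:

from configuration_quasrav4 import InfinityFormerConfig

cfg = InfinityFormerConfig(num_attention_heads=12)
print(cfg.hidden_size)              # 768
print(cfg.max_position_embeddings)  # 8192, the new default from this commit
print(cfg.use_return_dict)          # True

# The validation at the end of __init__ rejects head counts that do not divide hidden_size:
try:
    InfinityFormerConfig(num_attention_heads=7)  # 768 % 7 != 0
except ValueError as err:
    print(err)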