eyad-silx committed
Commit fdf47df (verified)
Parent(s): 69e76c8

Update configuration_quasrav4.py

Files changed (1):
  configuration_quasrav4.py (+26, -49)
configuration_quasrav4.py CHANGED
@@ -7,56 +7,32 @@ class QuasraV4Config(PretrainedConfig):
     """
     model_type = "quasarv4"
 
-    def __init__(
-        self,
-        vocab_size=151669,
-        hidden_size=768,
-        num_hidden_layers=12,
-        num_attention_heads=12,
-        intermediate_size=3072,
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        max_position_embeddings=2048,
-        initializer_range=0.02,
-        layer_norm_eps=1e-5,
-        use_rotary_embeddings=True,
-        rotary_embedding_base=10000,
-        use_multi_scale_memory=True,
-        num_memory_scales=3,
-        memory_compression_ratio=0.5,
-        memory_compression_frequency=100,
-        kernel_type='elu',
-        kernel_epsilon=0.1,
-        use_gating=True,
-        gate_init_bias=-2.0,
-        use_gradient_checkpointing=False,
-        # `**kwargs` will catch all standard Hugging Face parameters
-        **kwargs,
-    ):
-        # Set model-specific attributes
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
-        self.use_rotary_embeddings = use_rotary_embeddings
-        self.rotary_embedding_base = rotary_embedding_base
-        self.use_multi_scale_memory = use_multi_scale_memory
-        self.num_memory_scales = num_memory_scales
-        self.memory_compression_ratio = memory_compression_ratio
-        self.memory_compression_frequency = memory_compression_frequency
-        self.kernel_type = kernel_type
-        self.kernel_epsilon = kernel_epsilon
-        self.use_gating = use_gating
-        self.gate_init_bias = gate_init_bias
-        self.use_gradient_checkpointing = use_gradient_checkpointing
+    def __init__(self, **kwargs):
+        # Pop custom arguments from kwargs, using defaults from your config.json
+        self.vocab_size = kwargs.pop("vocab_size", 151669)
+        self.hidden_size = kwargs.pop("hidden_size", 768)
+        self.num_hidden_layers = kwargs.pop("num_hidden_layers", 54)
+        self.num_attention_heads = kwargs.pop("num_attention_heads", 12)
+        self.intermediate_size = kwargs.pop("intermediate_size", 3072)
+        self.hidden_dropout_prob = kwargs.pop("hidden_dropout_prob", 0.1)
+        self.attention_probs_dropout_prob = kwargs.pop("attention_probs_dropout_prob", 0.1)
+        self.max_position_embeddings = kwargs.pop("max_position_embeddings", 812)
+        self.initializer_range = kwargs.pop("initializer_range", 0.02)
+        self.layer_norm_eps = kwargs.pop("layer_norm_eps", 1e-5)
+        self.use_rotary_embeddings = kwargs.pop("use_rotary_embeddings", True)
+        self.rotary_embedding_base = kwargs.pop("rotary_embedding_base", 10000)
+        self.use_multi_scale_memory = kwargs.pop("use_multi_scale_memory", True)
+        self.num_memory_scales = kwargs.pop("num_memory_scales", 3)
+        self.memory_compression_ratio = kwargs.pop("memory_compression_ratio", 0.5)
+        self.memory_compression_frequency = kwargs.pop("memory_compression_frequency", 100)
+        self.kernel_type = kwargs.pop("kernel_type", 'elu')
+        self.kernel_epsilon = kwargs.pop("kernel_epsilon", 0.1)
+        self.use_gating = kwargs.pop("use_gating", True)
+        self.gate_init_bias = kwargs.pop("gate_init_bias", -2.0)
+        self.use_gradient_checkpointing = kwargs.pop("use_gradient_checkpointing", False)
 
-        # Pass all other arguments, including standard HF ones, to the parent class
+        # Pass the rest of the arguments to the parent class.
+        # This will include 'use_return_dict', 'tie_word_embeddings', 'architectures', etc.
         super().__init__(**kwargs)
 
         # Validation logic
@@ -68,3 +44,4 @@ class QuasraV4Config(PretrainedConfig):
         if self.kernel_type not in ['elu', 'relu', 'learnable']:
             raise ValueError(f"`kernel_type` must be one of 'elu', 'relu', or 'learnable', got {self.kernel_type}")
 
+
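The net effect of the commit: QuasraV4Config no longer declares its custom fields in the __init__ signature; it pops them out of **kwargs (with the defaults shown above) and forwards everything else to PretrainedConfig, so standard Hugging Face keys stored in config.json (architectures, tie_word_embeddings, and the like) reach the parent class instead of colliding with the subclass signature. Below is a minimal usage sketch of the updated class, not part of the commit: it assumes the module imports as configuration_quasrav4, and the directory name ./quasarv4-demo is hypothetical.

# Sketch: exercising the kwargs-popping config (illustrative, not from the repo).
from configuration_quasrav4 import QuasraV4Config

# No arguments: every custom field falls back to the defaults shown in the diff.
config = QuasraV4Config()
assert config.num_hidden_layers == 54 and config.kernel_type == "elu"

# Custom overrides are consumed by kwargs.pop(); standard HF keys such as
# tie_word_embeddings pass through untouched to PretrainedConfig.__init__.
config = QuasraV4Config(num_hidden_layers=24, tie_word_embeddings=False)
assert config.num_hidden_layers == 24

# Round-trip through config.json, the same path from_pretrained() takes:
# from_dict() calls cls(**config_dict), which the pop pattern handles cleanly.
config.save_pretrained("./quasarv4-demo")  # hypothetical directory
reloaded = QuasraV4Config.from_pretrained("./quasarv4-demo")
assert reloaded.max_position_embeddings == 812

One design note: with explicit keyword parameters, PretrainedConfig.from_pretrained feeds the entire config.json dict into cls(**config_dict), and any standard key the subclass also names is intercepted by the signature rather than handled by the parent; popping from **kwargs sidesteps that class of bug at the cost of losing introspectable defaults in the signature.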