neilmehta24 committed
Commit 39c36b4 · verified · 1 Parent(s): 6a61e1d

Delete configuration_ernie4_5.py

Files changed (1)
  1. configuration_ernie4_5.py +0 -127
configuration_ernie4_5.py DELETED
@@ -1,127 +0,0 @@
- # Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- from transformers import PretrainedConfig
-
-
- class Ernie4_5_Config(PretrainedConfig):
-     """
-     Configuration class for Ernie 4.5.
-
-     This class stores the configuration of an Ernie model, defining the model architecture.
-     It inherits from PretrainedConfig and can be used to control model outputs.
-     """
-
-     model_type = "ernie4_5"
-     keys_to_ignore_at_inference = ["past_key_values"]
-
-     # Default tensor parallel plan for the base Ernie 4.5 model
-     base_model_tp_plan = {
-         "layers.*.self_attn.q_proj": "colwise",
-         "layers.*.self_attn.k_proj": "colwise",
-         "layers.*.self_attn.v_proj": "colwise",
-         "layers.*.self_attn.o_proj": "rowwise",
-         "layers.*.mlp.gate_proj": "colwise",
-         "layers.*.mlp.up_proj": "colwise",
-         "layers.*.mlp.down_proj": "rowwise",
-     }
-     base_model_pp_plan = {
-         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
-         "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
-         "norm": (["hidden_states"], ["hidden_states"]),
-     }
-
-     def __init__(
-         self,
-         vocab_size=32000,
-         hidden_size=768,
-         intermediate_size=11008,
-         max_position_embeddings=32768,
-         num_hidden_layers=2,
-         num_attention_heads=2,
-         rms_norm_eps=1e-6,
-         use_cache=False,
-         use_flash_attention=False,
-         pad_token_id=0,
-         bos_token_id=1,
-         eos_token_id=2,
-         use_bias=False,
-         rope_theta=10000,
-         weight_share_add_bias=True,
-         ignored_index=-100,
-         attention_probs_dropout_prob=0.0,
-         hidden_dropout_prob=0.0,
-         compression_ratio: float = 1.0,
-         num_key_value_heads=None,
-         max_sequence_length=None,
-         **kwargs,
-     ):
-         """
-         Initialize the configuration with default or specified parameters.
-
-         Args:
-             vocab_size (int): Size of the vocabulary (number of unique tokens)
-             hidden_size (int): Dimensionality of the encoder layers and the pooler layer
-             intermediate_size (int): Dimensionality of the "intermediate" (feed-forward) layer
-             max_position_embeddings (int): Maximum sequence length the model can handle
-             num_hidden_layers (int): Number of hidden layers in the Transformer encoder
-             num_attention_heads (int): Number of attention heads for each attention layer
-             rms_norm_eps (float): The epsilon used by the RMS normalization layers
-             use_cache (bool): Whether to use caching for faster generation (decoding)
-             use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation
-             pad_token_id (int): Token ID used for padding sequences
-             bos_token_id (int): Token ID used for beginning-of-sequence
-             eos_token_id (int): Token ID used for end-of-sequence
-             use_bias (bool): Whether to use bias terms in linear layers
-             rope_theta (float): The base period of the RoPE embeddings
-             weight_share_add_bias (bool): Whether to share bias weights in certain layers
-             ignored_index (int): Target value that is ignored during loss computation
-             attention_probs_dropout_prob (float): Dropout probability for attention weights
-             hidden_dropout_prob (float): Dropout probability for hidden layers
-             compression_ratio (float): Ratio for KV cache compression (1.0 = no compression)
-             num_key_value_heads (int): Number of key/value heads (for Grouped Query Attention)
-             max_sequence_length (int): Maximum sequence length for positional embeddings
-             **kwargs: Additional keyword arguments passed to the parent class
-         """
-
-         # Set default for tied embeddings if not specified.
-         if "tie_word_embeddings" not in kwargs:
-             kwargs["tie_word_embeddings"] = False
-         super().__init__(
-             pad_token_id=pad_token_id,
-             bos_token_id=bos_token_id,
-             eos_token_id=eos_token_id,
-             **kwargs,
-         )
-         self.vocab_size = vocab_size
-         self.hidden_size = hidden_size
-         self.intermediate_size = intermediate_size
-         self.max_position_embeddings = max_position_embeddings
-         self.num_hidden_layers = num_hidden_layers
-         self.num_attention_heads = num_attention_heads
-         self.rms_norm_eps = rms_norm_eps
-         self.use_cache = use_cache
-         self.use_flash_attention = use_flash_attention
-         self.pad_token_id = pad_token_id
-         self.bos_token_id = bos_token_id
-         self.eos_token_id = eos_token_id
-         self.use_bias = use_bias
-         self.weight_share_add_bias = weight_share_add_bias
-         self.rope_theta = rope_theta
-         self.ignored_index = ignored_index
-         self.attention_probs_dropout_prob = attention_probs_dropout_prob
-         self.hidden_dropout_prob = hidden_dropout_prob
-         self.compression_ratio = compression_ratio
-         self.num_key_value_heads = num_key_value_heads
-         self.max_sequence_length = max_sequence_length
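
For reference, below is a minimal usage sketch of the class being removed, assuming the file were still importable from the repository root. The local import path and every keyword value are illustrative assumptions for the sketch, not settings taken from this commit.

```python
# Hypothetical usage of the deleted Ernie4_5_Config (illustrative only).
# The import path and all keyword values below are assumptions.
from configuration_ernie4_5 import Ernie4_5_Config

config = Ernie4_5_Config(
    vocab_size=32000,
    hidden_size=768,
    num_hidden_layers=2,
    num_attention_heads=2,
    num_key_value_heads=2,  # grouped-query attention; defaults to None if omitted
    rope_theta=10000,
    use_cache=True,         # enable KV caching for faster generation
)

print(config.model_type)           # "ernie4_5"
print(config.tie_word_embeddings)  # False unless passed explicitly via kwargs
```

The last line prints False because the constructor sets `tie_word_embeddings` to False whenever the caller does not pass it explicitly.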