pstjohn committed
Commit 8795250 · verified · 1 Parent(s): c1e3fff

Upload folder using huggingface_hub

Files changed (8)
  1. LICENSE +25 -0
  2. README.md +125 -3
  3. amplify_te.py +307 -0
  4. config.json +38 -0
  5. model.safetensors +3 -0
  6. special_tokens_map.json +37 -0
  7. tokenizer.json +154 -0
  8. tokenizer_config.json +59 -0
LICENSE ADDED
@@ -0,0 +1,25 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024 chandar-lab
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: MIT
+
+ MIT License
+
+ Copyright (c) 2024 chandar-lab
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,125 @@
- ---
- license: mit
- ---
+ ---
+ license: mit
+ datasets:
+ - chandar-lab/UR100P
+ language:
+ - en
+ tags:
+ - biology
+ ---
+
+ > [!NOTE]
+ > This model has been optimized using NVIDIA's [TransformerEngine](https://github.com/NVIDIA/TransformerEngine)
+ > library. Slight numerical differences may be observed between the original model and the optimized
+ > model. For instructions on how to install TransformerEngine, please refer to the
+ > [official documentation](https://github.com/NVIDIA/TransformerEngine?tab=readme-ov-file#installation).
+ >
+ > The original xformers-based models are available at [chandar-lab/AMPLIFY](https://huggingface.co/chandar-lab/AMPLIFY_350M).
+
+ ## AMPLIFY
+
+ AMPLIFY is an efficient, state-of-the-art protein language model pre-trained using masked language modeling on UniRef100, OAS, and SCOP ([UR100P](https://huggingface.co/datasets/chandar-lab/UR100P)). AMPLIFY can generate residue and protein embeddings, suggest mutations, differentiate disordered proteins from non-protein sequences, and much more. AMPLIFY is available in two sizes, 120M and 350M parameters, with the `_base` models not extended beyond 512 residues (Stage 1). The model architecture and pre-training procedure are detailed below. For more details, please refer to the [accompanying paper](https://www.biorxiv.org/content/10.1101/2024.09.23.614603v1).
+
+ - [`AMPLIFY_350M`](https://huggingface.co/nvidia/AMPLIFY_350M)
+ - [`AMPLIFY_350M_base`](https://huggingface.co/chandar-lab/AMPLIFY_350M_base)
+ - [`AMPLIFY_120M`](https://huggingface.co/nvidia/AMPLIFY_120M)
+ - [`AMPLIFY_120M_base`](https://huggingface.co/chandar-lab/AMPLIFY_120M_base)
+
+ ### Model Description
+
+ | | AMPLIFY 120M | AMPLIFY 350M |
+ | :----------------------------- | -----------: | -----------: |
+ | `hidden-size` | 640 | 960 |
+ | `num-hidden-layers` | 24 | 32 |
+ | `num-attention-heads` | 10 | 15 |
+ | `intermediate-size` | 2560 | 3840 |
+ | `max-position-embeddings` | 2048 | 2048 |
+ | `vocab-size` | 27 | 27 |
+ | `rope-theta` | 10000 | 10000 |
+ | `dropout-prob` | 0 | 0 |
+ | `embedding-init-range` | 0.02 | 0.02 |
+ | `norm-eps` | 1.0e-05 | 1.0e-05 |
+ | `hidden-act` | swiglu | swiglu |
+ | `pre-activation-layer-norm` | true | true |
+ | `layer-norm-after-embedding` | false | false |
+ | `layer-norm-before-last-layer` | true | true |
+ | `rms-norm` | true | true |
+ | `ffn-bias` | false | false |
+ | `attn-bias` | false | false |
+
+ ### Training Description
+
+ | | Stage 1 | Stage 2 |
+ | :------------------ | ----------: | ---------------------------: |
+ | `dataset` | UR100P | UR100P |
+ | `max-steps` | 1000000 | 25000 (120M) or 50000 (350M) |
+ | `max-length` | 512 | 2048 |
+ | `optimizer` | adamw | adamw |
+ | `lr` | 0.001 | 0.0001 |
+ | `betas` | (0.9, 0.95) | (0.9, 0.95) |
+ | `eps` | 1.0e-08 | 1.0e-08 |
+ | `weight-decay` | 0.01 | 0.01 |
+ | `scheduler` | cosinedecay | none |
+ | `warmup-steps` | 1,000 | none |
+ | `final-step` | 900,000 | none |
+ | `gradient-clipping` | 1.0 | 1.0 |
+ | `tf32` | true | true |
+ | `mixed-precision` | bf16 | bf16 |
+ | `padding` | max-length | max-length |
+ | `random-truncate` | true | true |
+ | `mask-probability` | 0.15 | 0.15 |
+ | `total-batch-size` | 4096 | 4096 |
+ | `deepspeed` | true | true |
+ | `zero-stage` | 3 | 3 |
+
+ ## Get Started
+
+ ```python
+ from transformers import AutoModel
+ from transformers import AutoTokenizer
+ from datasets import load_dataset
+
+ # Load AMPLIFY and tokenizer
+ model = AutoModel.from_pretrained("nvidia/AMPLIFY_350M", trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained("nvidia/AMPLIFY_350M", trust_remote_code=True)
+
+ # Move the model to GPU (required due to Flash Attention)
+ model = model.to("cuda")
+
+ # Load the UniProt validation set
+ dataset = load_dataset("chandar-lab/UR100P", data_dir="UniProt", split="test")
+
+ for sample in dataset:
+     # Protein
+     print("Sample: ", sample["name"], sample["sequence"])
+
+     # Tokenize the protein
+     input = tokenizer.encode(sample["sequence"], return_tensors="pt")
+     print("Input: ", input)
+
+     # Move to the GPU and make a prediction
+     input = input.to("cuda")
+     output = model(input)
+     print("Output: ", output)
+
+     break
+ ```
+
+ ## Citations
+
+ If you find the models useful in your research, we ask that you cite the paper:
+
+ ```bibtex
+ @article{Fournier2024.09.23.614603,
+   title = {Protein Language Models: Is Scaling Necessary?},
+   author = {Fournier, Quentin and Vernon, Robert M. and van der Sloot, Almer and Schulz, Benjamin and Chandar, Sarath and Langmead, Christopher James},
+   year = {2024},
+   journal = {bioRxiv},
+   publisher = {Cold Spring Harbor Laboratory},
+   doi = {10.1101/2024.09.23.614603},
+   url = {https://www.biorxiv.org/content/early/2024/09/23/2024.09.23.614603},
+   elocation-id = {2024.09.23.614603},
+   eprint = {https://www.biorxiv.org/content/early/2024/09/23/2024.09.23.614603.full.pdf}
+ }
+ ```
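
The model card above notes that AMPLIFY can suggest mutations. A minimal sketch of scoring a masked position with the masked-LM head registered for this checkpoint (the checkpoint id follows the README's Get Started snippet; the sequence and masked position are purely illustrative):

```python
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nvidia/AMPLIFY_350M", trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained("nvidia/AMPLIFY_350M", trust_remote_code=True)
model = model.to("cuda")  # a GPU is required, as in the Get Started example

sequence = "MSVVGIDLGFQSCYVAVARAGG"  # illustrative sequence, not from the dataset
inputs = tokenizer(sequence, return_tensors="pt").to("cuda")

# Mask one residue (index 5 counts the prepended <bos>) and rank the amino acids
# the model would place there.
masked = inputs["input_ids"].clone()
masked[0, 5] = tokenizer.mask_token_id
with torch.no_grad():
    logits = model(masked, attention_mask=inputs["attention_mask"]).logits
top5 = torch.topk(logits[0, 5], k=5).indices
print(tokenizer.convert_ids_to_tokens(top5.tolist()))
```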
amplify_te.py ADDED
@@ -0,0 +1,307 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024 chandar-lab
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: MIT
+ #
+ # Adapted from https://huggingface.co/chandar-lab/AMPLIFY_120M/blob/main/amplify.py
+
+ import torch
+ import transformer_engine.pytorch
+ from torch import nn
+ from transformer_engine.pytorch.attention.rope import RotaryPositionEmbedding
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.modeling_outputs import BaseModelOutput, MaskedLMOutput
+ from transformers.modeling_utils import PreTrainedModel
+
+
+ class AMPLIFYConfig(PretrainedConfig):
+     """AMPLIFY model configuration."""
+
+     model_type = "AMPLIFY"
+
+     # All config parameters must have a default value.
+     def __init__(
+         self,
+         hidden_size: int = 960,
+         num_hidden_layers: int = 32,
+         num_attention_heads: int = 15,
+         intermediate_size: int = 3840,
+         dropout_prob: float = 0,
+         embedding_init_range: float = 0.02,
+         decoder_init_range: float = 0.02,
+         rms_norm: bool = True,
+         norm_eps: float = 1e-05,
+         hidden_act: str = "SwiGLU",
+         layer_norm_after_embedding: bool = False,
+         layer_norm_before_last_layer: bool = True,
+         vocab_size: int = 27,
+         ffn_bias: bool = False,
+         att_bias: bool = False,
+         pad_token_id: int = 0,
+         max_length: int = 2048,
+         **kwargs,
+     ):
+         """Initialize an AMPLIFYConfig.
+
+         Args:
+             hidden_size (int): The hidden size of the model.
+             num_hidden_layers (int): The number of hidden layers in the model.
+             num_attention_heads (int): The number of attention heads in the model.
+             intermediate_size (int): The intermediate size of the model.
+             dropout_prob (float): The dropout probability of the model.
+             embedding_init_range (float): The range of the embedding initialization.
+             decoder_init_range (float): The range of the decoder initialization.
+             rms_norm (bool): Whether to use RMSNorm.
+             norm_eps (float): The epsilon for the normalization.
+             hidden_act (str): The activation function of the model.
+             layer_norm_after_embedding (bool): Whether to use layer normalization after the embedding.
+             layer_norm_before_last_layer (bool): Whether to use layer normalization before the last layer.
+             vocab_size (int): The vocabulary size of the model.
+             ffn_bias (bool): Whether to use bias in the feedforward network.
+             att_bias (bool): Whether to use bias in the attention.
+             pad_token_id (int): The padding token id.
+             max_length (int): The maximum length of the sequence.
+             **kwargs: Additional arguments.
+         """
+         super().__init__(**kwargs)
+
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.intermediate_size = intermediate_size
+         self.dropout_prob = dropout_prob
+         self.embedding_init_range = embedding_init_range
+         self.decoder_init_range = decoder_init_range
+         self.rms_norm = rms_norm
+         self.norm_eps = norm_eps
+         self.hidden_act = hidden_act
+         self.layer_norm_after_embedding = layer_norm_after_embedding
+         self.layer_norm_before_last_layer = layer_norm_before_last_layer
+         self.vocab_size = vocab_size
+         self.ffn_bias = ffn_bias
+         self.att_bias = att_bias
+         self.pad_token_id = pad_token_id
+         self.max_length = max_length
+
+
+ class AMPLIFYPreTrainedModel(PreTrainedModel):
+     """AMPLIFY pre-trained model."""
+
+     config_class = AMPLIFYConfig
+
+     def _init_weights(self, module):
+         if isinstance(
+             module, (nn.Linear, transformer_engine.pytorch.Linear, transformer_engine.pytorch.LayerNormLinear)
+         ):
+             module.weight.data.uniform_(-self.config.decoder_init_range, self.config.decoder_init_range)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         if isinstance(module, nn.Embedding):
+             module.weight.data.uniform_(-self.config.embedding_init_range, self.config.embedding_init_range)
+
+
+ class AMPLIFY(AMPLIFYPreTrainedModel):
+     """The main model class."""
+
+     def __init__(self, config: AMPLIFYConfig, **kwargs):
+         """Initialize an AMPLIFY model.
+
+         Args:
+             config (AMPLIFYConfig): The configuration of the model.
+             **kwargs: Additional arguments.
+         """
+         super().__init__(config)
+
+         self.config = config
+
+         self.encoder = nn.Embedding(
+             config.vocab_size,
+             config.hidden_size,
+             padding_idx=config.pad_token_id,
+             dtype=config.torch_dtype,
+         )
+
+         if config.layer_norm_after_embedding:
+             self.layer_norm_1 = (
+                 transformer_engine.pytorch.RMSNorm(
+                     config.hidden_size, config.norm_eps, params_dtype=config.torch_dtype
+                 )
+                 if config.rms_norm
+                 else transformer_engine.pytorch.LayerNorm(
+                     config.hidden_size, config.norm_eps, params_dtype=config.torch_dtype
+                 )
+             )
+
+         if config.hidden_act.lower() == "swiglu":
+             # To keep the number of parameters and the amount of computation constant, we reduce the
+             # number of hidden units by a factor of 2/3 (https://arxiv.org/pdf/2002.05202.pdf) and
+             # make it a multiple of 8 to avoid RuntimeError due to misaligned operand
+             multiple_of = 8
+             intermediate_size = int(2 * config.intermediate_size / 3)
+             intermediate_size = multiple_of * ((intermediate_size + multiple_of - 1) // multiple_of)
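+             # Worked example: with the 350M defaults (intermediate_size=3840), int(2 * 3840 / 3) = 2560,
+             # which is already a multiple of 8; with the 120M config (intermediate_size=2560),
+             # int(2 * 2560 / 3) = 1706, which rounds up to 1712.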
+
+         self.transformer_encoder = nn.ModuleList()
+         for layer_num in range(config.num_hidden_layers):
+             self.transformer_encoder.append(
+                 transformer_engine.pytorch.TransformerLayer(
+                     hidden_size=config.hidden_size,
+                     ffn_hidden_size=intermediate_size,
+                     num_attention_heads=config.num_attention_heads,
+                     layernorm_epsilon=config.norm_eps,
+                     hidden_dropout=config.dropout_prob,
+                     attention_dropout=config.dropout_prob,
+                     apply_residual_connection_post_layernorm=False,
+                     layer_type="encoder",
+                     self_attn_mask_type="padding",
+                     normalization="RMSNorm" if config.rms_norm else "LayerNorm",
+                     fuse_qkv_params=True,
+                     qkv_weight_interleaved=True,
+                     output_layernorm=False,
+                     bias=False,
+                     activation=config.hidden_act.lower(),
+                     attn_input_format="bshd",
+                     layer_number=layer_num + 1,
+                     name="encoder_block",
+                     window_size=(-1, -1),
+                     rotary_pos_interleaved=True,
+                     seq_length=config.max_length,
+                     params_dtype=config.torch_dtype,
+                 )
+             )
+
+         self.freqs_cis = RotaryPositionEmbedding(config.hidden_size // config.num_attention_heads, interleaved=True)(
+             config.max_length
+         )
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids,
+         attention_mask=None,
+         output_hidden_states=False,
+         output_attentions=False,
+         labels=None,
+         **kwargs,
+     ) -> BaseModelOutput:
+         """Forward pass of the AMPLIFY model.
+
+         Args:
+             input_ids (torch.Tensor): The input ids.
+             attention_mask (torch.Tensor): The attention mask.
+             output_hidden_states (bool): Whether to output the hidden states.
+             output_attentions (bool): Whether to output the attention weights.
+             labels (torch.Tensor): The labels.
+             **kwargs: Additional arguments.
+
+         Returns:
+             BaseModelOutput: The output of the model.
+         """
+         # Initialize
+         hidden_states = []
+
+         # Attention mask
+         if attention_mask is not None and attention_mask.dtype is torch.int64:
+             # TE expects a boolean attention mask, where "True" indicates a token to be masked.
+             attention_mask = ~attention_mask.to(bool)
+
+         # RoPE
+         self.freqs_cis = self.freqs_cis.to(input_ids.device, non_blocking=True)
+         freqs_cis = self.freqs_cis[: input_ids.shape[1]]
+
+         # Embedding
+         x = self.encoder(input_ids)
+         if self.config.layer_norm_after_embedding:
+             x = self.layer_norm_1(x)
+
+         # Transformer encoder
+         for layer in self.transformer_encoder:
+             x = layer(x, attention_mask, rotary_pos_emb=freqs_cis)
+             if output_hidden_states:
+                 hidden_states.append(x)
+         if output_attentions:
+             raise ValueError("output_attentions is not supported for TE")
+
+         return BaseModelOutput(
+             last_hidden_state=x,
+             hidden_states=tuple(hidden_states) if hidden_states else None,
+             attentions=None,
+         )
+
+
+ class AMPLIFYForMaskedLM(AMPLIFYPreTrainedModel):
+     """AMPLIFY for masked language modeling."""
+
+     def __init__(self, config: AMPLIFYConfig, **kwargs):
+         """Initialize an AMPLIFYForMaskedLM model.
+
+         Args:
+             config (AMPLIFYConfig): The configuration of the model.
+             **kwargs: Additional arguments.
+         """
+         super().__init__(config)
+         self.amplify = AMPLIFY(config, **kwargs)
+
+         if config.layer_norm_before_last_layer:
+             self.decoder = transformer_engine.pytorch.LayerNormLinear(
+                 config.hidden_size,
+                 config.vocab_size,
+                 config.norm_eps,
+                 params_dtype=config.torch_dtype,
+                 normalization="RMSNorm" if config.rms_norm else "LayerNorm",
+                 init_method=lambda x: torch.nn.init.uniform_(
+                     x, -self.config.decoder_init_range, self.config.decoder_init_range
+                 ),
+             )
+
+         else:
+             self.decoder = transformer_engine.pytorch.Linear(
+                 config.hidden_size, config.vocab_size, params_dtype=config.torch_dtype
+             )
+
+     def forward(
+         self,
+         input_ids,
+         attention_mask=None,
+         output_hidden_states=False,
+         output_attentions=False,
+         labels=None,
+         **kwargs,
+     ) -> MaskedLMOutput:
+         """Forward pass of the AMPLIFYForMaskedLM model.
+
+         Args:
+             input_ids (torch.Tensor): The input ids.
+             attention_mask (torch.Tensor): The attention mask.
+             output_hidden_states (bool): Whether to output the hidden states.
+             output_attentions (bool): Whether to output the attention weights.
+             labels (torch.Tensor): The labels.
+             **kwargs: Additional arguments.
+
+         Returns:
+             MaskedLMOutput: The output of the model.
+         """
+         outputs = self.amplify(
+             input_ids,
+             attention_mask,
+             output_hidden_states,
+             output_attentions,
+             labels,
+             **kwargs,
+         )
+
+         # Classification head with layer norm
+         logits = self.decoder(outputs.last_hidden_state)
+
+         if labels is not None:
+             loss = nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1))
+
+         else:
+             loss = None
+
+         # Return logits or the output of the last hidden layer
+         return MaskedLMOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+         )
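
One detail worth calling out from the forward pass above: Hugging Face attention masks use 1 for real tokens and 0 for padding, while TransformerEngine expects a boolean mask in which True marks positions to exclude. A tiny sketch of the conversion the model performs (mask values are illustrative):

```python
import torch

# Hugging Face convention: 1 = real token, 0 = padding.
hf_attention_mask = torch.tensor([[1, 1, 1, 0, 0]], dtype=torch.int64)

# amplify_te inverts this into TransformerEngine's convention,
# where True marks the positions to mask out.
te_attention_mask = ~hf_attention_mask.to(bool)
print(te_attention_mask)  # tensor([[False, False, False,  True,  True]])
```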
config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "_name_": "AMPLIFY",
+   "architectures": [
+     "AMPLIFYForMaskedLM"
+   ],
+   "att_bias": false,
+   "auto_map": {
+     "AutoConfig": "amplify_te.AMPLIFYConfig",
+     "AutoModel": "amplify_te.AMPLIFY",
+     "AutoModelForMaskedLM": "amplify_te.AMPLIFYForMaskedLM"
+   },
+   "bos_token_id": 3,
+   "decoder_init_range": 0.02,
+   "dropout_prob": 0,
+   "embedding_init_range": 0.02,
+   "eos_token_id": 4,
+   "ffn_bias": false,
+   "hidden_act": "SwiGLU",
+   "hidden_size": 640,
+   "intermediate_size": 2560,
+   "layer_norm_after_embedding": false,
+   "layer_norm_before_last_layer": true,
+   "mask_token_id": 2,
+   "max_length": 2048,
+   "model_type": "AMPLIFY",
+   "norm_eps": 1e-05,
+   "num_attention_heads": 10,
+   "num_hidden_layers": 24,
+   "other_special_token_ids": null,
+   "pad_token_id": 0,
+   "pre_activation_layer_norm": true,
+   "rms_norm": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.54.0.dev0",
+   "unk_token_id": 1,
+   "vocab_path": "conf/tokenizer/amplify_vocab.txt",
+   "vocab_size": 27
+ }
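
The `auto_map` block is what lets the `Auto*` classes resolve the code in `amplify_te.py` when `trust_remote_code=True` is passed. A minimal sketch (assuming this commit belongs to the `nvidia/AMPLIFY_120M` checkpoint, whose 640/24/10 geometry the values above match):

```python
from transformers import AutoConfig

# AutoConfig follows auto_map to amplify_te.AMPLIFYConfig in this repository.
config = AutoConfig.from_pretrained("nvidia/AMPLIFY_120M", trust_remote_code=True)
print(type(config).__name__)                                                      # AMPLIFYConfig
print(config.hidden_size, config.num_hidden_layers, config.num_attention_heads)   # 640 24 10
```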
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c24689ec4865382b883b0f7bfbb4b504dc3d671c71270dcd209422fa53553df
+ size 473138596
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "bos_token": {
+     "content": "<bos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<eos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,154 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [
+     {
+       "id": 0,
+       "content": "<pad>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 1,
+       "content": "<unk>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 2,
+       "content": "<mask>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 3,
+       "content": "<bos>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 4,
+       "content": "<eos>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": null,
+   "pre_tokenizer": {
+     "type": "Split",
+     "pattern": {
+       "String": ""
+     },
+     "behavior": "Removed",
+     "invert": false
+   },
+   "post_processor": {
+     "type": "TemplateProcessing",
+     "single": [
+       {
+         "SpecialToken": {
+           "id": "<bos>",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "<eos>",
+           "type_id": 0
+         }
+       }
+     ],
+     "pair": [
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "B",
+           "type_id": 1
+         }
+       }
+     ],
+     "special_tokens": {
+       "<bos>": {
+         "id": "<bos>",
+         "ids": [
+           3
+         ],
+         "tokens": [
+           "<bos>"
+         ]
+       },
+       "<eos>": {
+         "id": "<eos>",
+         "ids": [
+           4
+         ],
+         "tokens": [
+           "<eos>"
+         ]
+       }
+     }
+   },
+   "decoder": null,
+   "model": {
+     "type": "WordPiece",
+     "unk_token": "<unk>",
+     "continuing_subword_prefix": "##",
+     "max_input_chars_per_word": 100,
+     "vocab": {
+       "<pad>": 0,
+       "<unk>": 1,
+       "<mask>": 2,
+       "<bos>": 3,
+       "<eos>": 4,
+       "|": 5,
+       "L": 6,
+       "A": 7,
+       "G": 8,
+       "V": 9,
+       "S": 10,
+       "E": 11,
+       "R": 12,
+       "T": 13,
+       "I": 14,
+       "D": 15,
+       "P": 16,
+       "K": 17,
+       "Q": 18,
+       "N": 19,
+       "F": 20,
+       "Y": 21,
+       "M": 22,
+       "H": 23,
+       "W": 24,
+       "C": 25,
+       "B": 26
+     }
+   }
+ }
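
The tokenizer splits a protein into single characters, looks each residue up in the character-level vocabulary above, and wraps the result in `<bos>` and `<eos>` via the template post-processor. A short sketch (same assumed `nvidia/AMPLIFY_120M` checkpoint id as above; the expected ids follow from the vocab listed in this file):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nvidia/AMPLIFY_120M", trust_remote_code=True)

ids = tokenizer.encode("ACD")
print(ids)                                   # expected [3, 7, 25, 15, 4]: <bos>, A, C, D, <eos>
print(tokenizer.convert_ids_to_tokens(ids))  # ['<bos>', 'A', 'C', 'D', '<eos>']
```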
tokenizer_config.json ADDED
@@ -0,0 +1,59 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<mask>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<bos>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<eos>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<bos>",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<eos>",
+   "extra_special_tokens": {},
+   "mask_token": "<mask>",
+   "model_input_names": [
+     "input_ids",
+     "attention_mask"
+   ],
+   "model_max_length": 2048,
+   "pad_token": "<pad>",
+   "padding_side": "right",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "truncation_side": "right",
+   "unk_token": "<unk>"
+ }
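
Per this tokenizer configuration, sequences are right-padded with `<pad>` up to a `model_max_length` of 2048, and the tokenizer emits both `input_ids` and `attention_mask`. A minimal sketch of batched inference that relies on this (sequences are illustrative; a CUDA device and the same assumed checkpoint id are used):

```python
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nvidia/AMPLIFY_120M", trust_remote_code=True)
model = AutoModel.from_pretrained("nvidia/AMPLIFY_120M", trust_remote_code=True).to("cuda")

# The shorter sequence is right-padded with <pad> (id 0); the attention_mask
# marks the real tokens so the encoder can ignore the padding.
batch = tokenizer(["MSVVGIDLGFQSCYVAVARAGG", "MKT"], padding=True, return_tensors="pt").to("cuda")

with torch.no_grad():
    out = model(batch["input_ids"], attention_mask=batch["attention_mask"])
print(out.last_hidden_state.shape)  # (2, padded_length, hidden_size)
```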