from transformers import GPTNeoConfig


class Gia2Config(GPTNeoConfig):
| r""" | |
| This is the configuration class to store the configuration of a [`Gia2Model`]. It is used to instantiate a Gia2 | |
| model according to the specified arguments, defining the model architecture. Instantiating a configuration with | |
| the defaults will yield a similar configuration to that of the ... (TODO) | |
| Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the | |
| documentation from [`PretrainedConfig`] for more information. | |
| Args: | |
| vocab_size (`int`, *optional*, defaults to 50257): | |
| Vocabulary size of the GPT Neo model. Defines the number of different tokens that can be represented by the | |
| `inputs_ids` passed when calling [`GPTNeoModel`]. Vocabulary size of the model. Defines the different | |
| tokens that can be represented by the *inputs_ids* passed to the forward method of [`GPTNeoModel`]. | |
| max_position_embeddings (`int`, *optional*, defaults to 2048): | |
| The maximum sequence length that this model might ever be used with. Typically set this to something large | |
| just in case (e.g., 512 or 1024 or 2048). | |
| hidden_size (`int`, *optional*, defaults to 2048): | |
| Dimensionality of the encoder layers and the pooler layer. | |
| num_layers (`int`, *optional*, defaults to 24): | |
| Number of hidden layers in the Transformer encoder. | |
| attention_types (`List`, *optional*, defaults to `[[["global", "local"], 12]]`): | |
| The type of attention for each layer in a `List` of the following format `[[["attention_type"], | |
| num_layerss]]` e.g. for a 24 layer model `[[["global"], 24]]` or `[[["global", "local"], 12]]` Choose the | |
| value of `attention_type` from `["global", "local"]` | |
| num_heads (`int`, *optional*, defaults to 16): | |
| Number of attention heads for each attention layer in the Transformer encoder. | |
| intermediate_size (`int`, *optional*, defaults to 8192): | |
| Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. | |
| window_size (`int`, *optional*, defaults to 256): | |
| The size of the sliding window for local attention. | |
| activation_function (`str` or `function`, *optional*, defaults to `"gelu_new"`): | |
| The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, | |
| `"relu"`, `"selu"` and `"gelu_new"` are supported. | |
| resid_dropout (`float`, *optional*, defaults to 0.0): | |
| Residual dropout used in the attention pattern. | |
| embed_dropout (`float`, *optional*, defaults to 0.0): | |
| The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. | |
| attention_dropout (`float`, *optional*, defaults to 0.0): | |
| The dropout ratio for the attention probabilities. | |
| classifier_dropout (`float`, *optional*, defaults to 0.1): | |
| Argument used when doing token classification, used in the model [`GPTNeoForTokenClassification`]. The | |
| dropout ratio for the hidden layer. | |
| layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): | |
| The epsilon used by the layer normalization layers. | |
| initializer_range (`float`, *optional*, defaults to 0.02): | |
| The standard deviation of the truncated_normal_initializer for initializing all weight matrices. | |
| use_cache (`bool`, *optional*, defaults to `True`): | |
| Whether or not the model should return the last key/values attentions (not used by all models). Only | |
| relevant if `config.is_decoder=True`. | |
| bos_token_id (`int`, *optional*, defaults to 50256): | |
| The id of the beginning of sentence token in the vocabulary. | |
| eos_token_id (`int`, *optional*, defaults to 50256): | |
| The id of the end of sentence token in the vocabulary. | |
| max_continuous_size (`int`, *optional*, default to 376): | |
| The maximum size of the continuous values. | |
| max_discrete_value (`int`, *optional*, default to 18): | |
| The maximum value of the discrete values. | |
| image_size (`int`, *optional*, defaults to 224): | |
| The size (resolution) of each image. | |
| patch_size (`int`, *optional*, defaults to 16): | |
| The size (resolution) of each patch. | |
| tokenizer_class (`str`, *optional*, defaults to `"GPT2Tokenizer"`): | |
| The name of the tokenizer class to use. | |
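
    Example (a minimal usage sketch; the module name in the import is an assumption, adjust it to your project
    layout):

    ```python
    >>> from configuration_gia2 import Gia2Config

    >>> # Initializing a Gia2 configuration with default values
    >>> configuration = Gia2Config()

    >>> # Fields added on top of the inherited GPTNeoConfig
    >>> configuration.max_continuous_size, configuration.patch_size
    (377, 16)
    ```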
| """ | |
| model_type = "gia2" | |
| def __init__( | |
        self,
        vocab_size=50257,
        max_position_embeddings=2048,
        hidden_size=2048,
        num_layers=24,
        attention_types=[[["global", "local"], 12]],
        num_heads=16,
        intermediate_size=None,
        window_size=256,
        activation_function="gelu_new",
        resid_dropout=0.0,
        embed_dropout=0.0,
        attention_dropout=0.0,
        classifier_dropout=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        max_continuous_size=377,
        max_discrete_value=18,
        image_size=224,
        num_channels=3,
        patch_size=16,
        tokenizer_class="GPT2Tokenizer",
        **kwargs,
    ):
        super().__init__(
            vocab_size=vocab_size,
            max_position_embeddings=max_position_embeddings,
            hidden_size=hidden_size,
            num_layers=num_layers,
            attention_types=attention_types,
            num_heads=num_heads,
            intermediate_size=intermediate_size,
            window_size=window_size,
            activation_function=activation_function,
            resid_dropout=resid_dropout,
            embed_dropout=embed_dropout,
            attention_dropout=attention_dropout,
            classifier_dropout=classifier_dropout,
            layer_norm_epsilon=layer_norm_epsilon,
            initializer_range=initializer_range,
            use_cache=use_cache,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tokenizer_class=tokenizer_class,
            **kwargs,
        )
        self.max_continuous_size = max_continuous_size
        self.max_discrete_value = max_discrete_value
        self.image_size = image_size
        self.num_channels = num_channels
        self.patch_size = patch_size


Gia2Config.register_for_auto_class()
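
# Note: `register_for_auto_class()` only tags the class so that, once the configuration is saved with
# `save_pretrained` (or pushed to the Hub) together with this code file, it can be reloaded through the auto
# classes. A minimal sketch of that round trip (the checkpoint name "user/gia2-checkpoint" is hypothetical):
#
#     from transformers import AutoConfig
#
#     config = AutoConfig.from_pretrained("user/gia2-checkpoint", trust_remote_code=True)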