Update modeling_codeshell.py
Browse files
- modeling_codeshell.py +1 -15
    	
        modeling_codeshell.py
    CHANGED
    
    | @@ -457,15 +457,12 @@ class CodeShellPreTrainedModel(PreTrainedModel): | |
| 457 |  | 
| 458 |  | 
| 459 | 
             
            GPT_BIGCODE_START_DOCSTRING = r"""
         | 
| 460 | 
            -
             | 
| 461 | 
             
                This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
         | 
| 462 | 
             
                library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
         | 
| 463 | 
             
                etc.)
         | 
| 464 | 
            -
             | 
| 465 | 
             
                This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
         | 
| 466 | 
             
                Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
         | 
| 467 | 
             
                and behavior.
         | 
| 468 | 
            -
             | 
| 469 | 
             
                Parameters:
         | 
| 470 | 
             
                    config ([`CodeShellConfig`]): Model configuration class with all the parameters of the model.
         | 
| 471 | 
             
                        Initializing with a config file does not load the weights associated with the model, only the
         | 
| @@ -478,13 +475,10 @@ GPT_BIGCODE_INPUTS_DOCSTRING = r""" | |
| 478 | 
             
                        `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
         | 
| 479 | 
             
                        `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
         | 
| 480 | 
             
                        sequence tokens in the vocabulary.
         | 
| 481 | 
            -
             | 
| 482 | 
             
                        If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
         | 
| 483 | 
             
                        `input_ids`.
         | 
| 484 | 
            -
             | 
| 485 | 
             
                        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
         | 
| 486 | 
             
                        [`PreTrainedTokenizer.__call__`] for details.
         | 
| 487 | 
            -
             | 
| 488 | 
             
                        [What are input IDs?](../glossary#input-ids)
         | 
| 489 | 
             
                    past_key_values (`Tuple[torch.Tensor]` of length `config.n_layers`):
         | 
| 490 | 
             
                        Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
         | 
| @@ -492,39 +486,30 @@ GPT_BIGCODE_INPUTS_DOCSTRING = r""" | |
| 492 | 
             
                        their past given to this model should not be passed as `input_ids` as they have already been computed.
         | 
| 493 | 
             
                    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
         | 
| 494 | 
             
                        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
         | 
| 495 | 
            -
             | 
| 496 | 
             
                        - 1 for tokens that are **not masked**,
         | 
| 497 | 
             
                        - 0 for tokens that are **masked**.
         | 
| 498 | 
            -
             | 
| 499 | 
             
                        If `past_key_values` is used, `attention_mask` needs to contain the masking strategy that was used for
         | 
| 500 | 
             
                        `past_key_values`. In other words, the `attention_mask` always has to have the length:
         | 
| 501 | 
             
                        `len(past_key_values) + len(input_ids)`
         | 
| 502 | 
            -
             | 
| 503 | 
             
                        [What are attention masks?](../glossary#attention-mask)
         | 
| 504 | 
             
                    token_type_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`, *optional*):
         | 
| 505 | 
             
                        Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
         | 
| 506 | 
             
                        1]`:
         | 
| 507 | 
            -
             | 
| 508 | 
             
                        - 0 corresponds to a *sentence A* token,
         | 
| 509 | 
             
                        - 1 corresponds to a *sentence B* token.
         | 
| 510 | 
            -
             | 
| 511 | 
             
                        [What are token type IDs?](../glossary#token-type-ids)
         | 
| 512 | 
             
                    position_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
         | 
| 513 | 
             
                        Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
         | 
| 514 | 
             
                        config.max_position_embeddings - 1]`.
         | 
| 515 | 
            -
             | 
| 516 | 
             
                        [What are position IDs?](../glossary#position-ids)
         | 
| 517 | 
             
                    head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
         | 
| 518 | 
             
                        Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
         | 
| 519 | 
            -
             | 
| 520 | 
             
                        - 1 indicates the head is **not masked**,
         | 
| 521 | 
             
                        - 0 indicates the head is **masked**.
         | 
| 522 | 
            -
             | 
| 523 | 
             
                    inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
         | 
| 524 | 
             
                        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
         | 
| 525 | 
             
                        is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
         | 
| 526 | 
             
                        model's internal embedding lookup matrix.
         | 
| 527 | 
            -
             | 
| 528 | 
             
                        If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
         | 
| 529 | 
             
                        `past_key_values`).
         | 
| 530 | 
             
                    use_cache (`bool`, *optional*):
         | 
| @@ -959,6 +944,7 @@ class CodeShellForCausalLM(CodeShellPreTrainedModel): | |
| 959 | 
             
                    prompt += ai_name.rstrip()
         | 
| 960 |  | 
| 961 | 
             
                    max_new_tokens = max_new_tokens or self.generation_config.max_new_tokens
         | 
|  | |
| 962 | 
             
                    max_input_tokens = self.config.n_positions - max_new_tokens
         | 
| 963 |  | 
| 964 | 
             
                    input_tokens = tokenizer.encode(prompt)
         | 
|  | |
| 457 |  | 
| 458 |  | 
| 459 | 
             
            GPT_BIGCODE_START_DOCSTRING = r"""
         | 
|  | |
| 460 | 
             
                This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
         | 
| 461 | 
             
                library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
         | 
| 462 | 
             
                etc.)
         | 
|  | |
| 463 | 
             
                This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
         | 
| 464 | 
             
                Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
         | 
| 465 | 
             
                and behavior.
         | 
|  | |
| 466 | 
             
                Parameters:
         | 
| 467 | 
             
                    config ([`CodeShellConfig`]): Model configuration class with all the parameters of the model.
         | 
| 468 | 
             
                        Initializing with a config file does not load the weights associated with the model, only the
         | 
|  | |
| 475 | 
             
                        `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
         | 
| 476 | 
             
                        `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
         | 
| 477 | 
             
                        sequence tokens in the vocabulary.
         | 
|  | |
| 478 | 
             
                        If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
         | 
| 479 | 
             
                        `input_ids`.
         | 
|  | |
| 480 | 
             
                        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
         | 
| 481 | 
             
                        [`PreTrainedTokenizer.__call__`] for details.
         | 
|  | |
| 482 | 
             
                        [What are input IDs?](../glossary#input-ids)
         | 
| 483 | 
             
                    past_key_values (`Tuple[torch.Tensor]` of length `config.n_layers`):
         | 
| 484 | 
             
                        Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
         | 
|  | |
| 486 | 
             
                        their past given to this model should not be passed as `input_ids` as they have already been computed.
         | 
| 487 | 
             
                    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
         | 
| 488 | 
             
                        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
         | 
|  | |
| 489 | 
             
                        - 1 for tokens that are **not masked**,
         | 
| 490 | 
             
                        - 0 for tokens that are **masked**.
         | 
|  | |
| 491 | 
             
                        If `past_key_values` is used, `attention_mask` needs to contain the masking strategy that was used for
         | 
| 492 | 
             
                        `past_key_values`. In other words, the `attention_mask` always has to have the length:
         | 
| 493 | 
             
                        `len(past_key_values) + len(input_ids)`
         | 
|  | |
| 494 | 
             
                        [What are attention masks?](../glossary#attention-mask)
         | 
| 495 | 
             
                    token_type_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`, *optional*):
         | 
| 496 | 
             
                        Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
         | 
| 497 | 
             
                        1]`:
         | 
|  | |
| 498 | 
             
                        - 0 corresponds to a *sentence A* token,
         | 
| 499 | 
             
                        - 1 corresponds to a *sentence B* token.
         | 
|  | |
| 500 | 
             
                        [What are token type IDs?](../glossary#token-type-ids)
         | 
| 501 | 
             
                    position_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
         | 
| 502 | 
             
                        Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
         | 
| 503 | 
             
                        config.max_position_embeddings - 1]`.
         | 
|  | |
| 504 | 
             
                        [What are position IDs?](../glossary#position-ids)
         | 
| 505 | 
             
                    head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
         | 
| 506 | 
             
                        Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
         | 
|  | |
| 507 | 
             
                        - 1 indicates the head is **not masked**,
         | 
| 508 | 
             
                        - 0 indicates the head is **masked**.
         | 
|  | |
| 509 | 
             
                    inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
         | 
| 510 | 
             
                        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
         | 
| 511 | 
             
                        is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
         | 
| 512 | 
             
                        model's internal embedding lookup matrix.
         | 
|  | |
| 513 | 
             
                        If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
         | 
| 514 | 
             
                        `past_key_values`).
         | 
| 515 | 
             
                    use_cache (`bool`, *optional*):
         | 
|  | |
| 944 | 
             
                    prompt += ai_name.rstrip()
         | 
| 945 |  | 
| 946 | 
             
                    max_new_tokens = max_new_tokens or self.generation_config.max_new_tokens
         | 
| 947 | 
            +
                    max_new_tokens = max_new_tokens or 128
         | 
| 948 | 
             
                    max_input_tokens = self.config.n_positions - max_new_tokens
         | 
| 949 |  | 
| 950 | 
             
                    input_tokens = tokenizer.encode(prompt)
         | 
