Modelling fixes (#2)

Commits:
- Modelling fixes (7e409e9ff1f5b669417a77d8c4eb5d949a76244f)
- Update modeling_florence2.py (38dde526b82d731d1fc28c89bb29da5f19d43af9)

modeling_florence2.py CHANGED (+9 -2)
@@ -2240,6 +2240,10 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
         decoding.
 
     Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss.
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
             Sequence of hidden-states at the output of the last layer of the decoder of the model.
 
@@ -2288,7 +2292,8 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
 
             image_hidden_states of the model produced by the vision encoder
     """
-
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
     last_hidden_state: torch.FloatTensor = None
     past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
     decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@@ -2297,6 +2302,7 @@ class Florence2Seq2SeqLMOutput(ModelOutput):
     encoder_last_hidden_state: Optional[torch.FloatTensor] = None
     encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
     encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    image_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
 
 
 FLORENCE2_START_DOCSTRING = r"""
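The three hunks above add `loss` and `logits` fields (plus an `image_hidden_states` slot) to `Florence2Seq2SeqLMOutput`, so a forward pass that receives `labels` now surfaces the language-modeling loss directly on the output object. A minimal usage sketch, assuming a standard Florence-2 checkpoint that ships this modeling file; the checkpoint name, task prompt, and label text below are illustrative assumptions, not part of this commit:

from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

# Illustrative checkpoint; any Florence-2 checkpoint bundling this
# modeling_florence2.py should expose the same output fields.
ckpt = "microsoft/Florence-2-base"
model = AutoModelForCausalLM.from_pretrained(ckpt, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(ckpt, trust_remote_code=True)

image = Image.new("RGB", (512, 512))  # placeholder image
inputs = processor(text="<CAPTION>", images=image, return_tensors="pt")
labels = processor.tokenizer("A blank image.", return_tensors="pt").input_ids

outputs = model(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    labels=labels,
)

# Fields added by this commit: loss is populated because labels were
# passed; logits are the pre-softmax vocabulary scores.
print(outputs.loss)          # scalar torch.FloatTensor
print(outputs.logits.shape)  # (batch_size, sequence_length, vocab_size)
print(outputs.image_hidden_states)  # declared here; None unless forward fills it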
@@ -2731,7 +2737,8 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
                 image_features = self._encode_image(pixel_values)
                 inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
 
-
+        if inputs_embeds is not None:
+            attention_mask = attention_mask.to(inputs_embeds.dtype)
         outputs = self.language_model(
             attention_mask=attention_mask,
             labels=labels,
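The final hunk casts the attention mask to the dtype of `inputs_embeds` before invoking the language model. The commit message does not state the motivation, but the likely failure mode is a dtype mismatch when the merged text/image embeddings run in fp16 or bf16 while the mask stays integer-typed. A standalone sketch of what the added guard does, with arbitrary shapes and without reproducing `_merge_input_ids_with_image_features`:

import torch

# Stand-ins for the merged multimodal embeddings and their mask.
inputs_embeds = torch.randn(1, 16, 1024, dtype=torch.float16)
attention_mask = torch.ones(1, 16, dtype=torch.long)

# The guard added by this commit: align the mask dtype with the
# embeddings so downstream arithmetic against half-precision
# activations stays consistent.
if inputs_embeds is not None:
    attention_mask = attention_mask.to(inputs_embeds.dtype)

assert attention_mask.dtype == torch.float16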