Upload folder using huggingface_hub

- config.json +4 -3
- modeling_internlm2.py +182 -0
- modeling_internvl_chat.py +2 -3
config.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "_commit_hash": null,
-  "_name_or_path": "
+  "_name_or_path": "./work_dirs/internvl_chat_internlm2_20b_448_dynamic_chinese_pretrain3/checkpoint-1600_replace_llm",
   "architectures": [
     "InternVLChatModel"
   ],
@@ -11,6 +11,7 @@
   "downsample_ratio": 0.5,
   "dynamic_image_size": true,
   "force_image_size": 448,
+  "image_fold": null,
   "llm_config": {
     "_name_or_path": "pretrained/internlm2-chat-20b/",
     "add_cross_attention": false,
@@ -100,7 +101,7 @@
     "use_cache": false,
     "vocab_size": 92553
   },
-  "max_dynamic_patch":
+  "max_dynamic_patch": 6,
   "min_dynamic_patch": 1,
   "model_type": "internvl_chat",
   "pad2square": false,
@@ -113,7 +114,7 @@
   "use_llm_lora": 0,
   "use_thumbnail": true,
   "vision_config": {
-    "_name_or_path": "
+    "_name_or_path": "work_dirs/internvl_chat_internlm2_20b_448_dynamic_chinese_pretrain/checkpoint-5200-vit",
     "add_cross_attention": false,
     "architectures": [
       "InternVisionModel"
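For reference, the values touched here drive InternVL-style dynamic tiling: an image is cut into a grid of force_image_size x force_image_size (448 x 448) tiles, with the tile count kept between min_dynamic_patch (1) and the new max_dynamic_patch (6), plus one thumbnail tile when use_thumbnail is true. The Python sketch below is only an illustrative approximation, not code from this repository; the helper name and the simple closest-aspect-ratio rule are assumptions.

def count_tiles(width, height, min_num=1, max_num=6, use_thumbnail=True):
    # Illustrative sketch of dynamic tiling driven by the config values above.
    aspect = width / height
    # enumerate every (cols, rows) grid whose tile count lies in [min_num, max_num]
    grids = [(c, r) for c in range(1, max_num + 1) for r in range(1, max_num + 1)
             if min_num <= c * r <= max_num]
    # pick the grid whose aspect ratio is closest to the image's
    cols, rows = min(grids, key=lambda g: abs(g[0] / g[1] - aspect))
    tiles = cols * rows
    # one extra thumbnail tile is appended when the image was actually split
    return tiles + 1 if (use_thumbnail and tiles > 1) else tiles

print(count_tiles(1920, 1080))  # 2x1 grid -> 2 tiles + thumbnail = 3 inputs of 448x448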
modeling_internlm2.py
CHANGED

@@ -39,6 +39,20 @@ try:
     from transformers.generation.streamers import BaseStreamer
 except: # noqa # pylint: disable=bare-except
     BaseStreamer = None
+from typing import Any, List, Optional, Tuple, Union
+import torch.distributed as dist
+import torch.utils.checkpoint
+from peft import LoraConfig, get_peft_model
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
+from transformers.generation.logits_process import LogitsProcessorList
+from transformers.generation.stopping_criteria import StoppingCriteriaList
+from transformers.generation.streamers import BaseStreamer
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import ModelOutput, logging
+from transformers.generation.utils import GreedySearchOutput, validate_stopping_criteria, GreedySearchDecoderOnlyOutput, GreedySearchEncoderDecoderOutput
 
 from .configuration_internlm2 import InternLM2Config
 
@@ -1272,6 +1286,174 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
 
         return consumer()
 
+    def greedy_search(
+        self,
+        input_ids: torch.LongTensor,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[Union[int, List[int]]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_scores: Optional[bool] = None,
+        return_dict_in_generate: Optional[bool] = None,
+        synced_gpus: bool = False,
+        streamer: Optional["BaseStreamer"] = None,
+        **model_kwargs,
+    ) -> Union[GreedySearchOutput, torch.LongTensor]:
+        # init values
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+        if max_length is not None:
+            warnings.warn(
+                "`max_length` is deprecated in this function, use"
+                " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
+                UserWarning,
+            )
+            stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
+        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+        if isinstance(eos_token_id, int):
+            eos_token_id = [eos_token_id]
+        eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
+        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
+        output_attentions = (
+            output_attentions if output_attentions is not None else self.generation_config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
+        )
+        return_dict_in_generate = (
+            return_dict_in_generate
+            if return_dict_in_generate is not None
+            else self.generation_config.return_dict_in_generate
+        )
+
+        # init attention / hidden states / scores tuples
+        scores = () if (return_dict_in_generate and output_scores) else None
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+        if return_dict_in_generate and self.config.is_encoder_decoder:
+            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+            encoder_hidden_states = (
+                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+            )
+
+        # keep track of which sequences are already finished
+        unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
+
+        this_peer_finished = False  # used by synced_gpus only
+        while True:
+            if synced_gpus:
+                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+                # The following logic allows an early break if all peers finished generating their sequence
+                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+                # send 0.0 if we finished, 1.0 otherwise
+                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+                # did all peers finish? the reduced sum will be 0.0 then
+                if this_peer_finished_flag.item() == 0.0:
+                    break
+
+            # prepare model inputs
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+            # forward pass to get next token
+            outputs = self(
+                **model_inputs,
+                return_dict=True,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+            )
+
+            if synced_gpus and this_peer_finished:
+                continue  # don't waste resources running the code we don't need
+
+            next_token_logits = outputs.logits[:, -1, :]
+
+            # pre-process distribution
+            next_tokens_scores = logits_processor(input_ids, next_token_logits)
+
+            # Store scores, attentions and hidden_states when required
+            if return_dict_in_generate:
+                if output_scores:
+                    scores += (next_tokens_scores,)
+                if output_attentions:
+                    decoder_attentions += (
+                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+                    )
+                    if self.config.is_encoder_decoder:
+                        cross_attentions += (outputs.cross_attentions,)
+
+                if output_hidden_states:
+                    decoder_hidden_states += (
+                        (outputs.decoder_hidden_states,)
+                        if self.config.is_encoder_decoder
+                        else (outputs.hidden_states,)
+                    )
+
+            # argmax
+            next_tokens = torch.argmax(next_tokens_scores, dim=-1).to(device=input_ids.device)
+            # finished sentences should have their next token be a padding token
+            if eos_token_id is not None:
+                if pad_token_id is None:
+                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+
+            # update generated ids, model inputs, and length for next step
+            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+            if streamer is not None:
+                streamer.put(next_tokens.cpu())
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+            )
+
+            # if eos_token was found in one sentence, set sentence to finished
+            if eos_token_id_tensor is not None:
+                unfinished_sequences = unfinished_sequences.mul(
+                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
+                )
+
+                # stop when each sentence is finished
+                if unfinished_sequences.max() == 0:
+                    this_peer_finished = True
+
+            # stop if we exceed the maximum length
+            if stopping_criteria(input_ids, scores):
+                this_peer_finished = True
+
+            if this_peer_finished and not synced_gpus:
+                break
+
+        if streamer is not None:
+            streamer.end()
+
+        if return_dict_in_generate:
+            if self.config.is_encoder_decoder:
+                return GreedySearchEncoderDecoderOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+            else:
+                return GreedySearchDecoderOnlyOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+        else:
+            return input_ids
+
 
 # Copied from transformers.model.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2
 @add_start_docstrings(
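The added method re-implements the Hugging Face greedy-search loop inside InternLM2ForCausalLM: each iteration prepares model inputs, runs a forward pass, applies the logits processors, takes the argmax of the last position, appends the token (padding rows that already finished), and updates the unfinished-sequence mask until every row hits EOS or a stopping criterion fires. The usage sketch below is hedged: the checkpoint id is a placeholder, and it assumes a transformers release contemporary with this file, where generate() dispatches to the model's greedy_search override whenever sampling and beam search are disabled.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "internlm/internlm2-chat-20b"  # placeholder checkpoint id; point this at the actual upload
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
).eval()

inputs = tokenizer("Briefly introduce InternLM2.", return_tensors="pt").to(model.device)
with torch.no_grad():
    # do_sample=False with the default num_beams=1 takes the greedy decoding path
    output_ids = model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))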
modeling_internvl_chat.py
CHANGED

@@ -26,7 +26,7 @@ logger = logging.get_logger(__name__)
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
-    _no_split_modules = ['InternVisionEncoderLayer', 'LlamaDecoderLayer'
+    _no_split_modules = ['InternVisionEncoderLayer', 'LlamaDecoderLayer']
 
     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
         super().__init__(config)
@@ -337,7 +337,6 @@ class InternVLChatModel(PreTrainedModel):
                 vit_embeds = visual_features
             else:
                 vit_embeds = self.extract_feature(pixel_values)
-
             input_embeds = self.language_model.get_input_embeddings()(input_ids)
             B, N, C = input_embeds.shape
             input_embeds = input_embeds.reshape(B * N, C)
@@ -345,7 +344,7 @@ class InternVLChatModel(PreTrainedModel):
             input_ids = input_ids.reshape(B * N)
             selected = (input_ids == self.img_context_token_id)
             assert selected.sum() != 0
-            input_embeds[selected] = vit_embeds.reshape(-1, C)
+            input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
 
             input_embeds = input_embeds.reshape(B, N, C)
         else:
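Both changes matter when the checkpoint is sharded with device_map='auto': _no_split_modules tells accelerate which layer classes must stay on a single device, and the added .to(input_embeds.device) copies the vision-tower output onto whichever device holds the text embeddings before it is scattered into the IMG_CONTEXT token positions. A minimal sketch of that scatter follows, with made-up shapes and a hypothetical token id.

import torch

B, N, C = 2, 8, 16                 # batch size, sequence length, hidden size (made up)
IMG_CONTEXT_ID = 999               # hypothetical image-context token id

input_ids = torch.randint(0, 100, (B, N))
input_ids[:, 2:5] = IMG_CONTEXT_ID            # pretend 3 visual positions per sample
input_embeds = torch.randn(B, N, C)           # text embeddings from the LLM
vit_embeds = torch.randn(B * 3, C)            # vision embeddings, possibly on another device

flat = input_embeds.reshape(B * N, C)
selected = (input_ids.reshape(B * N) == IMG_CONTEXT_ID)
assert selected.sum() == vit_embeds.shape[0]
# the .to(...) added in the diff is a no-op on one device and a cross-device copy otherwise
flat[selected] = vit_embeds.reshape(-1, C).to(flat.device)
input_embeds = flat.reshape(B, N, C)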