Upload folder using huggingface_hub
- README.md (+28 -91)
- modeling_intern_vit.py (+6 -12)
README.md CHANGED
@@ -62,6 +62,8 @@ InternVL 2.0 is a multimodal large language model series, featuring models of va
 | MathVista<sub>testmini</sub> | 58.1 | 57.7 | 59.4 | 63.7 |
 | OpenCompass<sub>avg</sub> | 63.5 | 64.4 | 66.4 | 69.7 |

+- For more details and evaluation reproduction, please refer to our [Evaluation Guide](https://internvl.readthedocs.io/en/latest/internvl2.0/evaluation.html).
+
 - We simultaneously use InternVL and VLMEvalKit repositories for model evaluation. Specifically, the results reported for DocVQA, ChartQA, InfoVQA, TextVQA, MME, AI2D, MMBench, CCBench, MMVet, and SEED-Image were tested using the InternVL repository. OCRBench, RealWorldQA, HallBench, and MathVista were evaluated using the VLMEvalKit.

 - For MMMU, we report both the original scores (left side: evaluated using the InternVL codebase for InternVL series models, and sourced from technical reports or webpages for other models) and the VLMEvalKit scores (right side: collected from the OpenCompass leaderboard).
@@ -321,7 +323,7 @@ tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast

 # set the max number of tiles in `max_num`
 pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
-generation_config = dict(max_new_tokens=1024, do_sample=False)
+generation_config = dict(max_new_tokens=1024, do_sample=True)

 # pure-text conversation (纯文本对话)
 question = 'Hello, who are you?'
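For context, the `generation_config` dict touched by this hunk is what the README later passes to the model's `chat` helper. A minimal sketch of that pure-text turn, assuming the `model`, `tokenizer`, and `question` objects set up in the surrounding example (the signature mirrors the chat interface used elsewhere in this model card):

```python
# Sketch only: continues the README snippet above; `model`, `tokenizer`, and
# `question` are assumed to be defined as in the surrounding example.
response, history = model.chat(tokenizer, None, question, generation_config,
                               history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
```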
@@ -473,7 +475,7 @@ for new_text in streamer:

 ## Finetune

-
+Many repositories now support fine-tuning of the InternVL series models, including [InternVL](https://github.com/OpenGVLab/InternVL), [SWIFT](https://github.com/modelscope/ms-swift), [XTurner](https://github.com/InternLM/xtuner), and others. Please refer to their documentation for more details on fine-tuning.

 ## Deployment

@@ -482,7 +484,7 @@ SWIFT from ModelScope community has supported the fine-tuning (Image/Video) of I
 LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by the MMRazor and MMDeploy teams.

 ```sh
-pip install lmdeploy
+pip install lmdeploy==0.5.3
 ```

 LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference pipeline.
@@ -490,16 +492,12 @@ LMDeploy abstracts the complex inference process of multi-modal Vision-Language
 #### A 'Hello, world' example

 ```python
-from lmdeploy import pipeline, TurbomindEngineConfig
+from lmdeploy import pipeline, TurbomindEngineConfig
 from lmdeploy.vl import load_image

 model = 'OpenGVLab/InternVL2-40B'
-system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。'
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
-
-chat_template_config.meta_instruction = system_prompt
-pipe = pipeline(model, chat_template_config=chat_template_config,
-                backend_config=TurbomindEngineConfig(session_len=8192))
+pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))
 response = pipe(('describe this image', image))
 print(response.text)
 ```
@@ -513,16 +511,12 @@ When dealing with multiple images, you can put them all in one list. Keep in min
 > Warning: Due to the scarcity of multi-image conversation data, the performance on multi-image tasks may be unstable, and it may require multiple attempts to achieve satisfactory results.

 ```python
-from lmdeploy import pipeline, TurbomindEngineConfig
+from lmdeploy import pipeline, TurbomindEngineConfig
 from lmdeploy.vl import load_image
 from lmdeploy.vl.constants import IMAGE_TOKEN

 model = 'OpenGVLab/InternVL2-40B'
-
-chat_template_config = ChatTemplateConfig('internvl-zh-hermes2')
-chat_template_config.meta_instruction = system_prompt
-pipe = pipeline(model, chat_template_config=chat_template_config,
-                backend_config=TurbomindEngineConfig(session_len=8192))
+pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))

 image_urls=[
     'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg',
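The hunk above stops inside the `image_urls` list; the call it leads into references each image through an `IMAGE_TOKEN` placeholder. A small sketch under that assumption (the second URL is illustrative; the final `pipe(...)` line matches the one shown in the Chinese section of this diff):

```python
# Sketch only: multi-image prompt composition with IMAGE_TOKEN placeholders.
image_urls = [
    'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg',
    'https://example.org/second-image.jpg',  # illustrative second image
]
images = [load_image(img_url) for img_url in image_urls]
response = pipe((f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe these two images', images))
print(response.text)
```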
@@ -540,15 +534,11 @@ print(response.text)
 Conducting inference with batch prompts is quite straightforward; just place them within a list structure:

 ```python
-from lmdeploy import pipeline, TurbomindEngineConfig
+from lmdeploy import pipeline, TurbomindEngineConfig
 from lmdeploy.vl import load_image

 model = 'OpenGVLab/InternVL2-40B'
-
-chat_template_config = ChatTemplateConfig('internvl-zh-hermes2')
-chat_template_config.meta_instruction = system_prompt
-pipe = pipeline(model, chat_template_config=chat_template_config,
-                backend_config=TurbomindEngineConfig(session_len=8192))
+pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))

 image_urls=[
     "https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg",
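The batched call itself falls outside the hunk; a minimal sketch of the list-of-prompts form the paragraph above describes, assuming the same `(text, image)` prompt tuples used in the single-image example:

```python
# Sketch only: one (text, image) prompt per URL; the pipeline returns one
# response per prompt, in the same order.
prompts = [('describe this image', load_image(img_url)) for img_url in image_urls]
response = pipe(prompts)
print(response)
```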
@@ -564,15 +554,11 @@ print(response)
 There are two ways to do the multi-turn conversations with the pipeline. One is to construct messages according to the format of OpenAI and use above introduced method, the other is to use the `pipeline.chat` interface.

 ```python
-from lmdeploy import pipeline, TurbomindEngineConfig,
+from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
 from lmdeploy.vl import load_image

 model = 'OpenGVLab/InternVL2-40B'
-
-chat_template_config = ChatTemplateConfig('internvl-zh-hermes2')
-chat_template_config.meta_instruction = system_prompt
-pipe = pipeline(model, chat_template_config=chat_template_config,
-                backend_config=TurbomindEngineConfig(session_len=8192))
+pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))

 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg')
 gen_config = GenerationConfig(top_k=40, top_p=0.8, temperature=0.8)
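A short sketch of the `pipeline.chat` route mentioned above, assuming LMDeploy's session-passing chat interface (the follow-up question is illustrative; the hunk itself only shows `gen_config` being built and `sess.response.text` being printed):

```python
# Sketch only: each pipe.chat() call returns a session object that is fed back
# in for the next turn of the conversation.
sess = pipe.chat(('describe this image', image), gen_config=gen_config)
print(sess.response.text)

sess = pipe.chat('What is the woman doing?', session=sess, gen_config=gen_config)
print(sess.response.text)
```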
@@ -584,20 +570,10 @@ print(sess.response.text)

 #### Service

-To deploy InternVL2 as an API, please configure the chat template config first. Create the following JSON file `chat_template.json`.
-
-```json
-{
-    "model_name":"internvl-zh-hermes2",
-    "meta_instruction":"我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。",
-    "stop_words":["<|im_start|>", "<|im_end|>"]
-}
-```
-
 LMDeploy's `api_server` enables models to be easily packed into services with a single command. The provided RESTful APIs are compatible with OpenAI's interfaces. Below are an example of service startup:

 ```shell
-lmdeploy serve api_server OpenGVLab/InternVL2-40B --backend turbomind --server-port 23333
+lmdeploy serve api_server OpenGVLab/InternVL2-40B --backend turbomind --server-port 23333
 ```

 To use the OpenAI-style interface, you need to install OpenAI:
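Once the server is running, the OpenAI-style request that the next hunk's context line (`response = client.chat.completions.create(`) refers to looks roughly like the sketch below; the base URL assumes the `--server-port 23333` used above, and the prompt is illustrative:

```python
# Sketch only: talk to the lmdeploy api_server through the standard openai client.
from openai import OpenAI

client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
model_name = client.models.list().data[0].id
response = client.chat.completions.create(
    model=model_name,
    messages=[{'role': 'user', 'content': 'Describe what InternVL 2.0 can do in one sentence.'}],
    temperature=0.8,
    top_p=0.8)
print(response)
```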
@@ -634,14 +610,6 @@ response = client.chat.completions.create(
 print(response)
 ```

-### vLLM
-
-TODO
-
-### Ollama
-
-TODO
-
 ## License

 This project is released under the MIT license, while InternLM2 is licensed under the Apache-2.0 license.
@@ -714,6 +682,8 @@ InternVL 2.0 是一个多模态大语言模型系列,包含各种规模的模
 | MathVista<sub>testmini</sub> | 58.1 | 57.7 | 59.4 | 63.7 |
 | OpenCompass<sub>avg</sub> | 63.5 | 64.4 | 66.4 | 69.7 |

+- 关于更多的细节以及评测复现,请看我们的[评测指南](https://internvl.readthedocs.io/en/latest/internvl2.0/evaluation.html)。
+
 - 我们同时使用 InternVL 和 VLMEvalKit 仓库进行模型评估。具体来说,DocVQA、ChartQA、InfoVQA、TextVQA、MME、AI2D、MMBench、CCBench、MMVet 和 SEED-Image 的结果是使用 InternVL 仓库测试的。OCRBench、RealWorldQA、HallBench 和 MathVista 是使用 VLMEvalKit 进行评估的。

 - 对于MMMU,我们报告了原始分数(左侧:InternVL系列模型使用InternVL代码库评测,其他模型的分数来自其技术报告或网页)和VLMEvalKit分数(右侧:从OpenCompass排行榜收集)。
@@ -772,7 +742,7 @@ InternVL 2.0 是一个多模态大语言模型系列,包含各种规模的模

 ## 微调

-
+许多仓库现在都支持 InternVL 系列模型的微调,包括 [InternVL](https://github.com/OpenGVLab/InternVL)、[SWIFT](https://github.com/modelscope/ms-swift)、[XTurner](https://github.com/InternLM/xtuner) 等。请参阅它们的文档以获取更多微调细节。

 ## 部署

@@ -781,7 +751,7 @@ InternVL 2.0 是一个多模态大语言模型系列,包含各种规模的模
 LMDeploy 是由 MMRazor 和 MMDeploy 团队开发的用于压缩、部署和服务大语言模型(LLM)的工具包。

 ```sh
-pip install lmdeploy
+pip install lmdeploy==0.5.3
 ```

 LMDeploy 将多模态视觉-语言模型(VLM)的复杂推理过程抽象为一个易于使用的管道,类似于大语言模型(LLM)的推理管道。
@@ -789,16 +759,12 @@ LMDeploy 将多模态视觉-语言模型(VLM)的复杂推理过程抽象为
 #### 一个“你好,世界”示例

 ```python
-from lmdeploy import pipeline, TurbomindEngineConfig
+from lmdeploy import pipeline, TurbomindEngineConfig
 from lmdeploy.vl import load_image

 model = 'OpenGVLab/InternVL2-40B'
-system_prompt = '我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。'
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
-
-chat_template_config.meta_instruction = system_prompt
-pipe = pipeline(model, chat_template_config=chat_template_config,
-                backend_config=TurbomindEngineConfig(session_len=8192))
+pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))
 response = pipe(('describe this image', image))
 print(response.text)
 ```
@@ -810,16 +776,12 @@ print(response.text)
 在处理多张图像时,可以将它们全部放入一个列表中。请注意,多张图像会导致输入 token 数量增加,因此通常需要增加上下文窗口的大小。

 ```python
-from lmdeploy import pipeline, TurbomindEngineConfig
+from lmdeploy import pipeline, TurbomindEngineConfig
 from lmdeploy.vl import load_image
 from lmdeploy.vl.constants import IMAGE_TOKEN

 model = 'OpenGVLab/InternVL2-40B'
-
-chat_template_config = ChatTemplateConfig('internvl-zh-hermes2')
-chat_template_config.meta_instruction = system_prompt
-pipe = pipeline(model, chat_template_config=chat_template_config,
-                backend_config=TurbomindEngineConfig(session_len=8192))
+pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))

 image_urls=[
     'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg',
@@ -827,6 +789,7 @@ image_urls=[
 ]

 images = [load_image(img_url) for img_url in image_urls]
+# Numbering images improves multi-image conversations
 response = pipe((f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe these two images', images))
 print(response.text)
 ```
@@ -836,15 +799,11 @@ print(response.text)
 使用批量Prompt进行推理非常简单;只需将它们放在一个列表结构中:

 ```python
-from lmdeploy import pipeline, TurbomindEngineConfig
+from lmdeploy import pipeline, TurbomindEngineConfig
 from lmdeploy.vl import load_image

 model = 'OpenGVLab/InternVL2-40B'
-
-chat_template_config = ChatTemplateConfig('internvl-zh-hermes2')
-chat_template_config.meta_instruction = system_prompt
-pipe = pipeline(model, chat_template_config=chat_template_config,
-                backend_config=TurbomindEngineConfig(session_len=8192))
+pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))

 image_urls=[
     "https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg",
@@ -860,15 +819,11 @@ print(response)
 使用管道进行多轮对话有两种方法。一种是根据 OpenAI 的格式构建消息并使用上述方法,另一种是使用 `pipeline.chat` 接口。

 ```python
-from lmdeploy import pipeline, TurbomindEngineConfig,
+from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
 from lmdeploy.vl import load_image

 model = 'OpenGVLab/InternVL2-40B'
-
-chat_template_config = ChatTemplateConfig('internvl-zh-hermes2')
-chat_template_config.meta_instruction = system_prompt
-pipe = pipeline(model, chat_template_config=chat_template_config,
-                backend_config=TurbomindEngineConfig(session_len=8192))
+pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))

 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg')
 gen_config = GenerationConfig(top_k=40, top_p=0.8, temperature=0.8)
@@ -880,20 +835,10 @@ print(sess.response.text)

 #### API部署

-为了将InternVL2部署成API,请先配置聊天模板配置文件。创建如下的 JSON 文件 `chat_template.json`。
-
-```json
-{
-    "model_name":"internvl-zh-hermes2",
-    "meta_instruction":"我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。",
-    "stop_words":["<|im_start|>", "<|im_end|>"]
-}
-```
-
 LMDeploy 的 `api_server` 使模型能够通过一个命令轻松打包成服务。提供的 RESTful API 与 OpenAI 的接口兼容。以下是服务启动的示例:

 ```shell
-lmdeploy serve api_server OpenGVLab/InternVL2-40B --backend turbomind --server-port 23333
+lmdeploy serve api_server OpenGVLab/InternVL2-40B --backend turbomind --server-port 23333
 ```

 为了使用OpenAI风格的API接口,您需要安装OpenAI:
@@ -930,14 +875,6 @@ response = client.chat.completions.create(
 print(response)
 ```

-### vLLM
-
-TODO
-
-### Ollama
-
-TODO
-
 ## 开源许可证

 该项目采用 MIT 许可证发布,而 InternLM2 则采用 Apache-2.0 许可证。
modeling_intern_vit.py CHANGED
@@ -20,18 +20,12 @@ from transformers.utils import logging
 from .configuration_intern_vit import InternVisionConfig

 try:
-    try: # v1
-        from flash_attn.flash_attn_interface import \
-            flash_attn_unpadded_qkvpacked_func
-    except: # v2
-        from flash_attn.flash_attn_interface import \
-            flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
-
     from flash_attn.bert_padding import pad_input, unpad_input
-
+    from flash_attn.flash_attn_interface import \
+        flash_attn_varlen_qkvpacked_func
     has_flash_attn = True
 except:
-    print('FlashAttention is not installed.')
+    print('FlashAttention2 is not installed.')
     has_flash_attn = False

 logger = logging.get_logger(__name__)
@@ -74,7 +68,7 @@ class FlashAttention(nn.Module):
             max_s = seqlen
             cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
                                       device=qkv.device)
-            output = flash_attn_unpadded_qkvpacked_func(
+            output = flash_attn_varlen_qkvpacked_func(
                 qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
                 softmax_scale=self.softmax_scale, causal=causal
             )
@@ -84,7 +78,7 @@ class FlashAttention(nn.Module):
             x = rearrange(qkv, 'b s three h d -> b s (three h d)')
             x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
             x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
-            output_unpad = flash_attn_unpadded_qkvpacked_func(
+            output_unpad = flash_attn_varlen_qkvpacked_func(
                 x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
                 softmax_scale=self.softmax_scale, causal=causal
             )
@@ -93,7 +87,7 @@ class FlashAttention(nn.Module):
                                'b s (h d) -> b s h d', h=nheads)
         else:
             assert max_s is not None
-            output = flash_attn_unpadded_qkvpacked_func(
+            output = flash_attn_varlen_qkvpacked_func(
                 qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
                 softmax_scale=self.softmax_scale, causal=causal
             )
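For intuition about the `cu_seqlens` argument passed to `flash_attn_varlen_qkvpacked_func` in these hunks: it is a prefix sum of sequence lengths, so sequence `i` occupies rows `cu_seqlens[i]:cu_seqlens[i+1]` of the packed input. A small, self-contained sketch with illustrative sizes:

```python
import torch

# Same construction as in the hunk above, for a toy batch of 3 sequences of
# length 4: the varlen kernel reads sequence i from rows cu_seqlens[i]:cu_seqlens[i+1].
batch_size, seqlen = 3, 4
cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32)
print(cu_seqlens.tolist())  # [0, 4, 8, 12]
```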
|