xxyyy123 committed
Commit 61692de · 1 Parent(s): 97f967c

update chat_template

Files changed (3)
  1. README.md +85 -38
  2. chat_template.json +1 -1
  3. tokenizer_config.json +1 -1
README.md CHANGED
@@ -48,14 +48,18 @@ Building on these advances, **Ovis2.5-9B** achieves an average score of 78.3 on
 </div>
 
 ## Quick Inference
+
 Below is a simple example demonstrating how to run Ovis2.5 with a single image input.
 
 First, install the required dependencies:
+
 ```bash
 pip install torch==2.4.0 transformers==4.51.3 numpy==1.25.0 pillow==10.3.0 moviepy==1.0.3
 pip install flash-attn==2.7.0.post2 --no-build-isolation
 ```
-Then, run the following code. The thinking and thinking budget logic can be applied in the same way for multi-image, video and pure text scenarios.
+
+Then, run the following code.
+
 ```python
 import torch
 import requests
@@ -63,51 +67,21 @@ from PIL import Image
 from transformers import AutoModelForCausalLM
 
 MODEL_PATH = "AIDC-AI/Ovis2.5-9B"
-# Controls whether to enable thinking mode.
+
+# Thinking mode & budget
 enable_thinking = True
-# NOTE: The thinking budget mechanism is effective only when
-# enable_thinking and enable_thinking_budget are both True.
-# Setting enable_thinking=True and enable_thinking_budget=False
-# enables thinking without budget. In such case the model might
-# spend a lot of tokens in the thinking phase and could be slow.
-enable_thinking_budget = True
-# max_new_tokens is the upper limit for thinking and non-thinking tokens combined.
-# MUST ensure that max_new_tokens > thinking_budget + 25
-# when using the thinking budget mechanism.
+enable_thinking_budget = True  # Only effective if enable_thinking is True.
+
+# Total tokens for thinking + answer. Ensure: max_new_tokens > thinking_budget + 25
 max_new_tokens = 3072
 thinking_budget = 2048
 
-# The implementation of thinking budget involves two-phase generation,
-# which is incompatible with the official transformers TextIteratorStreamer.
-# MUST use this new class for streaming whether thinking budget is used
-# or not. See the commented lines below that involve "streamer" for usage.
-from transformers import TextIteratorStreamer
-class MyTextIteratorStreamer(TextIteratorStreamer):
-    def manual_end(self):
-        """Flushes any remaining cache and prints a newline to stdout."""
-        # Flush the cache, if it exists
-        if len(self.token_cache) > 0:
-            text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
-            printable_text = text[self.print_len :]
-            self.token_cache = []
-            self.print_len = 0
-        else:
-            printable_text = ""
-
-        self.next_tokens_are_prompt = True
-        self.on_finalized_text(printable_text, stream_end=True)
-
-    def end(self):
-        pass
-
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_PATH,
     torch_dtype=torch.bfloat16,
     trust_remote_code=True
 ).cuda()
 
-# streamer = MyTextIteratorStreamer(model.text_tokenizer, skip_prompt=True, skip_special_tokens=True)
-
 messages = [{
     "role": "user",
     "content": [
@@ -133,13 +107,85 @@ outputs = model.generate(
     enable_thinking_budget=enable_thinking_budget,
     max_new_tokens=max_new_tokens,
     thinking_budget=thinking_budget,
-    # streamer=streamer
 )
 
 response = model.text_tokenizer.decode(outputs[0], skip_special_tokens=True)
 print(response)
 ```
 
+The thinking and thinking budget logic can be applied in the same way for multi-image, video and pure text scenarios.
+
+**Note (answer extraction for CoT/Thinking):**
+To make evaluation and usage easier, we recommend appending a fixed suffix to prompts when using chain-of-thought (CoT) or thinking mode. This ensures the model clearly outputs a final answer that can be extracted programmatically:
+
+```
+End your response with 'Final answer: '.
+```
+
+For example:
+
+```
+Calculate the sum of the numbers in the middle box in figure (c).
+End your response with 'Final answer: '.
+```
+
+**Tip:** The sections below include an optional streaming helper (compatible with two-phase thinking/budget runs) and extra inference modes: multi-image, video, and text-only.
+
+<details>
+<summary>Optional: Streaming (Advanced)</summary>
+
+When using the thinking budget (two-phase generation), the default `TextIteratorStreamer` is not compatible. If you need streaming output, use the helper below (recommended for streaming with or without budget).
+
+```python
+# --- Budget-aware streamer helper ---
+from transformers import TextIteratorStreamer
+
+class BudgetAwareTextStreamer(TextIteratorStreamer):
+    """A streamer compatible with Ovis two-phase generation.
+
+    Call .manual_end() after generation to flush any remaining text.
+    """
+    def manual_end(self):
+        if len(self.token_cache) > 0:
+            text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs)
+            printable_text = text[self.print_len:]
+            self.token_cache = []
+            self.print_len = 0
+        else:
+            printable_text = ""
+        self.next_tokens_are_prompt = True
+        self.on_finalized_text(printable_text, stream_end=True)
+
+    # Disable base class's end hook; we'll finalize via manual_end()
+    def end(self):
+        pass
+```
+
+Example usage (replacing the blocking decode in the main demo):
+
+```python
+streamer = BudgetAwareTextStreamer(
+    model.text_tokenizer,
+    skip_prompt=True,
+    skip_special_tokens=True
+)
+
+outputs = model.generate(
+    inputs=input_ids,
+    pixel_values=pixel_values,
+    grid_thws=grid_thws,
+    enable_thinking=enable_thinking,
+    enable_thinking_budget=enable_thinking_budget,
+    max_new_tokens=max_new_tokens,
+    thinking_budget=thinking_budget,
+    streamer=streamer
+)
+```
+
+</details>
+
 <details>
 <summary>Example: Multi-image</summary>
 Demonstrates how to run inference with multiple images and a related question.
@@ -168,6 +214,7 @@ with torch.no_grad():
     pad_token_id=model.text_tokenizer.pad_token_id)
 print(model.text_tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```
+
 </details>
 
 <details>
@@ -176,7 +223,7 @@ Demonstrates how to run inference on a video by sampling multiple frames and ask
 
 ```python
 # Video inference
-from moviepy.editor import VideoFileClip # pip install moviepy==1.0.3
+from moviepy.editor import VideoFileClip # pip install moviepy==1.0.3
 
 video_file = "/path/to/video_1.mp4"
 num_frames = 8
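With the recommended suffix, the final answer can be pulled out of a response with a plain string split; note that the demo defaults already satisfy the stated constraint, since 3072 > 2048 + 25. A minimal sketch (the helper below is illustrative, not part of this repo):

```python
# Illustrative helper: extract the text after the recommended suffix.
def parse_final_answer(response: str) -> str:
    marker = "Final answer:"
    if marker not in response:
        return response.strip()  # model ignored the suffix; fall back
    return response.split(marker)[-1].strip()

print(parse_final_answer("The sum is 6 + 7 + 9 = 22. Final answer: 22"))  # -> 22
```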
chat_template.json CHANGED
@@ -1,3 +1,3 @@
 {
-  "chat_template": "{%- for message in messages %}{{- '<|im_start|>' + message.role + '\n'}}{%- if message.role == 'system' or message.role == 'user' %}{%- if message.content is string %}{{- message.content | replace('<image>', '') | replace('<video>', '') }}{%- else %}{%- for item in message.content %}{%- if item.type == 'text' and 'text' in item %}{{- item.text | replace('<image>', '') | replace('<video>', '') }}{%- elif item.type == 'image' and 'image' in item %}{{- '<image>'}}{%- elif item.type == 'video' and 'video' in item %}{{- '<video>'}}{%- else %}{{- raise_exception('Invalid content type. Supported types for system and user are text, image, video.')}}{%- endif %}{%- if not loop.last %}{{- '\n'}}{%- endif %}{%- endfor %}{%- endif %}{%- elif message.role == 'assistant' %}{%- set content = '' %}{%- if message.content is string %}{%- set content = message.content | replace('<image>', '') | replace('<video>', '') %}{%- else %}{%- for item in message.content %}{%- if item.type == 'text' and 'text' in item %}{%- set content = content ~ (item.text | replace('<image>', '') | replace('<video>', '')) %}{%- else %}{{- raise_exception('Invalid content type. Supported type for assistant is text.')}}{%- endif %}{%- endfor %}{%- endif %}{%- set content = content.split('</think>')[-1].lstrip('\n') %}{{- content }}{%- else %}{{- raise_exception('Invalid role. Supported roles are system, user, assistant.')}}{%- endif %}{{- '<|im_end|>\n'}}{%- endfor %}{%- if add_generation_prompt %}{{- '<|im_start|>assistant\n' }}{%- if enable_thinking is defined and enable_thinking is false %}{{- '<think>\n\n</think>\n\n' }}{%- endif %}{%- endif %}"
+  "chat_template": "{%- for message in messages %}{{- '<|im_start|>' + message.role + '\n'}}{%- if message.role == 'system' or message.role == 'user' %}{%- if message.content is string %}{{- message.content | replace('<image>', '') | replace('<video>', '') }}{%- else %}{%- for item in message.content %}{%- if item.type == 'text' and 'text' in item %}{{- item.text | replace('<image>', '') | replace('<video>', '') }}{%- elif item.type == 'image' %}{{- '<image>'}}{%- elif item.type == 'video' %}{{- '<video>'}}{%- else %}{{- raise_exception('Invalid content type. Supported types for system and user are text, image, video.')}}{%- endif %}{%- if not loop.last %}{{- '\n'}}{%- endif %}{%- endfor %}{%- endif %}{%- elif message.role == 'assistant' %}{%- set content = '' %}{%- if message.content is string %}{%- set content = message.content | replace('<image>', '') | replace('<video>', '') %}{%- else %}{%- for item in message.content %}{%- if item.type == 'text' and 'text' in item %}{%- set content = content ~ (item.text | replace('<image>', '') | replace('<video>', '')) %}{%- else %}{{- raise_exception('Invalid content type. Supported type for assistant is text.')}}{%- endif %}{%- endfor %}{%- endif %}{%- set content = content.split('</think>')[-1].lstrip('\n') %}{{- content }}{%- else %}{{- raise_exception('Invalid role. Supported roles are system, user, assistant.')}}{%- endif %}{{- '<|im_end|>\n'}}{%- endfor %}{%- if add_generation_prompt %}{{- '<|im_start|>assistant\n' }}{%- if enable_thinking is defined and enable_thinking is false %}{{- '<think>\n\n</think>\n\n' }}{%- endif %}{%- endif %}"
 }
tokenizer_config.json CHANGED
@@ -227,7 +227,7 @@
     "<|video_pad|>"
   ],
   "bos_token": null,
-  "chat_template": "{%- for message in messages %}{{- '<|im_start|>' + message.role + '\n'}}{%- if message.role == 'system' or message.role == 'user' %}{%- if message.content is string %}{{- message.content | replace('<image>', '') | replace('<video>', '') }}{%- else %}{%- for item in message.content %}{%- if item.type == 'text' and 'text' in item %}{{- item.text | replace('<image>', '') | replace('<video>', '') }}{%- elif item.type == 'image' and 'image' in item %}{{- '<image>'}}{%- elif item.type == 'video' and 'video' in item %}{{- '<video>'}}{%- else %}{{- raise_exception('Invalid content type. Supported types for system and user are text, image, video.')}}{%- endif %}{%- if not loop.last %}{{- '\n'}}{%- endif %}{%- endfor %}{%- endif %}{%- elif message.role == 'assistant' %}{%- set content = '' %}{%- if message.content is string %}{%- set content = message.content | replace('<image>', '') | replace('<video>', '') %}{%- else %}{%- for item in message.content %}{%- if item.type == 'text' and 'text' in item %}{%- set content = content ~ (item.text | replace('<image>', '') | replace('<video>', '')) %}{%- else %}{{- raise_exception('Invalid content type. Supported type for assistant is text.')}}{%- endif %}{%- endfor %}{%- endif %}{%- set content = content.split('</think>')[-1].lstrip('\n') %}{{- content }}{%- else %}{{- raise_exception('Invalid role. Supported roles are system, user, assistant.')}}{%- endif %}{{- '<|im_end|>\n'}}{%- endfor %}{%- if add_generation_prompt %}{{- '<|im_start|>assistant\n' }}{%- if enable_thinking is defined and enable_thinking is false %}{{- '<think>\n\n</think>\n\n' }}{%- endif %}{%- endif %}",
+  "chat_template": "{%- for message in messages %}{{- '<|im_start|>' + message.role + '\n'}}{%- if message.role == 'system' or message.role == 'user' %}{%- if message.content is string %}{{- message.content | replace('<image>', '') | replace('<video>', '') }}{%- else %}{%- for item in message.content %}{%- if item.type == 'text' and 'text' in item %}{{- item.text | replace('<image>', '') | replace('<video>', '') }}{%- elif item.type == 'image' %}{{- '<image>'}}{%- elif item.type == 'video' %}{{- '<video>'}}{%- else %}{{- raise_exception('Invalid content type. Supported types for system and user are text, image, video.')}}{%- endif %}{%- if not loop.last %}{{- '\n'}}{%- endif %}{%- endfor %}{%- endif %}{%- elif message.role == 'assistant' %}{%- set content = '' %}{%- if message.content is string %}{%- set content = message.content | replace('<image>', '') | replace('<video>', '') %}{%- else %}{%- for item in message.content %}{%- if item.type == 'text' and 'text' in item %}{%- set content = content ~ (item.text | replace('<image>', '') | replace('<video>', '')) %}{%- else %}{{- raise_exception('Invalid content type. Supported type for assistant is text.')}}{%- endif %}{%- endfor %}{%- endif %}{%- set content = content.split('</think>')[-1].lstrip('\n') %}{{- content }}{%- else %}{{- raise_exception('Invalid role. Supported roles are system, user, assistant.')}}{%- endif %}{{- '<|im_end|>\n'}}{%- endfor %}{%- if add_generation_prompt %}{{- '<|im_start|>assistant\n' }}{%- if enable_thinking is defined and enable_thinking is false %}{{- '<think>\n\n</think>\n\n' }}{%- endif %}{%- endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
   "errors": "replace",