Update README.md
Browse files
README.md
CHANGED
|
@@ -47,7 +47,17 @@ SmolDocling is a multimodal Image-Text-to-Text model designed for efficient docu
|
|
| 47 |
|
| 48 |
You can use transformers or docling to perform inference:
|
| 49 |
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
```python
|
| 53 |
import torch
|
|
@@ -93,9 +103,59 @@ generated_texts = processor.batch_decode(
|
|
| 93 |
|
| 94 |
print(generated_texts[0])
|
| 95 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
|
| 101 |
## Supported Instructions
|
|
|
|
| 47 |
|
| 48 |
You can use transformers or docling to perform inference:
|
| 49 |
|
| 50 |
+
<details>
|
| 51 |
+
<summary>Inference using Docling</summary>
|
| 52 |
+
|
| 53 |
+
```python
|
| 54 |
+
|
| 55 |
+
print(generated_texts[0])
|
| 56 |
+
```
|
| 57 |
+
</details>
|
| 58 |
+
|
| 59 |
+
<details>
|
| 60 |
+
<summary>Single image inference using Transformers</summary>
|
| 61 |
|
| 62 |
```python
|
| 63 |
import torch
|
|
|
|
| 103 |
|
| 104 |
print(generated_texts[0])
|
| 105 |
```
|
| 106 |
+
</details>
|
| 107 |
+
|
| 108 |
+
<details>
|
| 109 |
+
<summary>🚀 Fast Batch Inference Using vLLM</summary>
|
| 110 |
+
|
| 111 |
+
```python
|
| 112 |
+
# Install the dependency first (shell command, not Python):
#   pip install vllm

import os
import time

from vllm import LLM, SamplingParams
from PIL import Image

# Configuration
MODEL_PATH = "ds4sd/SmolDocling-256M-preview"
IMAGE_DIR = "images_dir"        # directory containing the page images to convert
OUTPUT_DIR = "output_pred_dir"  # predictions are written here, one file per image
PROMPT_TEXT = "Convert page to Docling."

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Initialize LLM; each prompt carries exactly one image
llm = LLM(model=MODEL_PATH, limit_mm_per_prompt={"image": 1})

# Greedy decoding with a generous budget for full-page transcriptions
sampling_params = SamplingParams(temperature=0.0, max_tokens=8192)

chat_template = f"<|im_start|>User:<image>{PROMPT_TEXT}<end_of_utterance>\nAssistant:"

# Deterministic processing order over the supported image formats
image_files = sorted(
    f for f in os.listdir(IMAGE_DIR)
    if f.lower().endswith((".png", ".jpg", ".jpeg"))
)

start_time = time.time()

for img_file in image_files:
    img_path = os.path.join(IMAGE_DIR, img_file)
    image = Image.open(img_path).convert("RGB")

    llm_input = {"prompt": chat_template, "multi_modal_data": {"image": image}}
    output = llm.generate([llm_input], sampling_params=sampling_params)[0]

    output_text = output.outputs[0].text
    # Mirror the image filename, swapping the extension for .dt
    output_filename = os.path.splitext(img_file)[0] + ".dt"
    output_path = os.path.join(OUTPUT_DIR, output_filename)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(output_text)

print(f"Total time: {time.time() - start_time:.2f} sec")
|
| 157 |
+
```
|
| 158 |
+
</details>
|
| 159 |
|
| 160 |
|
| 161 |
## Supported Instructions
|