Update README.md
Browse files
README.md
CHANGED
|
@@ -38,7 +38,7 @@ pipeline_tag: image-text-to-text
|
|
| 38 |
- 📊 **Better chart recognition 🛠️**
|
| 39 |
- 📚 **One shot multi-page inference ⏱️**
|
| 40 |
|
| 41 |
-
##
|
| 42 |
|
| 43 |
You can use transformers or docling to perform inference:
|
| 44 |
|
|
@@ -115,17 +115,23 @@ print(doc.export_to_markdown())
|
|
| 115 |
<summary> 🚀 Fast Batch Inference Using VLLM</summary>
|
| 116 |
|
| 117 |
```python
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
import time
|
| 121 |
import os
|
| 122 |
from vllm import LLM, SamplingParams
|
| 123 |
from PIL import Image
|
|
|
|
|
|
|
| 124 |
|
| 125 |
# Configuration
|
| 126 |
MODEL_PATH = "ds4sd/SmolDocling-256M-preview"
|
| 127 |
-
IMAGE_DIR = "images_dir"
|
| 128 |
-
|
|
|
|
| 129 |
PROMPT_TEXT = "Convert page to Docling."
|
| 130 |
|
| 131 |
# Ensure output directory exists
|
|
@@ -152,12 +158,29 @@ for idx, img_file in enumerate(image_files, 1):
|
|
| 152 |
llm_input = {"prompt": chat_template, "multi_modal_data": {"image": image}}
|
| 153 |
output = llm.generate([llm_input], sampling_params=sampling_params)[0]
|
| 154 |
|
| 155 |
-
|
| 156 |
-
|
|
|
|
| 157 |
output_path = os.path.join(OUTPUT_DIR, output_filename)
|
| 158 |
|
| 159 |
with open(output_path, "w", encoding="utf-8") as f:
|
| 160 |
-
f.write(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
print(f"Total time: {time.time() - start_time:.2f} sec")
|
| 163 |
```
|
|
@@ -223,5 +246,7 @@ DocTags are integrated with Docling, which allows export to HTML, Markdown, and
|
|
| 223 |
- **Finetuned from model:** Based on [Idefics3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) (see technical summary)
|
| 224 |
|
| 225 |
**Repository:** [Docling](https://github.com/docling-project/docling)
|
| 226 |
-
|
| 227 |
-
**
|
|
|
|
|
|
|
|
|
| 38 |
- 📊 **Better chart recognition 🛠️**
|
| 39 |
- 📚 **One shot multi-page inference ⏱️**
|
| 40 |
|
| 41 |
+
## ⌨️ Get started (code examples)
|
| 42 |
|
| 43 |
You can use transformers or docling to perform inference:
|
| 44 |
|
|
|
|
| 115 |
<summary> 🚀 Fast Batch Inference Using VLLM</summary>
|
| 116 |
|
| 117 |
```python
|
| 118 |
+
# Prerequisites:
|
| 119 |
+
# pip install vllm
|
| 120 |
+
# pip install docling_core
|
| 121 |
+
# Place the page images you want to convert in the img/ directory
|
| 122 |
|
| 123 |
import time
|
| 124 |
import os
|
| 125 |
from vllm import LLM, SamplingParams
|
| 126 |
from PIL import Image
|
| 127 |
+
from docling_core.types.doc import DoclingDocument
|
| 128 |
+
from docling_core.types.doc.document import DocTagsDocument
|
| 129 |
|
| 130 |
# Configuration
|
| 131 |
MODEL_PATH = "ds4sd/SmolDocling-256M-preview"
|
| 132 |
+
# IMAGE_DIR = "images_dir"
|
| 133 |
+
IMAGE_DIR = "img/"
|
| 134 |
+
OUTPUT_DIR = "out/"
|
| 135 |
PROMPT_TEXT = "Convert page to Docling."
|
| 136 |
|
| 137 |
# Ensure output directory exists
|
|
|
|
| 158 |
llm_input = {"prompt": chat_template, "multi_modal_data": {"image": image}}
|
| 159 |
output = llm.generate([llm_input], sampling_params=sampling_params)[0]
|
| 160 |
|
| 161 |
+
doctags = output.outputs[0].text
|
| 162 |
+
img_fn = os.path.splitext(img_file)[0]
|
| 163 |
+
output_filename = img_fn + ".dt"
|
| 164 |
output_path = os.path.join(OUTPUT_DIR, output_filename)
|
| 165 |
|
| 166 |
with open(output_path, "w", encoding="utf-8") as f:
|
| 167 |
+
f.write(doctags)
|
| 168 |
+
|
| 169 |
+
# To convert to Docling Document, MD, HTML, etc.:
|
| 170 |
+
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
|
| 171 |
+
doc = DoclingDocument(name="Document")
|
| 172 |
+
doc.load_from_doctags(doctags_doc)
|
| 173 |
+
# Export to any supported format
|
| 174 |
+
# HTML
|
| 175 |
+
# print(doc.export_to_html())
|
| 176 |
+
# with open(output_file, "w", encoding="utf-8") as f:
|
| 177 |
+
# f.write(doc.export_to_html())
|
| 178 |
+
# MD
|
| 179 |
+
output_filename_md = img_fn + ".md"
|
| 180 |
+
output_path_md = os.path.join(OUTPUT_DIR, output_filename_md)
|
| 181 |
+
markdown = doc.export_to_markdown()
|
| 182 |
+
with open(output_path_md, "w", encoding="utf-8") as f:
|
| 183 |
+
f.write(markdown)
|
| 184 |
|
| 185 |
print(f"Total time: {time.time() - start_time:.2f} sec")
|
| 186 |
```
|
|
|
|
| 246 |
- **Finetuned from model:** Based on [Idefics3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) (see technical summary)
|
| 247 |
|
| 248 |
**Repository:** [Docling](https://github.com/docling-project/docling)
|
| 249 |
+
|
| 250 |
+
**Paper:** [Coming soon]
|
| 251 |
+
|
| 252 |
+
**Demo:** [Coming soon]
|