Update README.md

README.md (CHANGED)

@@ -31,12 +31,11 @@ pipeline_tag: image-text-to-text
 - 🔲 **OCR with Bounding Boxes** – OCR regions using a bounding box.
 - 📂 **General Document Processing** – Trained for both scientific and non-scientific documents.
 - 🔄 **Seamless Docling Integration** – Import into **Docling** and export in multiple formats.
-- 📚 **Multi-Page & Full Document Conversion** – Coming Soon.
-- 🧪 **Chemical Recognition** – Coming Soon.
 
 ### 🚧 *Coming soon!*
 - 📊 **Better chart recognition 🛠️**
 - 📚 **One shot multi-page inference ⏱️**
+- 🧪 **Chemical Recognition**
 
 ## ⌨️ Get started (code examples)
 

@@ -49,6 +48,7 @@ You can use transformers or docling to perform inference:
 # Prerequisites:
 # pip install torch
 # pip install docling_core
+# pip install transformers
 
 import torch
 from docling_core.types.doc import DoclingDocument

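The hunk above adds `transformers` to the prerequisites of the transformers-based example. For reference, a minimal loading sketch follows; the class names (`AutoProcessor`, `AutoModelForVision2Seq`) and the device handling are assumptions based on common transformers usage rather than lines quoted from this diff, and the full example lives in the README section this hunk belongs to.

```python
# Hedged sketch: loading SmolDocling with transformers.
# AutoProcessor / AutoModelForVision2Seq and the dtype choice are assumptions,
# not taken from this diff; see the full README example for the exact code.
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
model = AutoModelForVision2Seq.from_pretrained(
    "ds4sd/SmolDocling-256M-preview",
    torch_dtype=torch.bfloat16,
).to(DEVICE)
```
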
@@ -96,15 +96,14 @@ doctags = processor.batch_decode(
 
 # Populate document
 doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
+print(doctags)
 # create a docling document
 doc = DoclingDocument(name="Document")
 doc.load_from_doctags(doctags_doc)
 
 # export as any format
 # HTML
-#
-# with open(output_file, "w", encoding="utf-8") as f:
-#     f.write(doc.export_to_html())
+# doc.save_as_html(output_file)
 # MD
 print(doc.export_to_markdown())
 ```

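For the export step this hunk rewrites, a small self-contained sketch follows. `doctags_to_markdown` is a hypothetical helper name; the calls inside it (`from_doctags_and_image_pairs`, `load_from_doctags`, `export_to_markdown`, and the commented `save_as_html`) are exactly the ones used in the snippet above.

```python
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument


def doctags_to_markdown(doctags: str, image) -> str:
    """Hypothetical helper: turn one DocTags string plus its page image into Markdown."""
    # Pair the generated DocTags with the page image and build a DoclingDocument.
    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
    doc = DoclingDocument(name="Document")
    doc.load_from_doctags(doctags_doc)
    # HTML output works the same way via doc.save_as_html(<output path>).
    return doc.export_to_markdown()
```
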
@@ -118,7 +117,7 @@ print(doc.export_to_markdown())
 # Prerequisites:
 # pip install vllm
 # pip install docling_core
-# place page images you want to convert into img/ dir
+# place page images you want to convert into "img/" dir
 
 import time
 import os

@@ -129,8 +128,7 @@ from docling_core.types.doc.document import DocTagsDocument
 
 # Configuration
 MODEL_PATH = "ds4sd/SmolDocling-256M-preview"
-
-IMAGE_DIR = "img/"
+IMAGE_DIR = "img/"  # Place your page images here
 OUTPUT_DIR = "out/"
 PROMPT_TEXT = "Convert page to Docling."
 

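The constants above feed the loop over `image_files` shown in the next hunk. How that list is built is outside this diff; a plausible sketch, assuming plain `os` calls, an extension filter, and that `out/` may not exist yet, would be:

```python
import os

IMAGE_DIR = "img/"   # Place your page images here
OUTPUT_DIR = "out/"

# Collect page images from IMAGE_DIR (the extension filter is an assumption).
image_files = sorted(
    f for f in os.listdir(IMAGE_DIR)
    if f.lower().endswith((".png", ".jpg", ".jpeg"))
)

# Make sure the output directory exists before the loop writes Markdown files into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)
```
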
@@ -172,15 +170,11 @@ for idx, img_file in enumerate(image_files, 1):
     doc.load_from_doctags(doctags_doc)
     # export as any format
     # HTML
-    #
-    # with open(output_file, "w", encoding="utf-8") as f:
-    #     f.write(doc.export_to_html())
+    # doc.save_as_html(output_file)
     # MD
     output_filename_md = img_fn + ".md"
     output_path_md = os.path.join(OUTPUT_DIR, output_filename_md)
-
-    with open(output_path_md, "w", encoding="utf-8") as f:
-        f.write(markdown)
+    doc.save_as_markdown(output_path_md)
 
 print(f"Total time: {time.time() - start_time:.2f} sec")
 ```

@@ -198,42 +192,49 @@ DocTags are integrated with Docling, which allows export to HTML, Markdown, and
 <tr>
 <td><b>Description</b></td>
 <td><b>Instruction</b></td>
+<td><b>Comment</b></td>
 </tr>
 <tr>
 <td>Full conversion</td>
 <td>Convert this page to docling.</td>
+<td></td>
 </tr>
 <tr>
 <td>Chart</td>
-<td>Convert chart to table
+<td>Convert chart to table.</td>
+<td>(e.g., <chart>)</td>
 </tr>
 <tr>
 <td>Formula</td>
-<td>Convert formula to LaTeX
+<td>Convert formula to LaTeX.</td>
+<td>(e.g., <formula>)</td>
 </tr>
 <tr>
 <td>Code</td>
-<td>Convert code to text
+<td>Convert code to text.</td>
+<td>(e.g., <code>)</td>
 </tr>
 <tr>
 <td>Table</td>
-<td>Convert table to OTSL
+<td>Convert table to OTSL.</td>
+<td>(e.g., <otsl>) OTSL: <a href="https://arxiv.org/pdf/2305.03393">Lysak et al., 2023</a></td>
 </tr>
 <tr>
-<td>No-Code Actions/Pipelines</td>
+<td rowspan=4>No-Code Actions/Pipelines</td>
 <td>OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237></td>
+<td></td>
 </tr>
 <tr>
-<td></td>
 <td>Identify element at: <loc_247><loc_482><10c_252><loc_486></td>
+<td></td>
 </tr>
 <tr>
-<td></td>
 <td>Find all 'text' elements on the page, retrieve all section headers.</td>
+<td></td>
 </tr>
 <tr>
-<td></td>
 <td>Detect footer elements on the page.</td>
+<td></td>
 </tr>
 </table>
 

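Each instruction in the table is used as the user prompt for the same inference code shown earlier, so switching tasks only means changing the prompt text. A hedged sketch follows; the chat-message structure and the `apply_chat_template` call mirror common SmolVLM-style usage and are assumptions rather than lines from this diff, and `processor` is the processor loaded in the earlier example.

```python
# Any instruction from the table can stand in for the full-page prompt, e.g.:
PROMPT_TEXT = "Convert formula to LaTeX."
# PROMPT_TEXT = "Convert table to OTSL."
# PROMPT_TEXT = "OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237>"

# Assumed SmolVLM-style chat structure; the full README shows the exact call sequence.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": PROMPT_TEXT},
        ],
    }
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
```
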