---
license: gemma
---

## Usage
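
The image and text encoders are exported as two separate ONNX models. The snippet below embeds a batch of page images and text queries with ONNX Runtime, then scores the queries against the images with the processor's `score_retrieval`.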

```python
import os

import onnxruntime as ort
import torch
from huggingface_hub import snapshot_download
from PIL import Image
from transformers import ColPaliProcessor


MODEL_IMAGE_PATH = "ssonpull519/colpali-v1.3-hf-image-onnx-fp16"
MODEL_TEXT_PATH = "ssonpull519/colpali-v1.3-hf-text-onnx-fp16"

processor = ColPaliProcessor.from_pretrained(MODEL_IMAGE_PATH)

# Your inputs
images = [
    Image.open("image1.png"),
    Image.open("image2.png"),
]
queries = [
    "Who printed the edition of Romeo and Juliet?",
    "When was the United States Declaration of Independence proclaimed?",
]

# Process the inputs
batch_images = processor(images=images, return_tensors="pt")    # ['input_ids', 'attention_mask', 'pixel_values']; (B, 1030) and (B, 3, 448, 448); input_ids are mostly <image> tokens plus a text prefix
batch_queries = processor(text=queries, return_tensors="pt")    # ['input_ids', 'attention_mask']; (B, S)

# Convert the inputs to numpy arrays for ONNX Runtime (it consumes CPU tensors,
# so there is no need to move the batches to the GPU first)
inputs_images_onnx = {name: tensor.numpy() for name, tensor in batch_images.items()}
inputs_queries_onnx = {name: tensor.numpy() for name, tensor in batch_queries.items()}

# Create the ONNX Runtime sessions, downloading the model repos from the Hub first.
# Unavailable providers are ignored, so with onnxruntime-gpu installed CUDA is used.
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
sess_image = ort.InferenceSession(os.path.join(snapshot_download(MODEL_IMAGE_PATH), "model.onnx"), providers=providers)
sess_text = ort.InferenceSession(os.path.join(snapshot_download(MODEL_TEXT_PATH), "model.onnx"), providers=providers)

onnx_output_images = sess_image.run(None, inputs_images_onnx)
onnx_output_queries = sess_text.run(None, inputs_queries_onnx)

# Score the queries against the images (MaxSim: pairwise token similarities (Bt, Bi, S, 1030) reduced to scores (Bt, Bi))
scores = processor.score_retrieval(torch.Tensor(onnx_output_queries[0]), torch.Tensor(onnx_output_images[0]))

print("onnx_output size [images]:", onnx_output_images[0].shape)
print("onnx_output size [queries]:", onnx_output_queries[0].shape)

print("scores:")
print(scores)
```
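
`score_retrieval` returns a `(num_queries, num_images)` score matrix, so the row-wise argmax picks the best-matching image for each query. A minimal follow-up sketch, continuing from the variables above:

```python
# For each query, select the image with the highest late-interaction score
best_image_idx = scores.argmax(dim=1)

for q, idx in enumerate(best_image_idx.tolist()):
    print(f"query {q} -> best image {idx} (score: {scores[q, idx].item():.3f})")
```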

## ONNX Conversion Script

Support for this export currently lives in an open [pull request](https://github.com/huggingface/optimum/pull/2251) against optimum that has not been merged yet, so you will need to install optimum from that branch to run the command below.

```bash
optimum-cli export onnx --model vidore/colpali-v1.3-hf ./onnx_output --task feature-extraction --variant text --dtype fp16
```

For fp16, there is a known issue in transformers that has not been fixed yet, so use the Python script below instead of the CLI.

```python
from pathlib import Path

import onnx
import torch
from optimum.exporters import TasksManager
from optimum.exporters.onnx import export, validate_model_outputs
from transformers import ColPaliForRetrieval


MODEL_PATH = "vidore/colpali-v1.3-hf"
VARIANT = "text"                        # one of "vision" or "text"
ONNX_PATH = f"onnx/{VARIANT}/model.onnx"
MODEL_DTYPE = torch.float16             # one of torch.float32 or torch.float16

base_model = ColPaliForRetrieval.from_pretrained(MODEL_PATH)
base_model = base_model.to(dtype=MODEL_DTYPE)

onnx_path = Path(ONNX_PATH)

onnx_config_constructor = TasksManager.get_exporter_config_constructor("onnx", base_model)
onnx_config = onnx_config_constructor(base_model.config)
onnx_config.variant = VARIANT
onnx_inputs, onnx_outputs = export(base_model, onnx_config, onnx_path, onnx_config.DEFAULT_ONNX_OPSET)

# -- validate the exported model --
onnx.checker.check_model(ONNX_PATH)

validate_model_outputs(
    onnx_config, base_model, onnx_path, ["embeddings"], onnx_config.ATOL_FOR_VALIDATION, use_subprocess=False
)
```
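
Run the script once with `VARIANT = "vision"` and once with `VARIANT = "text"` to produce both ONNX models used in the Usage example above.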