---
base_model:
- BAAI/bge-m3
pipeline_tag: feature-extraction
tags:
- bge-m3
- onnx
---

					
					
						
Based on `aapot/bge-m3-onnx` and `philipchung/bge-m3-onnx`

All three vectors (dense, sparse and colbert) are supported.

					
					
						
## Deploy with tritonserver

- Folder structure

```
.
├── model_repository
│   └── bge-m3
│       ├── 1
│       │   ├── model.onnx
│       │   └── model.onnx.data
│       └── config.pbtxt
```
					
						

- `config.pbtxt` file

```
name: "bge-m3"
backend: "onnxruntime"
max_batch_size : 4

input [
  {
    name: "input_ids"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "attention_mask"
    data_type: TYPE_INT64
    dims: [ -1 ]
  }
]

output [
  {
    name: "dense_vecs"
    data_type: TYPE_FP32
    dims: [ 1024 ]
  },
  {
    name: "sparse_vecs"
    data_type: TYPE_FP32
    dims: [ -1, 1 ]
  },
  {
    name: "colbert_vecs"
    data_type: TYPE_FP32
    dims: [ -1, 1024 ]
  }
]

```
					
						

- Run with tritonserver docker image

```bash
docker run --gpus all --rm -p 8000:8000 -p 8001:8001 -p 8002:8002 -v ./model_repository:/models nvcr.io/nvidia/tritonserver:24.12-py3 tritonserver --model-repository=/models
```
					
						

- Infer with `tritonclient`

```python
					
						
						| 
							 | 
						from typing import List | 
					
					
						
						| 
							 | 
						from tritonclient.http import InferenceServerClient, InferInput | 
					
					
						
						| 
							 | 
						from datasets import load_dataset | 
					
					
						
						| 
							 | 
						from transformers import AutoTokenizer | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						BS = 4 | 
					
					
						
						| 
							 | 
						TOKENIZER_NAME = "BAAI/bge-m3" | 
					
					
						
						| 
							 | 
						TRITON_MODEL_NAME = "bge-m3" | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) | 
					
					
						
						| 
							 | 
						data: List[str] = [x["text"] for x in load_dataset("BeiR/scidocs", "corpus")["corpus"]] | 
					
					
						
						| 
							 | 
						batch = data[:BS] | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						client = InferenceServerClient("localhost:8000") | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						tokenized = tokenizer(batch, padding=True, truncation=True, return_tensors="np") | 
					
					
						
						| 
							 | 
						input_ids, attention_mask = tokenized.input_ids, tokenized.attention_mask | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						inputs = [ | 
					
					
						
						| 
							 | 
						    InferInput("input_ids", [len(batch), len(input_ids[0])], "INT64"), | 
					
					
						
						| 
							 | 
						    InferInput("attention_mask", [len(batch), len(attention_mask[0])], "INT64"), | 
					
					
						
						| 
							 | 
						] | 
					
					
						
						| 
							 | 
						inputs[0].set_data_from_numpy(input_ids) | 
					
					
						
						| 
							 | 
						inputs[1].set_data_from_numpy(attention_mask) | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						results = client.infer(TRITON_MODEL_NAME, inputs) | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						dense_vecs = results.as_numpy("dense_vecs") | 
					
					
						
						| 
							 | 
						sparse_vecs = results.as_numpy("sparse_vecs").squeeze(-1) | 
					
					
						
						| 
							 | 
						colbert_vecs = results.as_numpy("colbert_vecs").squeeze(-1) | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						output = { | 
					
					
						
						| 
							 | 
						    "dense_vecs": dense_vecs.tolist(), | 
					
					
						
						| 
							 | 
						    "sparse_vecs": sparse_vecs.tolist(), | 
					
					
						
						| 
							 | 
						    "colbert_vecs": colbert_vecs.tolist(), | 
					
					
						
						| 
							 | 
						} | 
					
					
						
						| 
							 | 
						print(output) | 
					
					
						
						| 
							 | 
						 | 
					
					
						
						| 
							 | 
						``` |