Update README.md
README.md
CHANGED
@@ -115,8 +115,8 @@ def mean_pooling(model_output, attention_mask):
     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
 sentences = [
-    '
-    '
+    'How do I access the index while iterating over a sequence with a for loop?',
+    '# Use the built-in enumerator\nfor idx, x in enumerate(xs):\n    print(idx, x)',
 ]
 
 tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-code')
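For orientation, only fragments of the README's transformers example are visible as context in the hunk above (the `mean_pooling` return line, the `sentences` list, the tokenizer, and the `cos_sim` lambda named in the hunk headers). Below is a minimal sketch of how those pieces typically fit together; the lines between the hunks are not part of this diff, so details such as the `encoded_input` variable and the body of `mean_pooling` before its return statement are assumptions rather than the file's exact content.

```python
import torch
from numpy.linalg import norm
from transformers import AutoModel, AutoTokenizer

# Cosine similarity helper named in the second hunk header.
cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))

def mean_pooling(model_output, attention_mask):
    # Average token embeddings, ignoring padding positions (matches the
    # return statement shown in the diff context).
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

sentences = [
    'How do I access the index while iterating over a sequence with a for loop?',
    '# Use the built-in enumerator\nfor idx, x in enumerate(xs):\n    print(idx, x)',
]

tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-code')
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)

encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    model_output = model(**encoded_input)

embeddings = mean_pooling(model_output, encoded_input['attention_mask']).numpy()
print(cos_sim(embeddings[0], embeddings[1]))
```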
@@ -144,12 +144,12 @@ cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
 model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)
 embeddings = model.encode(
     [
-        '
-        '
+        'How do I access the index while iterating over a sequence with a for loop?',
+        '# Use the built-in enumerator\nfor idx, x in enumerate(xs):\n    print(idx, x)',
     ]
 )
 print(cos_sim(embeddings[0], embeddings[1]))
->>> 0.
+>>> tensor([[0.7282]])
 ```
 
 If you only want to handle shorter sequences, such as 2k, pass the `max_length` parameter to the `encode` function:
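The README's snippet for that `max_length` call is only partially visible in this diff (the next hunk shows just its closing lines), so the following is a sketch of the shape the call takes, under the assumption stated in the sentence above that `max_length` is forwarded to `encode`; the input string is a placeholder.

```python
# Hypothetical illustration: cap the effective sequence length at ~2k tokens.
embeddings = model.encode(
    ['How do I access the index while iterating over a sequence with a for loop?'],
    max_length=2048,
)
```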
@@ -161,6 +161,28 @@ embeddings = model.encode(
 )
 ```
 
+Using its latest release (v2.3.0), sentence-transformers also supports Jina embeddings (please make sure that you are logged in to Hugging Face as well):
+
+```python
+!pip install -U sentence-transformers
+from sentence_transformers import SentenceTransformer
+from sentence_transformers.util import cos_sim
+
+model = SentenceTransformer(
+    "jinaai/jina-embeddings-v2-base-code",
+    trust_remote_code=True
+)
+
+# control your input sequence length up to 8192
+model.max_seq_length = 1024
+
+embeddings = model.encode([
+    'How do I access the index while iterating over a sequence with a for loop?',
+    '# Use the built-in enumerator\nfor idx, x in enumerate(xs):\n    print(idx, x)',
+])
+print(cos_sim(embeddings[0], embeddings[1]))
+```
+
 ## Plans
 
 1. Bilingual embedding models supporting more European & Asian languages, including Spanish, French, Italian and Japanese.
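Staying with the sentence-transformers path added in the last hunk, the same `encode` and `cos_sim` pair extends naturally to a small retrieval-style check. The extra candidate strings below are invented for the illustration; only the model id, `trust_remote_code`, `max_seq_length`, `encode`, and `cos_sim` come from the snippet above.

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

model = SentenceTransformer("jinaai/jina-embeddings-v2-base-code", trust_remote_code=True)
model.max_seq_length = 1024

query = "How do I access the index while iterating over a sequence with a for loop?"
candidates = [
    "# Use the built-in enumerator\nfor idx, x in enumerate(xs):\n    print(idx, x)",
    "def fib(n):\n    return n if n < 2 else fib(n - 1) + fib(n - 2)",
    "SELECT name FROM users WHERE age > 30;",
]

# cos_sim accepts 1-D and 2-D inputs and returns a (1, len(candidates)) tensor.
scores = cos_sim(model.encode(query), model.encode(candidates))[0]
for score, snippet in sorted(zip(scores.tolist(), candidates), reverse=True):
    print(f"{score:.4f}  {snippet.splitlines()[0]}")
```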