Update README.md
README.md (CHANGED)
````diff
@@ -111,10 +111,13 @@ def mean_pooling(model_output, attention_mask):
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
-sentences = […]
-tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-…
-model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-…
+sentences = [
+    'Save model to a pickle located at `path`',
+    'def save_act(self, path=None): if path is None: path = os.path.join(logger.get_dir(), "model.pkl") with tempfile.TemporaryDirectory() as td: save_variables(os.path.join(td, "model")) arc_name = os.path.join(td, "packed.zip") with zipfile.ZipFile(arc_name, "w") as zipf: for root, dirs, files in os.walk(td): for fname in files: file_path = os.path.join(root, fname) if file_path != arc_name: zipf.write(file_path, os.path.relpath(file_path, td)) with open(arc_name, "rb") as f: model_data = f.read() with open(path, "wb") as f: cloudpickle.dump((model_data, self._act_params), f)',
+]
+
+tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-code')
+model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)
 
 encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
````
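For readers following along: this hunk ends right after tokenization, and `encoded_input` is only consumed further down in the README. A minimal sketch of the usual continuation, assuming the standard `transformers` flow (the forward pass and the L2 normalization are illustrative assumptions, not part of this diff):

```python
import torch
import torch.nn.functional as F

# Forward pass over the tokenized batch produced above.
with torch.no_grad():
    model_output = model(**encoded_input)

# Pool token embeddings with the mean_pooling helper shown in the hunk,
# then L2-normalize so cosine similarity reduces to a dot product.
embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
embeddings = F.normalize(embeddings, p=2, dim=1)
```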
````diff
@@ -135,16 +138,20 @@ from transformers import AutoModel
 from numpy.linalg import norm
 
 cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
-model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)
-embeddings = model.encode(
-…
+model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)
+embeddings = model.encode(
+    [
+        'Save model to a pickle located at `path`',
+        'def save_act(self, path=None): if path is None: path = os.path.join(logger.get_dir(), "model.pkl") with tempfile.TemporaryDirectory() as td: save_variables(os.path.join(td, "model")) arc_name = os.path.join(td, "packed.zip") with zipfile.ZipFile(arc_name, "w") as zipf: for root, dirs, files in os.walk(td): for fname in files: file_path = os.path.join(root, fname) if file_path != arc_name: zipf.write(file_path, os.path.relpath(file_path, td)) with open(arc_name, "rb") as f: model_data = f.read() with open(path, "wb") as f: cloudpickle.dump((model_data, self._act_params), f)',
+    ]
+)
 ```
 
 If you only want to handle shorter sequences, such as 2k, pass the `max_length` parameter to the `encode` function:
 
 ```python
 embeddings = model.encode(
-    'Very long …
+    ['Very long ... code'],
     max_length=2048
 )
 ```
````
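Here `encode` comes from the model's custom code loaded via `trust_remote_code=True`, and the `max_length=2048` variant presumably caps tokenization at 2k tokens instead of the model's full 8192-token window. A hypothetical follow-up to the example above, reusing the snippet's variables (the qualitative expectation is an assumption, not a measured result):

```python
# Compare the natural-language description (index 0) against the
# code snippet (index 1) using the cos_sim lambda defined above.
score = cos_sim(embeddings[0], embeddings[1])
print(score)  # a matching docstring/code pair should score well above an unrelated one
```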