Update README.md
README.md (CHANGED)
````diff
@@ -111,10 +111,13 @@ def mean_pooling(model_output, attention_mask):
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
-sentences = […]
-tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-…
-model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-…
+sentences = [
+    'Save model to a pickle located at `path`',
+    'def save_act(self, path=None): if path is None: path = os.path.join(logger.get_dir(), "model.pkl") with tempfile.TemporaryDirectory() as td: save_variables(os.path.join(td, "model")) arc_name = os.path.join(td, "packed.zip") with zipfile.ZipFile(arc_name, "w") as zipf: for root, dirs, files in os.walk(td): for fname in files: file_path = os.path.join(root, fname) if file_path != arc_name: zipf.write(file_path, os.path.relpath(file_path, td)) with open(arc_name, "rb") as f: model_data = f.read() with open(path, "wb") as f: cloudpickle.dump((model_data, self._act_params), f)',
+]
+
+tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-code')
+model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)
 
 encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
````
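For readers following along: this hunk ends right after tokenization, and `encoded_input` is only consumed further down in the README. A minimal sketch of the usual continuation, assuming the standard `transformers` flow (the forward pass and the L2 normalization are illustrative assumptions, not part of this diff):

```python
import torch
import torch.nn.functional as F

# Forward pass over the tokenized batch produced above.
with torch.no_grad():
    model_output = model(**encoded_input)

# Pool token embeddings with the mean_pooling helper shown in the hunk,
# then L2-normalize so cosine similarity reduces to a dot product.
embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
embeddings = F.normalize(embeddings, p=2, dim=1)
```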
````diff
@@ -135,16 +138,20 @@ from transformers import AutoModel
 from numpy.linalg import norm
 
 cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
-model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)
-embeddings = model.encode(
-…
+model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)
+embeddings = model.encode(
+    [
+        'Save model to a pickle located at `path`',
+        'def save_act(self, path=None): if path is None: path = os.path.join(logger.get_dir(), "model.pkl") with tempfile.TemporaryDirectory() as td: save_variables(os.path.join(td, "model")) arc_name = os.path.join(td, "packed.zip") with zipfile.ZipFile(arc_name, "w") as zipf: for root, dirs, files in os.walk(td): for fname in files: file_path = os.path.join(root, fname) if file_path != arc_name: zipf.write(file_path, os.path.relpath(file_path, td)) with open(arc_name, "rb") as f: model_data = f.read() with open(path, "wb") as f: cloudpickle.dump((model_data, self._act_params), f)',
+    ]
+)
 ```
 
 If you only want to handle shorter sequences, such as 2k, pass the `max_length` parameter to the `encode` function:
 
 ```python
 embeddings = model.encode(
-    'Very long …
+    ['Very long ... code'],
     max_length=2048
 )
 ```
````
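Here `encode` comes from the model's custom code loaded via `trust_remote_code=True`, and the `max_length=2048` variant presumably caps tokenization at 2k tokens instead of the model's full 8192-token window. A hypothetical follow-up to the example above, reusing the snippet's variables (the qualitative expectation is an assumption, not a measured result):

```python
# Compare the natural-language description (index 0) against the
# code snippet (index 1) using the cos_sim lambda defined above.
score = cos_sim(embeddings[0], embeddings[1])
print(score)  # a matching docstring/code pair should score well above an unrelated one
```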