rjurney committed
Commit b93719a · unverified · 1 parent: 2ca55d2

Remove all files before copying the archive over this repo and adding it via LFS
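For context, the wipe-and-replace this commit message describes can be done in a single commit with the `huggingface_hub` client. A minimal sketch, assuming a placeholder repo id and local archive path (neither appears in this commit); large binaries such as `model.safetensors` are stored via LFS automatically:

```python
from huggingface_hub import HfApi

api = HfApi()

# Wipe-and-replace in one commit: delete everything currently in the repo,
# then upload the unpacked archive contents in its place.
api.upload_folder(
    repo_id="rjurney/some-model",        # placeholder repo id, not from this commit
    folder_path="./unpacked-archive",    # placeholder local path to the archive contents
    commit_message="Replace repo contents with archive",
    delete_patterns="*",                 # remove all existing files first
)
```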

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. checkpoint-1000/1_Pooling/config.json +0 -10
  2. checkpoint-1000/README.md +0 -466
  3. checkpoint-1000/config.json +0 -25
  4. checkpoint-1000/config_sentence_transformers.json +0 -10
  5. checkpoint-1000/model.safetensors +0 -3
  6. checkpoint-1000/modules.json +0 -14
  7. checkpoint-1000/optimizer.pt +0 -3
  8. checkpoint-1000/rng_state.pth +0 -3
  9. checkpoint-1000/scaler.pt +0 -3
  10. checkpoint-1000/scheduler.pt +0 -3
  11. checkpoint-1000/sentence_bert_config.json +0 -4
  12. checkpoint-1000/special_tokens_map.json +0 -51
  13. checkpoint-1000/tokenizer.json +0 -3
  14. checkpoint-1000/tokenizer_config.json +0 -65
  15. checkpoint-1000/trainer_state.json +0 -217
  16. checkpoint-1000/training_args.bin +0 -3
  17. checkpoint-1000/unigram.json +0 -3
  18. checkpoint-1060/1_Pooling/config.json +0 -10
  19. checkpoint-1060/README.md +0 -466
  20. checkpoint-1060/config.json +0 -25
  21. checkpoint-1060/config_sentence_transformers.json +0 -10
  22. checkpoint-1060/model.safetensors +0 -3
  23. checkpoint-1060/modules.json +0 -14
  24. checkpoint-1060/optimizer.pt +0 -3
  25. checkpoint-1060/rng_state.pth +0 -3
  26. checkpoint-1060/scaler.pt +0 -3
  27. checkpoint-1060/scheduler.pt +0 -3
  28. checkpoint-1060/sentence_bert_config.json +0 -4
  29. checkpoint-1060/special_tokens_map.json +0 -51
  30. checkpoint-1060/tokenizer.json +0 -3
  31. checkpoint-1060/tokenizer_config.json +0 -65
  32. checkpoint-1060/trainer_state.json +0 -217
  33. checkpoint-1060/training_args.bin +0 -3
  34. checkpoint-1060/unigram.json +0 -3
  35. checkpoint-700/1_Pooling/config.json +0 -10
  36. checkpoint-700/README.md +0 -463
  37. checkpoint-700/config.json +0 -25
  38. checkpoint-700/config_sentence_transformers.json +0 -10
  39. checkpoint-700/model.safetensors +0 -3
  40. checkpoint-700/modules.json +0 -14
  41. checkpoint-700/optimizer.pt +0 -3
  42. checkpoint-700/rng_state.pth +0 -3
  43. checkpoint-700/scaler.pt +0 -3
  44. checkpoint-700/scheduler.pt +0 -3
  45. checkpoint-700/sentence_bert_config.json +0 -4
  46. checkpoint-700/special_tokens_map.json +0 -51
  47. checkpoint-700/tokenizer.json +0 -3
  48. checkpoint-700/tokenizer_config.json +0 -65
  49. checkpoint-700/trainer_state.json +0 -162
  50. checkpoint-700/training_args.bin +0 -3
checkpoint-1000/1_Pooling/config.json DELETED
@@ -1,10 +0,0 @@
- {
-   "word_embedding_dimension": 384,
-   "pooling_mode_cls_token": false,
-   "pooling_mode_mean_tokens": true,
-   "pooling_mode_max_tokens": false,
-   "pooling_mode_mean_sqrt_len_tokens": false,
-   "pooling_mode_weightedmean_tokens": false,
-   "pooling_mode_lasttoken": false,
-   "include_prompt": true
- }
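The pooling config above enables only `pooling_mode_mean_tokens`. A minimal PyTorch sketch of what that `1_Pooling` module computes, on made-up tensors rather than anything from this commit:

```python
import torch

# Mean pooling as configured above: average the token embeddings,
# ignoring padding positions via the attention mask.
token_embeddings = torch.randn(2, 128, 384)  # (batch, seq_len, word_embedding_dimension)
attention_mask = torch.ones(2, 128)          # 1 = real token, 0 = padding

mask = attention_mask.unsqueeze(-1)          # (batch, seq_len, 1)
sentence_embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
print(sentence_embeddings.shape)             # torch.Size([2, 384])
```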
 
checkpoint-1000/README.md DELETED
@@ -1,466 +0,0 @@
- ---
- language:
- - en
- license: apache-2.0
- tags:
- - sentence-transformers
- - sentence-similarity
- - feature-extraction
- - generated_from_trainer
- - dataset_size:2130620
- - loss:ContrastiveLoss
- base_model: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
- widget:
- - source_sentence: مانوئلا دی سنتا
-   sentences:
-   - Renko Kitagawa
-   - هانس هيرمان وير
-   - Ди Чента, Мануэла
- - source_sentence: يورى جافريلوف
-   sentences:
-   - Wiktor Pinczuk
-   - Natalia Germanovna DIRKS
-   - Світлана Євгенівна Савицька
- - source_sentence: Џуди Колинс
-   sentences:
-   - Collins
-   - Aisha Muhammed Abdul Salam
-   - Phonic Boy On Dope
- - source_sentence: ויליאם בלייר
-   sentences:
-   - The Hon. Mr Justice Blair
-   - Queen Ingrid of Denmark
-   - Herman van Rompuy
- - source_sentence: Saif al-Arab GADAFI
-   sentences:
-   - Максім Недасекаў
-   - Mervyn Allister King
-   - Paul d. scully-power
- pipeline_tag: sentence-similarity
- library_name: sentence-transformers
- metrics:
- - cosine_accuracy
- - cosine_accuracy_threshold
- - cosine_f1
- - cosine_f1_threshold
- - cosine_precision
- - cosine_recall
- - cosine_ap
- - cosine_mcc
- model-index:
- - name: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2-address-matcher-original
-   results:
-   - task:
-       type: binary-classification
-       name: Binary Classification
-     dataset:
-       name: sentence transformers paraphrase multilingual MiniLM L12 v2
-       type: sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2
-     metrics:
-     - type: cosine_accuracy
-       value: 0.9905380542935456
-       name: Cosine Accuracy
-     - type: cosine_accuracy_threshold
-       value: 0.6790644526481628
-       name: Cosine Accuracy Threshold
-     - type: cosine_f1
-       value: 0.9856131536880567
-       name: Cosine F1
-     - type: cosine_f1_threshold
-       value: 0.6790644526481628
-       name: Cosine F1 Threshold
-     - type: cosine_precision
-       value: 0.9816899806664392
-       name: Cosine Precision
-     - type: cosine_recall
-       value: 0.9895678092399404
-       name: Cosine Recall
-     - type: cosine_ap
-       value: 0.9977983578816215
-       name: Cosine Ap
-     - type: cosine_mcc
-       value: 0.9785817179348335
-       name: Cosine Mcc
- ---
-
- # sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2-address-matcher-original
-
- This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2). It maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
-
- ## Model Details
-
- ### Model Description
- - **Model Type:** Sentence Transformer
- - **Base model:** [sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) <!-- at revision 86741b4e3f5cb7765a600d3a3d55a0f6a6cb443d -->
- - **Maximum Sequence Length:** 128 tokens
- - **Output Dimensionality:** 384 dimensions
- - **Similarity Function:** Cosine Similarity
- <!-- - **Training Dataset:** Unknown -->
- - **Language:** en
- - **License:** apache-2.0
-
- ### Model Sources
-
- - **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
- - **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
- - **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
-
- ### Full Model Architecture
-
- ```
- SentenceTransformer(
-   (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel
-   (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
- )
- ```
-
- ## Usage
-
- ### Direct Usage (Sentence Transformers)
-
- First install the Sentence Transformers library:
-
- ```bash
- pip install -U sentence-transformers
- ```
-
- Then you can load this model and run inference.
- ```python
- from sentence_transformers import SentenceTransformer
-
- # Download from the 🤗 Hub
- model = SentenceTransformer("sentence_transformers_model_id")
- # Run inference
- sentences = [
-     'Saif al-Arab GADAFI',
-     'Максім Недасекаў',
-     'Mervyn Allister King',
- ]
- embeddings = model.encode(sentences)
- print(embeddings.shape)
- # [3, 384]
-
- # Get the similarity scores for the embeddings
- similarities = model.similarity(embeddings, embeddings)
- print(similarities.shape)
- # [3, 3]
- ```
-
- <!--
- ### Direct Usage (Transformers)
-
- <details><summary>Click to see the direct usage in Transformers</summary>
-
- </details>
- -->
-
- <!--
- ### Downstream Usage (Sentence Transformers)
-
- You can finetune this model on your own dataset.
-
- <details><summary>Click to expand</summary>
-
- </details>
- -->
-
- <!--
- ### Out-of-Scope Use
-
- *List how the model may foreseeably be misused and address what users ought not to do with the model.*
- -->
-
- ## Evaluation
-
- ### Metrics
-
- #### Binary Classification
-
- * Dataset: `sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2`
- * Evaluated with [<code>BinaryClassificationEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.BinaryClassificationEvaluator)
-
- | Metric                    | Value      |
- |:--------------------------|:-----------|
- | cosine_accuracy           | 0.9905     |
- | cosine_accuracy_threshold | 0.6791     |
- | cosine_f1                 | 0.9856     |
- | cosine_f1_threshold       | 0.6791     |
- | cosine_precision          | 0.9817     |
- | cosine_recall             | 0.9896     |
- | **cosine_ap**             | **0.9978** |
- | cosine_mcc                | 0.9786     |
-
- <!--
- ## Bias, Risks and Limitations
-
- *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
- -->
-
- <!--
- ### Recommendations
-
- *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
- -->
-
- ## Training Details
-
- ### Training Dataset
-
- #### Unnamed Dataset
-
- * Size: 2,130,620 training samples
- * Columns: <code>sentence1</code>, <code>sentence2</code>, and <code>label</code>
- * Approximate statistics based on the first 1000 samples:
-   |         | sentence1 | sentence2 | label |
-   |:--------|:----------|:----------|:------|
-   | type    | string    | string    | float |
-   | details | <ul><li>min: 3 tokens</li><li>mean: 9.28 tokens</li><li>max: 57 tokens</li></ul> | <ul><li>min: 3 tokens</li><li>mean: 9.11 tokens</li><li>max: 65 tokens</li></ul> | <ul><li>min: 0.0</li><li>mean: 0.34</li><li>max: 1.0</li></ul> |
- * Samples:
-   | sentence1                   | sentence2                      | label            |
-   |:----------------------------|:-------------------------------|:-----------------|
-   | <code>ג'ק וייט</code>       | <code>Jack White</code>        | <code>1.0</code> |
-   | <code>Абдуллоҳ Гул</code>   | <code>Савицкая Светлана</code> | <code>0.0</code> |
-   | <code>ショーン・ジャスティン・ペン</code> | <code>شان پن</code>       | <code>1.0</code> |
- * Loss: [<code>ContrastiveLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#contrastiveloss) with these parameters:
-   ```json
-   {
-       "distance_metric": "SiameseDistanceMetric.COSINE_DISTANCE",
-       "margin": 0.5,
-       "size_average": true
-   }
-   ```
-
- ### Evaluation Dataset
-
- #### Unnamed Dataset
-
- * Size: 266,328 evaluation samples
- * Columns: <code>sentence1</code>, <code>sentence2</code>, and <code>label</code>
- * Approximate statistics based on the first 1000 samples:
-   |         | sentence1 | sentence2 | label |
-   |:--------|:----------|:----------|:------|
-   | type    | string    | string    | float |
-   | details | <ul><li>min: 3 tokens</li><li>mean: 9.27 tokens</li><li>max: 79 tokens</li></ul> | <ul><li>min: 3 tokens</li><li>mean: 8.99 tokens</li><li>max: 61 tokens</li></ul> | <ul><li>min: 0.0</li><li>mean: 0.32</li><li>max: 1.0</li></ul> |
- * Samples:
-   | sentence1                                    | sentence2                                      | label            |
-   |:---------------------------------------------|:-----------------------------------------------|:-----------------|
-   | <code>Анатолий Николаевич Герасимов</code>   | <code>Anatoli Nikolajewitsch Gerassimow</code> | <code>1.0</code> |
-   | <code>Igor Stanislavovitsj Prokopenko</code> | <code>Angelo Lauricella</code>                 | <code>0.0</code> |
-   | <code>Кофе, Линда</code>                     | <code>Святлана Яўгенаўна Савіцкая</code>       | <code>0.0</code> |
- * Loss: [<code>ContrastiveLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#contrastiveloss) with these parameters:
-   ```json
-   {
-       "distance_metric": "SiameseDistanceMetric.COSINE_DISTANCE",
-       "margin": 0.5,
-       "size_average": true
-   }
-   ```
-
- ### Training Hyperparameters
- #### Non-Default Hyperparameters
-
- - `eval_strategy`: steps
- - `per_device_train_batch_size`: 5000
- - `per_device_eval_batch_size`: 5000
- - `gradient_accumulation_steps`: 4
- - `weight_decay`: 0.02
- - `num_train_epochs`: 10
- - `warmup_ratio`: 0.1
- - `fp16`: True
- - `load_best_model_at_end`: True
- - `optim`: adafactor
- - `gradient_checkpointing`: True
-
- #### All Hyperparameters
- <details><summary>Click to expand</summary>
-
- - `overwrite_output_dir`: False
- - `do_predict`: False
- - `eval_strategy`: steps
- - `prediction_loss_only`: True
- - `per_device_train_batch_size`: 5000
- - `per_device_eval_batch_size`: 5000
- - `per_gpu_train_batch_size`: None
- - `per_gpu_eval_batch_size`: None
- - `gradient_accumulation_steps`: 4
- - `eval_accumulation_steps`: None
- - `torch_empty_cache_steps`: None
- - `learning_rate`: 5e-05
- - `weight_decay`: 0.02
- - `adam_beta1`: 0.9
- - `adam_beta2`: 0.999
- - `adam_epsilon`: 1e-08
- - `max_grad_norm`: 1.0
- - `num_train_epochs`: 10
- - `max_steps`: -1
- - `lr_scheduler_type`: linear
- - `lr_scheduler_kwargs`: {}
- - `warmup_ratio`: 0.1
- - `warmup_steps`: 0
- - `log_level`: passive
- - `log_level_replica`: warning
- - `log_on_each_node`: True
- - `logging_nan_inf_filter`: True
- - `save_safetensors`: True
- - `save_on_each_node`: False
- - `save_only_model`: False
- - `restore_callback_states_from_checkpoint`: False
- - `no_cuda`: False
- - `use_cpu`: False
- - `use_mps_device`: False
- - `seed`: 42
- - `data_seed`: None
- - `jit_mode_eval`: False
- - `use_ipex`: False
- - `bf16`: False
- - `fp16`: True
- - `fp16_opt_level`: O1
- - `half_precision_backend`: auto
- - `bf16_full_eval`: False
- - `fp16_full_eval`: False
- - `tf32`: None
- - `local_rank`: 0
- - `ddp_backend`: None
- - `tpu_num_cores`: None
- - `tpu_metrics_debug`: False
- - `debug`: []
- - `dataloader_drop_last`: False
- - `dataloader_num_workers`: 0
- - `dataloader_prefetch_factor`: None
- - `past_index`: -1
- - `disable_tqdm`: False
- - `remove_unused_columns`: True
- - `label_names`: None
- - `load_best_model_at_end`: True
- - `ignore_data_skip`: False
- - `fsdp`: []
- - `fsdp_min_num_params`: 0
- - `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- - `tp_size`: 0
- - `fsdp_transformer_layer_cls_to_wrap`: None
- - `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- - `deepspeed`: None
- - `label_smoothing_factor`: 0.0
- - `optim`: adafactor
- - `optim_args`: None
- - `adafactor`: False
- - `group_by_length`: False
- - `length_column_name`: length
- - `ddp_find_unused_parameters`: None
- - `ddp_bucket_cap_mb`: None
- - `ddp_broadcast_buffers`: False
- - `dataloader_pin_memory`: True
- - `dataloader_persistent_workers`: False
- - `skip_memory_metrics`: True
- - `use_legacy_prediction_loop`: False
- - `push_to_hub`: False
- - `resume_from_checkpoint`: None
- - `hub_model_id`: None
- - `hub_strategy`: every_save
- - `hub_private_repo`: None
- - `hub_always_push`: False
- - `gradient_checkpointing`: True
- - `gradient_checkpointing_kwargs`: None
- - `include_inputs_for_metrics`: False
- - `include_for_metrics`: []
- - `eval_do_concat_batches`: True
- - `fp16_backend`: auto
- - `push_to_hub_model_id`: None
- - `push_to_hub_organization`: None
- - `mp_parameters`:
- - `auto_find_batch_size`: False
- - `full_determinism`: False
- - `torchdynamo`: None
- - `ray_scope`: last
- - `ddp_timeout`: 1800
- - `torch_compile`: False
- - `torch_compile_backend`: None
- - `torch_compile_mode`: None
- - `include_tokens_per_second`: False
- - `include_num_input_tokens_seen`: False
- - `neftune_noise_alpha`: None
- - `optim_target_modules`: None
- - `batch_eval_metrics`: False
- - `eval_on_start`: False
- - `use_liger_kernel`: False
- - `eval_use_gather_object`: False
- - `average_tokens_across_devices`: False
- - `prompts`: None
- - `batch_sampler`: batch_sampler
- - `multi_dataset_batch_sampler`: proportional
-
- </details>
-
- ### Training Logs
- | Epoch  | Step | Training Loss | Validation Loss | sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap |
- |:------:|:----:|:-------------:|:---------------:|:----------------------------------------------------------------------:|
- | -1     | -1   | -             | -               | 0.7195                                                                 |
- | 0.9368 | 100  | -             | 0.0083          | 0.9597                                                                 |
- | 1.8712 | 200  | -             | 0.0043          | 0.9877                                                                 |
- | 2.8056 | 300  | -             | 0.0028          | 0.9936                                                                 |
- | 3.7400 | 400  | -             | 0.0021          | 0.9954                                                                 |
- | 4.6745 | 500  | 0.0224        | 0.0016          | 0.9964                                                                 |
- | 5.6089 | 600  | -             | 0.0015          | 0.9970                                                                 |
- | 6.5433 | 700  | -             | 0.0014          | 0.9974                                                                 |
- | 7.4778 | 800  | -             | 0.0013          | 0.9975                                                                 |
- | 8.4122 | 900  | -             | 0.0013          | 0.9977                                                                 |
- | 9.3466 | 1000 | 0.0052        | 0.0012          | 0.9978                                                                 |
-
-
- ### Framework Versions
- - Python: 3.12.9
- - Sentence Transformers: 3.4.1
- - Transformers: 4.51.3
- - PyTorch: 2.7.0+cu126
- - Accelerate: 1.6.0
- - Datasets: 3.6.0
- - Tokenizers: 0.21.1
-
- ## Citation
-
- ### BibTeX
-
- #### Sentence Transformers
- ```bibtex
- @inproceedings{reimers-2019-sentence-bert,
-     title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
-     author = "Reimers, Nils and Gurevych, Iryna",
-     booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
-     month = "11",
-     year = "2019",
-     publisher = "Association for Computational Linguistics",
-     url = "https://arxiv.org/abs/1908.10084",
- }
- ```
-
- #### ContrastiveLoss
- ```bibtex
- @inproceedings{hadsell2006dimensionality,
-   author={Hadsell, R. and Chopra, S. and LeCun, Y.},
-   booktitle={2006 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR'06)},
-   title={Dimensionality Reduction by Learning an Invariant Mapping},
-   year={2006},
-   volume={2},
-   number={},
-   pages={1735-1742},
-   doi={10.1109/CVPR.2006.100}
- }
- ```
-
- <!--
- ## Glossary
-
- *Clearly define terms in order to be accessible across audiences.*
- -->
-
- <!--
- ## Model Card Authors
-
- *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
- -->
-
- <!--
- ## Model Card Contact
-
- *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
- -->
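The deleted card trains with `ContrastiveLoss` over cosine distance (margin 0.5, `size_average: true`). A minimal PyTorch sketch of that objective, using hypothetical embeddings rather than this model's training code:

```python
import torch
import torch.nn.functional as F

def contrastive_loss(emb1, emb2, labels, margin=0.5):
    # Hadsell et al. (2006): pull matching pairs together (label 1),
    # push non-matching pairs (label 0) apart up to the margin.
    distances = 1.0 - F.cosine_similarity(emb1, emb2)  # SiameseDistanceMetric.COSINE_DISTANCE
    losses = labels * distances.pow(2) + (1 - labels) * F.relu(margin - distances).pow(2)
    return 0.5 * losses.mean()                         # mean over the batch => size_average: true

emb1, emb2 = torch.randn(4, 384), torch.randn(4, 384)  # hypothetical 384-dim embeddings
labels = torch.tensor([1.0, 0.0, 1.0, 0.0])            # 1 = same entity, 0 = different
print(contrastive_loss(emb1, emb2, labels))
```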
 
checkpoint-1000/config.json DELETED
@@ -1,25 +0,0 @@
- {
-   "architectures": [
-     "BertModel"
-   ],
-   "attention_probs_dropout_prob": 0.1,
-   "classifier_dropout": null,
-   "gradient_checkpointing": false,
-   "hidden_act": "gelu",
-   "hidden_dropout_prob": 0.1,
-   "hidden_size": 384,
-   "initializer_range": 0.02,
-   "intermediate_size": 1536,
-   "layer_norm_eps": 1e-12,
-   "max_position_embeddings": 512,
-   "model_type": "bert",
-   "num_attention_heads": 12,
-   "num_hidden_layers": 12,
-   "pad_token_id": 0,
-   "position_embedding_type": "absolute",
-   "torch_dtype": "float32",
-   "transformers_version": "4.51.3",
-   "type_vocab_size": 2,
-   "use_cache": true,
-   "vocab_size": 250037
- }
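The deleted `config.json` fully specifies the architecture. A minimal sketch, assuming only the values above, that rebuilds the same shape with `transformers` (randomly initialized, not the trained weights):

```python
from transformers import BertConfig, BertModel

# Rebuild the MiniLM-style architecture described by the deleted config:
# 12 layers, 12 heads, hidden size 384, 250k-token multilingual vocabulary.
config = BertConfig(
    hidden_size=384,
    intermediate_size=1536,
    num_attention_heads=12,
    num_hidden_layers=12,
    max_position_embeddings=512,
    type_vocab_size=2,
    vocab_size=250037,
)
model = BertModel(config)
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.0f}M parameters")  # roughly 118M, consistent with the ~470 MB float32 safetensors below
```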
 
checkpoint-1000/config_sentence_transformers.json DELETED
@@ -1,10 +0,0 @@
- {
-   "__version__": {
-     "sentence_transformers": "3.4.1",
-     "transformers": "4.51.3",
-     "pytorch": "2.7.0+cu126"
-   },
-   "prompts": {},
-   "default_prompt_name": null,
-   "similarity_fn_name": "cosine"
- }
 
checkpoint-1000/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4f4d32a2eafc839cb2ab10b136bf98c4d30bdad7f85e5f55ceafdf3a54a9e859
- size 470637416
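The three deleted lines above are a Git LFS pointer, not the weights themselves. A small sketch parsing that pointer text (values copied from this diff):

```python
# Parse the Git LFS pointer shown above; the real ~470 MB weight file
# lives in LFS storage and is fetched by its oid when the repo is cloned.
pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:4f4d32a2eafc839cb2ab10b136bf98c4d30bdad7f85e5f55ceafdf3a54a9e859
size 470637416
"""
fields = dict(line.split(" ", 1) for line in pointer.strip().splitlines())
print(fields["oid"])                    # content address of the stored blob
print(int(fields["size"]) / 1e6, "MB")  # ≈ 470.6 MB
```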
 
checkpoint-1000/modules.json DELETED
@@ -1,14 +0,0 @@
- [
-   {
-     "idx": 0,
-     "name": "0",
-     "path": "",
-     "type": "sentence_transformers.models.Transformer"
-   },
-   {
-     "idx": 1,
-     "name": "1",
-     "path": "1_Pooling",
-     "type": "sentence_transformers.models.Pooling"
-   }
- ]
 
checkpoint-1000/optimizer.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c78ab330430a2994c0f6654ccbded4a6558ca0f6cfe08b4fa75960dd3563c6fa
- size 1715019
 
checkpoint-1000/rng_state.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:f3ba9bdd5b024d60bbe4f0e967c35c5c47d5c8b8d992558db327a2aae780abce
- size 14645
 
checkpoint-1000/scaler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:14ae2a2128444abab378aa06c09a61a84665f758fcc19fc46f5789b0bc1b5665
- size 1383
 
checkpoint-1000/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c6cb8ef30177a351efb4472a87f05bae30f484d65c37957d4e17a50e58c3b3e9
- size 1465
 
checkpoint-1000/sentence_bert_config.json DELETED
@@ -1,4 +0,0 @@
- {
-   "max_seq_length": 128,
-   "do_lower_case": false
- }
 
checkpoint-1000/special_tokens_map.json DELETED
@@ -1,51 +0,0 @@
- {
-   "bos_token": {
-     "content": "<s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "cls_token": {
-     "content": "<s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "eos_token": {
-     "content": "</s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "mask_token": {
-     "content": "<mask>",
-     "lstrip": true,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "pad_token": {
-     "content": "<pad>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "sep_token": {
-     "content": "</s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "unk_token": {
-     "content": "<unk>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   }
- }
 
checkpoint-1000/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:cad551d5600a84242d0973327029452a1e3672ba6313c2a3c3d69c4310e12719
- size 17082987
 
checkpoint-1000/tokenizer_config.json DELETED
@@ -1,65 +0,0 @@
- {
-   "added_tokens_decoder": {
-     "0": {
-       "content": "<s>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "1": {
-       "content": "<pad>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "2": {
-       "content": "</s>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "3": {
-       "content": "<unk>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "250001": {
-       "content": "<mask>",
-       "lstrip": true,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     }
-   },
-   "bos_token": "<s>",
-   "clean_up_tokenization_spaces": false,
-   "cls_token": "<s>",
-   "do_lower_case": true,
-   "eos_token": "</s>",
-   "extra_special_tokens": {},
-   "mask_token": "<mask>",
-   "max_length": 128,
-   "model_max_length": 128,
-   "pad_to_multiple_of": null,
-   "pad_token": "<pad>",
-   "pad_token_type_id": 0,
-   "padding_side": "right",
-   "sep_token": "</s>",
-   "stride": 0,
-   "strip_accents": null,
-   "tokenize_chinese_chars": true,
-   "tokenizer_class": "BertTokenizer",
-   "truncation_side": "right",
-   "truncation_strategy": "longest_first",
-   "unk_token": "<unk>"
- }
 
checkpoint-1000/trainer_state.json DELETED
@@ -1,217 +0,0 @@
- {
-   "best_global_step": 1000,
-   "best_metric": 0.0012360884575173259,
-   "best_model_checkpoint": "data/fine-tuned-sbert-sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2-original-adafactor/checkpoint-1000",
-   "epoch": 9.346604215456674,
-   "eval_steps": 100,
-   "global_step": 1000,
-   "is_hyper_param_search": false,
-   "is_local_process_zero": true,
-   "is_world_process_zero": true,
-   "log_history": [
-     {
-       "epoch": 0.936768149882904,
-       "eval_loss": 0.008251233026385307,
-       "eval_runtime": 117.4457,
-       "eval_samples_per_second": 2267.669,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9330529793864755,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.6639679670333862,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9596591982248662,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.8990018609372358,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.6536919474601746,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.8488676021429209,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.8846836847946726,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.913791126905881,
-       "eval_steps_per_second": 0.46,
-       "step": 100
-     },
-     {
-       "epoch": 1.8711943793911008,
-       "eval_loss": 0.004326523281633854,
-       "eval_runtime": 118.308,
-       "eval_samples_per_second": 2251.141,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9683099913640971,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.6799858808517456,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.987669070948898,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9520018198362147,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.6799858808517456,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9284143244509058,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9445886468795847,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9595322710076808,
-       "eval_steps_per_second": 0.456,
-       "step": 200
-     },
-     {
-       "epoch": 2.8056206088992974,
-       "eval_loss": 0.002782753435894847,
-       "eval_runtime": 117.8399,
-       "eval_samples_per_second": 2260.083,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9790110013892539,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7040826678276062,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9935758649482886,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9680662667809197,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.7029732465744019,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9524469797852624,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9648143930767479,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9713401352745615,
-       "eval_steps_per_second": 0.458,
-       "step": 300
-     },
-     {
-       "epoch": 3.740046838407494,
-       "eval_loss": 0.0020659712608903646,
-       "eval_runtime": 116.8077,
-       "eval_samples_per_second": 2280.056,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9837419742424811,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7114190459251404,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9954100421733855,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.975348704810703,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.6966520547866821,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.963270232791414,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9687853426826509,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9820016049524246,
-       "eval_steps_per_second": 0.462,
-       "step": 400
-     },
-     {
-       "epoch": 4.674473067915691,
-       "grad_norm": 0.07067500799894333,
-       "learning_rate": 2.9402515723270442e-05,
-       "loss": 0.0224,
-       "step": 500
-     },
-     {
-       "epoch": 4.674473067915691,
-       "eval_loss": 0.0016409169184044003,
-       "eval_runtime": 117.7739,
-       "eval_samples_per_second": 2261.35,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.986370292494274,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7391290664672852,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.996439193909599,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9792820044518008,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.7391290664672852,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9691467317957321,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.975107979086156,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9834919179181474,
-       "eval_steps_per_second": 0.459,
-       "step": 500
-     },
-     {
-       "epoch": 5.608899297423887,
-       "eval_loss": 0.0014551315689459443,
-       "eval_runtime": 117.5801,
-       "eval_samples_per_second": 2265.077,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9884729470957083,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7460525035858154,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9969945004512654,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9824360661365067,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.7435637712478638,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9738614226726382,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9805847418912745,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9842943941304597,
-       "eval_steps_per_second": 0.459,
-       "step": 600
-     },
-     {
-       "epoch": 6.543325526932084,
-       "eval_loss": 0.0013776659034192562,
-       "eval_runtime": 117.6764,
-       "eval_samples_per_second": 2263.223,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9893740847820374,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7209540009498596,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.997357375070481,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9838035826704058,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.7209540009498596,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9758996171607873,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9822857142857143,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9853261492605755,
-       "eval_steps_per_second": 0.459,
-       "step": 700
-     },
-     {
-       "epoch": 7.477751756440281,
-       "eval_loss": 0.0013444514479488134,
-       "eval_runtime": 117.3408,
-       "eval_samples_per_second": 2269.696,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9898246536252018,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7261425852775574,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9975494130839752,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9844654628833477,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.7227741479873657,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9769000718683564,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9845218986470993,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9844090335893615,
-       "eval_steps_per_second": 0.46,
-       "step": 800
-     },
-     {
-       "epoch": 8.412177985948478,
-       "eval_loss": 0.0012511691311374307,
-       "eval_runtime": 117.668,
-       "eval_samples_per_second": 2263.385,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9902752224683663,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.685534656047821,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9977460917001926,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9852413242919824,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.6582455635070801,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9780277137066985,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9794924087922049,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9910581222056631,
-       "eval_steps_per_second": 0.459,
-       "step": 900
-     },
-     {
-       "epoch": 9.346604215456674,
-       "grad_norm": 0.018028028309345245,
-       "learning_rate": 3.1970649895178203e-06,
-       "loss": 0.0052,
-       "step": 1000
-     },
-     {
-       "epoch": 9.346604215456674,
-       "eval_loss": 0.0012360884575173259,
-       "eval_runtime": 117.4598,
-       "eval_samples_per_second": 2267.396,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9905380542935456,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.6790644526481628,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9977983578816215,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9856131536880567,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.6790644526481628,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9785817179348335,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9816899806664392,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9895678092399404,
-       "eval_steps_per_second": 0.46,
-       "step": 1000
-     }
-   ],
-   "logging_steps": 500,
-   "max_steps": 1060,
-   "num_input_tokens_seen": 0,
-   "num_train_epochs": 10,
-   "save_steps": 100,
-   "stateful_callbacks": {
-     "EarlyStoppingCallback": {
-       "args": {
-         "early_stopping_patience": 2,
-         "early_stopping_threshold": 0.0
-       },
-       "attributes": {
-         "early_stopping_patience_counter": 0
-       }
-     },
-     "TrainerControl": {
-       "args": {
-         "should_epoch_stop": false,
-         "should_evaluate": false,
-         "should_log": false,
-         "should_save": true,
-         "should_training_stop": false
-       },
-       "attributes": {}
-     }
-   },
-   "total_flos": 0.0,
-   "train_batch_size": 5000,
-   "trial_name": null,
-   "trial_params": null
- }
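The trainer state above records an `EarlyStoppingCallback` with patience 2 and threshold 0.0. A minimal sketch of constructing that callback with `transformers`; the surrounding `Trainer` wiring is omitted:

```python
from transformers import EarlyStoppingCallback

# Matches the stateful_callbacks entry above: stop training after two
# consecutive evaluations without improvement in the monitored metric.
early_stop = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.0,
)
# Passed as callbacks=[early_stop] when the trainer is built; it requires
# eval_strategy="steps" and load_best_model_at_end=True, both set in the
# hyperparameters recorded in the deleted README above.
print(early_stop.early_stopping_patience)  # 2
```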
 
checkpoint-1000/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:319aaa354e613c6db82c6bb78290f3da04198ef2c7a75b61b314fa305ed33c45
- size 6033
 
checkpoint-1000/unigram.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:da145b5e7700ae40f16691ec32a0b1fdc1ee3298db22a31ea55f57a966c4a65d
- size 14763260
 
checkpoint-1060/1_Pooling/config.json DELETED
@@ -1,10 +0,0 @@
(Deleted contents identical to checkpoint-1000/1_Pooling/config.json above; 10 lines omitted.)
 
checkpoint-1060/README.md DELETED
@@ -1,466 +0,0 @@
(Deleted contents identical to checkpoint-1000/README.md above; 466 lines omitted.)
checkpoint-1060/config.json DELETED
@@ -1,25 +0,0 @@
- {
-   "architectures": [
-     "BertModel"
-   ],
-   "attention_probs_dropout_prob": 0.1,
-   "classifier_dropout": null,
-   "gradient_checkpointing": false,
-   "hidden_act": "gelu",
-   "hidden_dropout_prob": 0.1,
-   "hidden_size": 384,
-   "initializer_range": 0.02,
-   "intermediate_size": 1536,
-   "layer_norm_eps": 1e-12,
-   "max_position_embeddings": 512,
-   "model_type": "bert",
-   "num_attention_heads": 12,
-   "num_hidden_layers": 12,
-   "pad_token_id": 0,
-   "position_embedding_type": "absolute",
-   "torch_dtype": "float32",
-   "transformers_version": "4.51.3",
-   "type_vocab_size": 2,
-   "use_cache": true,
-   "vocab_size": 250037
- }
checkpoint-1060/config_sentence_transformers.json DELETED
@@ -1,10 +0,0 @@
- {
-   "__version__": {
-     "sentence_transformers": "3.4.1",
-     "transformers": "4.51.3",
-     "pytorch": "2.7.0+cu126"
-   },
-   "prompts": {},
-   "default_prompt_name": null,
-   "similarity_fn_name": "cosine"
- }
checkpoint-1060/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:6d56240a57c4c07d9788a1faa198fd34bca9ea0a1e5a26691b1d009dcae94358
- size 470637416
checkpoint-1060/modules.json DELETED
@@ -1,14 +0,0 @@
- [
-   {
-     "idx": 0,
-     "name": "0",
-     "path": "",
-     "type": "sentence_transformers.models.Transformer"
-   },
-   {
-     "idx": 1,
-     "name": "1",
-     "path": "1_Pooling",
-     "type": "sentence_transformers.models.Pooling"
-   }
- ]
checkpoint-1060/optimizer.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9b64d5492cb2e0518735a1a0dc7b3b7826a2b4f5d195b44246fdc70db2a64017
- size 1715019
checkpoint-1060/rng_state.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:8897bcd95f20279aabd5aac16966d704f565763d9f133ce3e3009c72d02b6438
- size 14645
checkpoint-1060/scaler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:d3bfa35dd520299a41189dc520e331d371a8b9b17d9abff7077c34c5e038a3b0
- size 1383
checkpoint-1060/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:d60e33f28b0c6c27be860020c700ca71a97176bb114f84fcae7c353e227c8a2e
- size 1465
checkpoint-1060/sentence_bert_config.json DELETED
@@ -1,4 +0,0 @@
- {
-   "max_seq_length": 128,
-   "do_lower_case": false
- }
checkpoint-1060/special_tokens_map.json DELETED
@@ -1,51 +0,0 @@
- {
-   "bos_token": {
-     "content": "<s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "cls_token": {
-     "content": "<s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "eos_token": {
-     "content": "</s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "mask_token": {
-     "content": "<mask>",
-     "lstrip": true,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "pad_token": {
-     "content": "<pad>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "sep_token": {
-     "content": "</s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "unk_token": {
-     "content": "<unk>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   }
- }
checkpoint-1060/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:cad551d5600a84242d0973327029452a1e3672ba6313c2a3c3d69c4310e12719
- size 17082987
checkpoint-1060/tokenizer_config.json DELETED
@@ -1,65 +0,0 @@
- {
-   "added_tokens_decoder": {
-     "0": {
-       "content": "<s>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "1": {
-       "content": "<pad>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "2": {
-       "content": "</s>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "3": {
-       "content": "<unk>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "250001": {
-       "content": "<mask>",
-       "lstrip": true,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     }
-   },
-   "bos_token": "<s>",
-   "clean_up_tokenization_spaces": false,
-   "cls_token": "<s>",
-   "do_lower_case": true,
-   "eos_token": "</s>",
-   "extra_special_tokens": {},
-   "mask_token": "<mask>",
-   "max_length": 128,
-   "model_max_length": 128,
-   "pad_to_multiple_of": null,
-   "pad_token": "<pad>",
-   "pad_token_type_id": 0,
-   "padding_side": "right",
-   "sep_token": "</s>",
-   "stride": 0,
-   "strip_accents": null,
-   "tokenize_chinese_chars": true,
-   "tokenizer_class": "BertTokenizer",
-   "truncation_side": "right",
-   "truncation_strategy": "longest_first",
-   "unk_token": "<unk>"
- }
checkpoint-1060/trainer_state.json DELETED
@@ -1,217 +0,0 @@
- {
-   "best_global_step": 1000,
-   "best_metric": 0.0012360884575173259,
-   "best_model_checkpoint": "data/fine-tuned-sbert-sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2-original-adafactor/checkpoint-1000",
-   "epoch": 9.908665105386417,
-   "eval_steps": 100,
-   "global_step": 1060,
-   "is_hyper_param_search": false,
-   "is_local_process_zero": true,
-   "is_world_process_zero": true,
-   "log_history": [
-     {
-       "epoch": 0.936768149882904,
-       "eval_loss": 0.008251233026385307,
-       "eval_runtime": 117.4457,
-       "eval_samples_per_second": 2267.669,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9330529793864755,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.6639679670333862,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9596591982248662,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.8990018609372358,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.6536919474601746,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.8488676021429209,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.8846836847946726,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.913791126905881,
-       "eval_steps_per_second": 0.46,
-       "step": 100
-     },
-     {
-       "epoch": 1.8711943793911008,
-       "eval_loss": 0.004326523281633854,
-       "eval_runtime": 118.308,
-       "eval_samples_per_second": 2251.141,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9683099913640971,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.6799858808517456,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.987669070948898,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9520018198362147,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.6799858808517456,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9284143244509058,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9445886468795847,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9595322710076808,
-       "eval_steps_per_second": 0.456,
-       "step": 200
-     },
-     {
-       "epoch": 2.8056206088992974,
-       "eval_loss": 0.002782753435894847,
-       "eval_runtime": 117.8399,
-       "eval_samples_per_second": 2260.083,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9790110013892539,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7040826678276062,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9935758649482886,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9680662667809197,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.7029732465744019,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9524469797852624,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9648143930767479,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9713401352745615,
-       "eval_steps_per_second": 0.458,
-       "step": 300
-     },
-     {
-       "epoch": 3.740046838407494,
-       "eval_loss": 0.0020659712608903646,
-       "eval_runtime": 116.8077,
-       "eval_samples_per_second": 2280.056,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9837419742424811,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7114190459251404,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9954100421733855,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.975348704810703,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.6966520547866821,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.963270232791414,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9687853426826509,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9820016049524246,
-       "eval_steps_per_second": 0.462,
-       "step": 400
-     },
-     {
-       "epoch": 4.674473067915691,
-       "grad_norm": 0.07067500799894333,
-       "learning_rate": 2.9402515723270442e-05,
-       "loss": 0.0224,
-       "step": 500
-     },
-     {
-       "epoch": 4.674473067915691,
-       "eval_loss": 0.0016409169184044003,
-       "eval_runtime": 117.7739,
-       "eval_samples_per_second": 2261.35,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.986370292494274,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7391290664672852,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.996439193909599,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9792820044518008,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.7391290664672852,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9691467317957321,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.975107979086156,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9834919179181474,
-       "eval_steps_per_second": 0.459,
-       "step": 500
-     },
-     {
-       "epoch": 5.608899297423887,
-       "eval_loss": 0.0014551315689459443,
-       "eval_runtime": 117.5801,
-       "eval_samples_per_second": 2265.077,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9884729470957083,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7460525035858154,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9969945004512654,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9824360661365067,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.7435637712478638,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9738614226726382,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9805847418912745,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9842943941304597,
-       "eval_steps_per_second": 0.459,
-       "step": 600
-     },
-     {
-       "epoch": 6.543325526932084,
-       "eval_loss": 0.0013776659034192562,
-       "eval_runtime": 117.6764,
-       "eval_samples_per_second": 2263.223,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9893740847820374,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7209540009498596,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.997357375070481,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9838035826704058,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.7209540009498596,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9758996171607873,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9822857142857143,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9853261492605755,
-       "eval_steps_per_second": 0.459,
-       "step": 700
-     },
-     {
-       "epoch": 7.477751756440281,
-       "eval_loss": 0.0013444514479488134,
-       "eval_runtime": 117.3408,
-       "eval_samples_per_second": 2269.696,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9898246536252018,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7261425852775574,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9975494130839752,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9844654628833477,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.7227741479873657,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9769000718683564,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9845218986470993,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9844090335893615,
-       "eval_steps_per_second": 0.46,
-       "step": 800
-     },
-     {
-       "epoch": 8.412177985948478,
-       "eval_loss": 0.0012511691311374307,
-       "eval_runtime": 117.668,
-       "eval_samples_per_second": 2263.385,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9902752224683663,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.685534656047821,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9977460917001926,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9852413242919824,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.6582455635070801,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9780277137066985,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9794924087922049,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9910581222056631,
-       "eval_steps_per_second": 0.459,
-       "step": 900
-     },
-     {
-       "epoch": 9.346604215456674,
-       "grad_norm": 0.018028028309345245,
-       "learning_rate": 3.1970649895178203e-06,
-       "loss": 0.0052,
-       "step": 1000
-     },
-     {
-       "epoch": 9.346604215456674,
-       "eval_loss": 0.0012360884575173259,
-       "eval_runtime": 117.4598,
-       "eval_samples_per_second": 2267.396,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9905380542935456,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.6790644526481628,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9977983578816215,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9856131536880567,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.6790644526481628,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9785817179348335,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9816899806664392,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9895678092399404,
-       "eval_steps_per_second": 0.46,
-       "step": 1000
-     }
-   ],
-   "logging_steps": 500,
-   "max_steps": 1060,
-   "num_input_tokens_seen": 0,
-   "num_train_epochs": 10,
-   "save_steps": 100,
-   "stateful_callbacks": {
-     "EarlyStoppingCallback": {
-       "args": {
-         "early_stopping_patience": 2,
-         "early_stopping_threshold": 0.0
-       },
-       "attributes": {
-         "early_stopping_patience_counter": 0
-       }
-     },
-     "TrainerControl": {
-       "args": {
-         "should_epoch_stop": false,
-         "should_evaluate": false,
-         "should_log": false,
-         "should_save": true,
-         "should_training_stop": true
-       },
-       "attributes": {}
-     }
-   },
-   "total_flos": 0.0,
-   "train_batch_size": 5000,
-   "trial_name": null,
-   "trial_params": null
- }
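The `stateful_callbacks` block records the early-stopping setup for this run: patience 2 on the evaluation loss. At step 1060 the step budget (`max_steps`) was exhausted, so `should_training_stop` is set even though the patience counter is still 0. A minimal sketch of the same callback configuration using the standard `transformers` API:

```python
from transformers import EarlyStoppingCallback

# Matches the args recorded above: stop after 2 consecutive evaluations
# with no improvement greater than 0.0 in the monitored metric (eval_loss).
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.0,
)
```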
checkpoint-1060/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:319aaa354e613c6db82c6bb78290f3da04198ef2c7a75b61b314fa305ed33c45
- size 6033
checkpoint-1060/unigram.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:da145b5e7700ae40f16691ec32a0b1fdc1ee3298db22a31ea55f57a966c4a65d
- size 14763260
checkpoint-700/1_Pooling/config.json DELETED
@@ -1,10 +0,0 @@
- {
-   "word_embedding_dimension": 384,
-   "pooling_mode_cls_token": false,
-   "pooling_mode_mean_tokens": true,
-   "pooling_mode_max_tokens": false,
-   "pooling_mode_mean_sqrt_len_tokens": false,
-   "pooling_mode_weightedmean_tokens": false,
-   "pooling_mode_lasttoken": false,
-   "include_prompt": true
- }
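This configuration selects mean pooling over token embeddings. As a minimal sketch of what that pooling mode computes (assuming a `(batch, seq, 384)` tensor of token embeddings and the usual attention mask):

```python
import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # Zero out padding positions, then average over the sequence dimension.
    mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)  # (batch, seq, 1)
    summed = (token_embeddings * mask).sum(dim=1)                   # (batch, 384)
    counts = mask.sum(dim=1).clamp(min=1e-9)                        # (batch, 1)
    return summed / counts
```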
checkpoint-700/README.md DELETED
@@ -1,463 +0,0 @@
- ---
- language:
- - en
- license: apache-2.0
- tags:
- - sentence-transformers
- - sentence-similarity
- - feature-extraction
- - generated_from_trainer
- - dataset_size:2130620
- - loss:ContrastiveLoss
- base_model: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
- widget:
- - source_sentence: مانوئلا دی سنتا
-   sentences:
-   - Renko Kitagawa
-   - هانس هيرمان وير
-   - Ди Чента, Мануэла
- - source_sentence: يورى جافريلوف
-   sentences:
-   - Wiktor Pinczuk
-   - Natalia Germanovna DIRKS
-   - Світлана Євгенівна Савицька
- - source_sentence: Џуди Колинс
-   sentences:
-   - Collins
-   - Aisha Muhammed Abdul Salam
-   - Phonic Boy On Dope
- - source_sentence: ויליאם בלייר
-   sentences:
-   - The Hon. Mr Justice Blair
-   - Queen Ingrid of Denmark
-   - Herman van Rompuy
- - source_sentence: Saif al-Arab GADAFI
-   sentences:
-   - Максім Недасекаў
-   - Mervyn Allister King
-   - Paul d. scully-power
- pipeline_tag: sentence-similarity
- library_name: sentence-transformers
- metrics:
- - cosine_accuracy
- - cosine_accuracy_threshold
- - cosine_f1
- - cosine_f1_threshold
- - cosine_precision
- - cosine_recall
- - cosine_ap
- - cosine_mcc
- model-index:
- - name: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2-address-matcher-original
-   results:
-   - task:
-       type: binary-classification
-       name: Binary Classification
-     dataset:
-       name: sentence transformers paraphrase multilingual MiniLM L12 v2
-       type: sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2
-     metrics:
-     - type: cosine_accuracy
-       value: 0.9893740847820374
-       name: Cosine Accuracy
-     - type: cosine_accuracy_threshold
-       value: 0.7209540009498596
-       name: Cosine Accuracy Threshold
-     - type: cosine_f1
-       value: 0.9838035826704058
-       name: Cosine F1
-     - type: cosine_f1_threshold
-       value: 0.7209540009498596
-       name: Cosine F1 Threshold
-     - type: cosine_precision
-       value: 0.9822857142857143
-       name: Cosine Precision
-     - type: cosine_recall
-       value: 0.9853261492605755
-       name: Cosine Recall
-     - type: cosine_ap
-       value: 0.997357375070481
-       name: Cosine Ap
-     - type: cosine_mcc
-       value: 0.9758996171607873
-       name: Cosine Mcc
- ---
-
- # sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2-address-matcher-original
-
- This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2). It maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
-
- ## Model Details
-
- ### Model Description
- - **Model Type:** Sentence Transformer
- - **Base model:** [sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) <!-- at revision 86741b4e3f5cb7765a600d3a3d55a0f6a6cb443d -->
- - **Maximum Sequence Length:** 128 tokens
- - **Output Dimensionality:** 384 dimensions
- - **Similarity Function:** Cosine Similarity
- <!-- - **Training Dataset:** Unknown -->
- - **Language:** en
- - **License:** apache-2.0
-
- ### Model Sources
-
- - **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
- - **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
- - **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
-
- ### Full Model Architecture
-
- ```
- SentenceTransformer(
-   (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel
-   (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
- )
- ```
-
- ## Usage
-
- ### Direct Usage (Sentence Transformers)
-
- First install the Sentence Transformers library:
-
- ```bash
- pip install -U sentence-transformers
- ```
-
- Then you can load this model and run inference.
- ```python
- from sentence_transformers import SentenceTransformer
-
- # Download from the 🤗 Hub
- model = SentenceTransformer("sentence_transformers_model_id")
- # Run inference
- sentences = [
-     'Saif al-Arab GADAFI',
-     'Максім Недасекаў',
-     'Mervyn Allister King',
- ]
- embeddings = model.encode(sentences)
- print(embeddings.shape)
- # [3, 384]
-
- # Get the similarity scores for the embeddings
- similarities = model.similarity(embeddings, embeddings)
- print(similarities.shape)
- # [3, 3]
- ```
-
- <!--
- ### Direct Usage (Transformers)
-
- <details><summary>Click to see the direct usage in Transformers</summary>
-
- </details>
- -->
-
- <!--
- ### Downstream Usage (Sentence Transformers)
-
- You can finetune this model on your own dataset.
-
- <details><summary>Click to expand</summary>
-
- </details>
- -->
-
- <!--
- ### Out-of-Scope Use
-
- *List how the model may foreseeably be misused and address what users ought not to do with the model.*
- -->
-
- ## Evaluation
-
- ### Metrics
-
- #### Binary Classification
-
- * Dataset: `sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2`
- * Evaluated with [<code>BinaryClassificationEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.BinaryClassificationEvaluator)
-
- | Metric                    | Value      |
- |:--------------------------|:-----------|
- | cosine_accuracy           | 0.9894     |
- | cosine_accuracy_threshold | 0.721      |
- | cosine_f1                 | 0.9838     |
- | cosine_f1_threshold       | 0.721      |
- | cosine_precision          | 0.9823     |
- | cosine_recall             | 0.9853     |
- | **cosine_ap**             | **0.9974** |
- | cosine_mcc                | 0.9759     |
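The thresholds above are the operating points that `BinaryClassificationEvaluator` found optimal on the evaluation split. A minimal sketch of applying the reported `cosine_f1_threshold` (≈0.721) to classify one pair — the model id is the same placeholder used in the Usage section:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence_transformers_model_id")  # placeholder id
embeddings = model.encode(["ג'ק וייט", "Jack White"])
score = model.similarity(embeddings, embeddings)[0, 1].item()
is_match = score >= 0.721  # cosine_f1_threshold from the table above
```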
-
- <!--
- ## Bias, Risks and Limitations
-
- *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
- -->
-
- <!--
- ### Recommendations
-
- *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
- -->
-
- ## Training Details
-
- ### Training Dataset
-
- #### Unnamed Dataset
-
- * Size: 2,130,620 training samples
- * Columns: <code>sentence1</code>, <code>sentence2</code>, and <code>label</code>
- * Approximate statistics based on the first 1000 samples:
-   |         | sentence1                                                                        | sentence2                                                                        | label                                                          |
-   |:--------|:---------------------------------------------------------------------------------|:---------------------------------------------------------------------------------|:-----------------------------------------------------------------|
-   | type    | string                                                                           | string                                                                           | float                                                          |
-   | details | <ul><li>min: 3 tokens</li><li>mean: 9.28 tokens</li><li>max: 57 tokens</li></ul> | <ul><li>min: 3 tokens</li><li>mean: 9.11 tokens</li><li>max: 65 tokens</li></ul> | <ul><li>min: 0.0</li><li>mean: 0.34</li><li>max: 1.0</li></ul> |
- * Samples:
-   | sentence1                   | sentence2                      | label            |
-   |:----------------------------|:-------------------------------|:-----------------|
-   | <code>ג'ק וייט</code>       | <code>Jack White</code>        | <code>1.0</code> |
-   | <code>Абдуллоҳ Гул</code>   | <code>Савицкая Светлана</code> | <code>0.0</code> |
-   | <code>ショーン・ジャスティン・ペン</code> | <code>شان پن</code>            | <code>1.0</code> |
- * Loss: [<code>ContrastiveLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#contrastiveloss) with these parameters:
-   ```json
-   {
-       "distance_metric": "SiameseDistanceMetric.COSINE_DISTANCE",
-       "margin": 0.5,
-       "size_average": true
-   }
-   ```
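A minimal sketch of instantiating this loss in Sentence Transformers with the parameters listed above (the base model id stands in for whichever model is being fine-tuned):

```python
from sentence_transformers import SentenceTransformer, losses

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
loss = losses.ContrastiveLoss(
    model=model,
    distance_metric=losses.SiameseDistanceMetric.COSINE_DISTANCE,
    margin=0.5,          # pairs closer than this margin stop contributing when label is 0
    size_average=True,   # average rather than sum the per-pair losses
)
```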
-
- ### Evaluation Dataset
-
- #### Unnamed Dataset
-
- * Size: 266,328 evaluation samples
- * Columns: <code>sentence1</code>, <code>sentence2</code>, and <code>label</code>
- * Approximate statistics based on the first 1000 samples:
-   |         | sentence1                                                                        | sentence2                                                                        | label                                                          |
-   |:--------|:---------------------------------------------------------------------------------|:---------------------------------------------------------------------------------|:-----------------------------------------------------------------|
-   | type    | string                                                                           | string                                                                           | float                                                          |
-   | details | <ul><li>min: 3 tokens</li><li>mean: 9.27 tokens</li><li>max: 79 tokens</li></ul> | <ul><li>min: 3 tokens</li><li>mean: 8.99 tokens</li><li>max: 61 tokens</li></ul> | <ul><li>min: 0.0</li><li>mean: 0.32</li><li>max: 1.0</li></ul> |
- * Samples:
-   | sentence1                                    | sentence2                                      | label            |
-   |:---------------------------------------------|:-----------------------------------------------|:-----------------|
-   | <code>Анатолий Николаевич Герасимов</code>   | <code>Anatoli Nikolajewitsch Gerassimow</code> | <code>1.0</code> |
-   | <code>Igor Stanislavovitsj Prokopenko</code> | <code>Angelo Lauricella</code>                 | <code>0.0</code> |
-   | <code>Кофе, Линда</code>                     | <code>Святлана Яўгенаўна Савіцкая</code>       | <code>0.0</code> |
- * Loss: [<code>ContrastiveLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#contrastiveloss) with these parameters:
-   ```json
-   {
-       "distance_metric": "SiameseDistanceMetric.COSINE_DISTANCE",
-       "margin": 0.5,
-       "size_average": true
-   }
-   ```
-
- ### Training Hyperparameters
- #### Non-Default Hyperparameters
-
- - `eval_strategy`: steps
- - `per_device_train_batch_size`: 5000
- - `per_device_eval_batch_size`: 5000
- - `gradient_accumulation_steps`: 4
- - `weight_decay`: 0.02
- - `num_train_epochs`: 10
- - `warmup_ratio`: 0.1
- - `fp16`: True
- - `load_best_model_at_end`: True
- - `optim`: adafactor
- - `gradient_checkpointing`: True
-
- #### All Hyperparameters
- <details><summary>Click to expand</summary>
-
- - `overwrite_output_dir`: False
- - `do_predict`: False
- - `eval_strategy`: steps
- - `prediction_loss_only`: True
- - `per_device_train_batch_size`: 5000
- - `per_device_eval_batch_size`: 5000
- - `per_gpu_train_batch_size`: None
- - `per_gpu_eval_batch_size`: None
- - `gradient_accumulation_steps`: 4
- - `eval_accumulation_steps`: None
- - `torch_empty_cache_steps`: None
- - `learning_rate`: 5e-05
- - `weight_decay`: 0.02
- - `adam_beta1`: 0.9
- - `adam_beta2`: 0.999
- - `adam_epsilon`: 1e-08
- - `max_grad_norm`: 1.0
- - `num_train_epochs`: 10
- - `max_steps`: -1
- - `lr_scheduler_type`: linear
- - `lr_scheduler_kwargs`: {}
- - `warmup_ratio`: 0.1
- - `warmup_steps`: 0
- - `log_level`: passive
- - `log_level_replica`: warning
- - `log_on_each_node`: True
- - `logging_nan_inf_filter`: True
- - `save_safetensors`: True
- - `save_on_each_node`: False
- - `save_only_model`: False
- - `restore_callback_states_from_checkpoint`: False
- - `no_cuda`: False
- - `use_cpu`: False
- - `use_mps_device`: False
- - `seed`: 42
- - `data_seed`: None
- - `jit_mode_eval`: False
- - `use_ipex`: False
- - `bf16`: False
- - `fp16`: True
- - `fp16_opt_level`: O1
- - `half_precision_backend`: auto
- - `bf16_full_eval`: False
- - `fp16_full_eval`: False
- - `tf32`: None
- - `local_rank`: 0
- - `ddp_backend`: None
- - `tpu_num_cores`: None
- - `tpu_metrics_debug`: False
- - `debug`: []
- - `dataloader_drop_last`: False
- - `dataloader_num_workers`: 0
- - `dataloader_prefetch_factor`: None
- - `past_index`: -1
- - `disable_tqdm`: False
- - `remove_unused_columns`: True
- - `label_names`: None
- - `load_best_model_at_end`: True
- - `ignore_data_skip`: False
- - `fsdp`: []
- - `fsdp_min_num_params`: 0
- - `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- - `tp_size`: 0
- - `fsdp_transformer_layer_cls_to_wrap`: None
- - `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- - `deepspeed`: None
- - `label_smoothing_factor`: 0.0
- - `optim`: adafactor
- - `optim_args`: None
- - `adafactor`: False
- - `group_by_length`: False
- - `length_column_name`: length
- - `ddp_find_unused_parameters`: None
- - `ddp_bucket_cap_mb`: None
- - `ddp_broadcast_buffers`: False
- - `dataloader_pin_memory`: True
- - `dataloader_persistent_workers`: False
- - `skip_memory_metrics`: True
- - `use_legacy_prediction_loop`: False
- - `push_to_hub`: False
- - `resume_from_checkpoint`: None
- - `hub_model_id`: None
- - `hub_strategy`: every_save
- - `hub_private_repo`: None
- - `hub_always_push`: False
- - `gradient_checkpointing`: True
- - `gradient_checkpointing_kwargs`: None
- - `include_inputs_for_metrics`: False
- - `include_for_metrics`: []
- - `eval_do_concat_batches`: True
- - `fp16_backend`: auto
- - `push_to_hub_model_id`: None
- - `push_to_hub_organization`: None
- - `mp_parameters`:
- - `auto_find_batch_size`: False
- - `full_determinism`: False
- - `torchdynamo`: None
- - `ray_scope`: last
- - `ddp_timeout`: 1800
- - `torch_compile`: False
- - `torch_compile_backend`: None
- - `torch_compile_mode`: None
- - `include_tokens_per_second`: False
- - `include_num_input_tokens_seen`: False
- - `neftune_noise_alpha`: None
- - `optim_target_modules`: None
- - `batch_eval_metrics`: False
- - `eval_on_start`: False
- - `use_liger_kernel`: False
- - `eval_use_gather_object`: False
- - `average_tokens_across_devices`: False
- - `prompts`: None
- - `batch_sampler`: batch_sampler
- - `multi_dataset_batch_sampler`: proportional
-
- </details>
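A minimal sketch of how the non-default hyperparameters above translate into `SentenceTransformerTrainingArguments` (the `output_dir` is a placeholder; everything else mirrors the listed values):

```python
from sentence_transformers import SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    output_dir="output",  # placeholder
    eval_strategy="steps",
    per_device_train_batch_size=5000,
    per_device_eval_batch_size=5000,
    gradient_accumulation_steps=4,
    weight_decay=0.02,
    num_train_epochs=10,
    warmup_ratio=0.1,
    fp16=True,
    load_best_model_at_end=True,
    optim="adafactor",
    gradient_checkpointing=True,
)
```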
-
- ### Training Logs
- | Epoch  | Step | Training Loss | Validation Loss | sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap |
- |:------:|:----:|:-------------:|:---------------:|:----------------------------------------------------------------------:|
- | -1     | -1   | -             | -               | 0.7195                                                                 |
- | 0.9368 | 100  | -             | 0.0083          | 0.9597                                                                 |
- | 1.8712 | 200  | -             | 0.0043          | 0.9877                                                                 |
- | 2.8056 | 300  | -             | 0.0028          | 0.9936                                                                 |
- | 3.7400 | 400  | -             | 0.0021          | 0.9954                                                                 |
- | 4.6745 | 500  | 0.0224        | 0.0016          | 0.9964                                                                 |
- | 5.6089 | 600  | -             | 0.0015          | 0.9970                                                                 |
- | 6.5433 | 700  | -             | 0.0014          | 0.9974                                                                 |
-
-
- ### Framework Versions
- - Python: 3.12.9
- - Sentence Transformers: 3.4.1
- - Transformers: 4.51.3
- - PyTorch: 2.7.0+cu126
- - Accelerate: 1.6.0
- - Datasets: 3.6.0
- - Tokenizers: 0.21.1
-
- ## Citation
-
- ### BibTeX
-
- #### Sentence Transformers
- ```bibtex
- @inproceedings{reimers-2019-sentence-bert,
-     title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
-     author = "Reimers, Nils and Gurevych, Iryna",
-     booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
-     month = "11",
-     year = "2019",
-     publisher = "Association for Computational Linguistics",
-     url = "https://arxiv.org/abs/1908.10084",
- }
- ```
-
- #### ContrastiveLoss
- ```bibtex
- @inproceedings{hadsell2006dimensionality,
-     author={Hadsell, R. and Chopra, S. and LeCun, Y.},
-     booktitle={2006 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR'06)},
-     title={Dimensionality Reduction by Learning an Invariant Mapping},
-     year={2006},
-     volume={2},
-     number={},
-     pages={1735-1742},
-     doi={10.1109/CVPR.2006.100}
- }
- ```
-
- <!--
- ## Glossary
-
- *Clearly define terms in order to be accessible across audiences.*
- -->
-
- <!--
- ## Model Card Authors
-
- *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
- -->
-
- <!--
- ## Model Card Contact
-
- *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
- -->
checkpoint-700/config.json DELETED
@@ -1,25 +0,0 @@
- {
-   "architectures": [
-     "BertModel"
-   ],
-   "attention_probs_dropout_prob": 0.1,
-   "classifier_dropout": null,
-   "gradient_checkpointing": false,
-   "hidden_act": "gelu",
-   "hidden_dropout_prob": 0.1,
-   "hidden_size": 384,
-   "initializer_range": 0.02,
-   "intermediate_size": 1536,
-   "layer_norm_eps": 1e-12,
-   "max_position_embeddings": 512,
-   "model_type": "bert",
-   "num_attention_heads": 12,
-   "num_hidden_layers": 12,
-   "pad_token_id": 0,
-   "position_embedding_type": "absolute",
-   "torch_dtype": "float32",
-   "transformers_version": "4.51.3",
-   "type_vocab_size": 2,
-   "use_cache": true,
-   "vocab_size": 250037
- }
checkpoint-700/config_sentence_transformers.json DELETED
@@ -1,10 +0,0 @@
- {
-   "__version__": {
-     "sentence_transformers": "3.4.1",
-     "transformers": "4.51.3",
-     "pytorch": "2.7.0+cu126"
-   },
-   "prompts": {},
-   "default_prompt_name": null,
-   "similarity_fn_name": "cosine"
- }
checkpoint-700/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:000c7957828311658198adf8e84fe33c1a660836e1cb7b256504f04b8cc770aa
- size 470637416
checkpoint-700/modules.json DELETED
@@ -1,14 +0,0 @@
- [
-   {
-     "idx": 0,
-     "name": "0",
-     "path": "",
-     "type": "sentence_transformers.models.Transformer"
-   },
-   {
-     "idx": 1,
-     "name": "1",
-     "path": "1_Pooling",
-     "type": "sentence_transformers.models.Pooling"
-   }
- ]
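modules.json describes the two-stage pipeline: a Transformer module at the repository root feeding a Pooling module stored in `1_Pooling/`. A minimal sketch of assembling the same pipeline by hand (the base model id is an assumption; the deleted checkpoint stored its own fine-tuned weights):

```python
from sentence_transformers import SentenceTransformer, models

transformer = models.Transformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",  # assumed base model
    max_seq_length=128,
)
pooling = models.Pooling(
    transformer.get_word_embedding_dimension(),  # 384
    pooling_mode="mean",
)
model = SentenceTransformer(modules=[transformer, pooling])
```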
checkpoint-700/optimizer.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:e4d52b430164992014c645f921eb89cb4f11af746bb4925e980955f54650d62b
- size 1715019
checkpoint-700/rng_state.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:2d56fff20bbc2f130ed2e293f289ea71c316a57e902789c67ac719e6a30c1b4e
- size 14645
checkpoint-700/scaler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:850c3d909f8a0af6f9b431fac5a25833ab1658c39f899825e3b347b6af8a490b
- size 1383
checkpoint-700/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:d4c2cfbe9b0a118af0de30855464c364252cb3147a7ab4ad3d16c608263feebb
- size 1465
checkpoint-700/sentence_bert_config.json DELETED
@@ -1,4 +0,0 @@
- {
-   "max_seq_length": 128,
-   "do_lower_case": false
- }
checkpoint-700/special_tokens_map.json DELETED
@@ -1,51 +0,0 @@
- {
-   "bos_token": {
-     "content": "<s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "cls_token": {
-     "content": "<s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "eos_token": {
-     "content": "</s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "mask_token": {
-     "content": "<mask>",
-     "lstrip": true,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "pad_token": {
-     "content": "<pad>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "sep_token": {
-     "content": "</s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "unk_token": {
-     "content": "<unk>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   }
- }
checkpoint-700/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:cad551d5600a84242d0973327029452a1e3672ba6313c2a3c3d69c4310e12719
- size 17082987
checkpoint-700/tokenizer_config.json DELETED
@@ -1,65 +0,0 @@
- {
-   "added_tokens_decoder": {
-     "0": {
-       "content": "<s>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "1": {
-       "content": "<pad>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "2": {
-       "content": "</s>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "3": {
-       "content": "<unk>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "250001": {
-       "content": "<mask>",
-       "lstrip": true,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     }
-   },
-   "bos_token": "<s>",
-   "clean_up_tokenization_spaces": false,
-   "cls_token": "<s>",
-   "do_lower_case": true,
-   "eos_token": "</s>",
-   "extra_special_tokens": {},
-   "mask_token": "<mask>",
-   "max_length": 128,
-   "model_max_length": 128,
-   "pad_to_multiple_of": null,
-   "pad_token": "<pad>",
-   "pad_token_type_id": 0,
-   "padding_side": "right",
-   "sep_token": "</s>",
-   "stride": 0,
-   "strip_accents": null,
-   "tokenize_chinese_chars": true,
-   "tokenizer_class": "BertTokenizer",
-   "truncation_side": "right",
-   "truncation_strategy": "longest_first",
-   "unk_token": "<unk>"
- }
checkpoint-700/trainer_state.json DELETED
@@ -1,162 +0,0 @@
- {
-   "best_global_step": 700,
-   "best_metric": 0.0013776659034192562,
-   "best_model_checkpoint": "data/fine-tuned-sbert-sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2-original-adafactor/checkpoint-700",
-   "epoch": 6.543325526932084,
-   "eval_steps": 100,
-   "global_step": 700,
-   "is_hyper_param_search": false,
-   "is_local_process_zero": true,
-   "is_world_process_zero": true,
-   "log_history": [
-     {
-       "epoch": 0.936768149882904,
-       "eval_loss": 0.008251233026385307,
-       "eval_runtime": 117.4457,
-       "eval_samples_per_second": 2267.669,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9330529793864755,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.6639679670333862,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9596591982248662,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.8990018609372358,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.6536919474601746,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.8488676021429209,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.8846836847946726,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.913791126905881,
-       "eval_steps_per_second": 0.46,
-       "step": 100
-     },
-     {
-       "epoch": 1.8711943793911008,
-       "eval_loss": 0.004326523281633854,
-       "eval_runtime": 118.308,
-       "eval_samples_per_second": 2251.141,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9683099913640971,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.6799858808517456,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.987669070948898,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9520018198362147,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.6799858808517456,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9284143244509058,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9445886468795847,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9595322710076808,
-       "eval_steps_per_second": 0.456,
-       "step": 200
-     },
-     {
-       "epoch": 2.8056206088992974,
-       "eval_loss": 0.002782753435894847,
-       "eval_runtime": 117.8399,
-       "eval_samples_per_second": 2260.083,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9790110013892539,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7040826678276062,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9935758649482886,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9680662667809197,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.7029732465744019,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9524469797852624,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9648143930767479,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9713401352745615,
-       "eval_steps_per_second": 0.458,
-       "step": 300
-     },
-     {
-       "epoch": 3.740046838407494,
-       "eval_loss": 0.0020659712608903646,
-       "eval_runtime": 116.8077,
-       "eval_samples_per_second": 2280.056,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9837419742424811,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7114190459251404,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9954100421733855,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.975348704810703,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.6966520547866821,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.963270232791414,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9687853426826509,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9820016049524246,
-       "eval_steps_per_second": 0.462,
-       "step": 400
-     },
-     {
-       "epoch": 4.674473067915691,
-       "grad_norm": 0.07067500799894333,
-       "learning_rate": 2.9402515723270442e-05,
-       "loss": 0.0224,
-       "step": 500
-     },
-     {
-       "epoch": 4.674473067915691,
-       "eval_loss": 0.0016409169184044003,
-       "eval_runtime": 117.7739,
-       "eval_samples_per_second": 2261.35,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.986370292494274,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7391290664672852,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.996439193909599,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9792820044518008,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.7391290664672852,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9691467317957321,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.975107979086156,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9834919179181474,
-       "eval_steps_per_second": 0.459,
-       "step": 500
-     },
-     {
-       "epoch": 5.608899297423887,
-       "eval_loss": 0.0014551315689459443,
-       "eval_runtime": 117.5801,
-       "eval_samples_per_second": 2265.077,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9884729470957083,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7460525035858154,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.9969945004512654,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9824360661365067,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.7435637712478638,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9738614226726382,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9805847418912745,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9842943941304597,
-       "eval_steps_per_second": 0.459,
-       "step": 600
-     },
-     {
-       "epoch": 6.543325526932084,
-       "eval_loss": 0.0013776659034192562,
-       "eval_runtime": 117.6764,
-       "eval_samples_per_second": 2263.223,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy": 0.9893740847820374,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_accuracy_threshold": 0.7209540009498596,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_ap": 0.997357375070481,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1": 0.9838035826704058,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_f1_threshold": 0.7209540009498596,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_mcc": 0.9758996171607873,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_precision": 0.9822857142857143,
-       "eval_sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2_cosine_recall": 0.9853261492605755,
-       "eval_steps_per_second": 0.459,
-       "step": 700
-     }
-   ],
-   "logging_steps": 500,
-   "max_steps": 1060,
-   "num_input_tokens_seen": 0,
-   "num_train_epochs": 10,
-   "save_steps": 100,
-   "stateful_callbacks": {
-     "EarlyStoppingCallback": {
-       "args": {
-         "early_stopping_patience": 2,
-         "early_stopping_threshold": 0.0
-       },
-       "attributes": {
-         "early_stopping_patience_counter": 0
-       }
-     },
-     "TrainerControl": {
-       "args": {
-         "should_epoch_stop": false,
-         "should_evaluate": false,
-         "should_log": false,
-         "should_save": true,
-         "should_training_stop": false
-       },
-       "attributes": {}
-     }
-   },
-   "total_flos": 0.0,
-   "train_batch_size": 5000,
-   "trial_name": null,
-   "trial_params": null
- }
checkpoint-700/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:319aaa354e613c6db82c6bb78290f3da04198ef2c7a75b61b314fa305ed33c45
- size 6033