fix parameter name in eval_mteb (#6)
Browse files
- fix parameter name in eval_mteb (978fec22ab2d2f497d79c9554202a0d9ffd60618)
Co-authored-by: Bing <[email protected]>
- scripts/eval_mteb.py +6 -3
scripts/eval_mteb.py
CHANGED
|
@@ -119,6 +119,7 @@ CMTEB_TASK_LIST = ['TNews', 'IFlyTek', 'MultilingualSentiment', 'JDReview', 'Onl
|
|
| 119 |
'T2Retrieval', 'MMarcoRetrieval', 'DuRetrieval', 'CovidRetrieval', 'CmedqaRetrieval', 'EcomRetrieval', 'MedicalRetrieval', 'VideoRetrieval',
|
| 120 |
'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STSB', 'AFQMC', 'QBQTC', 'STS22']
|
| 121 |
|
|
|
|
| 122 |
MTEB_PL = [
|
| 123 |
"CBD","PolEmo2.0-IN","PolEmo2.0-OUT","AllegroReviews","PAC","MassiveIntentClassification","MassiveScenarioClassification",
|
| 124 |
"SICK-E-PL","PPC","CDSC-E","PSC","8TagsClustering","SICK-R-PL","CDSC-R","STS22",
|
|
@@ -405,6 +406,8 @@ class Wrapper:
|
|
| 405 |
self._target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 406 |
self.eod_id = self.tokenizer.convert_tokens_to_ids("<|endoftext|>")
|
| 407 |
self.instruction = instruction
|
|
|
|
|
|
|
| 408 |
|
| 409 |
if self.tokenizer.padding_side != 'right':
|
| 410 |
logger.warning(f"Change tokenizer.padding_side from {self.tokenizer.padding_side} to right")
|
|
@@ -544,9 +547,9 @@ class Wrapper:
|
|
| 544 |
|
| 545 |
def _tokenize(self, sentences: List[str], is_query: bool):
|
| 546 |
|
| 547 |
-
batch_dict = tokenizer(sentences, max_length=max_seq_len - 1, return_attention_mask=False, padding=False, truncation=True)
|
| 548 |
-
batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
|
| 549 |
-
batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')
|
| 550 |
batch_dict['is_causal'] = False
|
| 551 |
return batch_dict
|
| 552 |
|
|
|
|
| 119 |
'T2Retrieval', 'MMarcoRetrieval', 'DuRetrieval', 'CovidRetrieval', 'CmedqaRetrieval', 'EcomRetrieval', 'MedicalRetrieval', 'VideoRetrieval',
|
| 120 |
'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STSB', 'AFQMC', 'QBQTC', 'STS22']
|
| 121 |
|
| 122 |
+
|
| 123 |
MTEB_PL = [
|
| 124 |
"CBD","PolEmo2.0-IN","PolEmo2.0-OUT","AllegroReviews","PAC","MassiveIntentClassification","MassiveScenarioClassification",
|
| 125 |
"SICK-E-PL","PPC","CDSC-E","PSC","8TagsClustering","SICK-R-PL","CDSC-R","STS22",
|
|
|
|
| 406 |
self._target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 407 |
self.eod_id = self.tokenizer.convert_tokens_to_ids("<|endoftext|>")
|
| 408 |
self.instruction = instruction
|
| 409 |
+
self.default_query = default_query
|
| 410 |
+
self.force_default = force_default
|
| 411 |
|
| 412 |
if self.tokenizer.padding_side != 'right':
|
| 413 |
logger.warning(f"Change tokenizer.padding_side from {self.tokenizer.padding_side} to right")
|
|
|
|
| 547 |
|
| 548 |
def _tokenize(self, sentences: List[str], is_query: bool):
|
| 549 |
|
| 550 |
+
batch_dict = self.tokenizer(sentences, max_length=self.max_seq_len - 1, return_attention_mask=False, padding=False, truncation=True)
|
| 551 |
+
batch_dict['input_ids'] = [input_ids + [self.tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
|
| 552 |
+
batch_dict = self.tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')
|
| 553 |
batch_dict['is_causal'] = False
|
| 554 |
return batch_dict
|
| 555 |
|