!pip install -q transformers
from transformers import RemBertForTokenClassification, RemBertTokenizerFast
from transformers import XLMRobertaForTokenClassification, XLMRobertaTokenizerFast
import torch

main_path = "Misha24-10/MultiCoNER-2-recognition-model"
# Ensemble member 1: XLM-RoBERTa-large checkpoint
model_1 = XLMRobertaForTokenClassification.from_pretrained(main_path,
                                                            subfolder="xlm_roberta_large_mountain")
tokenizer_1 = XLMRobertaTokenizerFast.from_pretrained(main_path,
                                                      subfolder="xlm_roberta_large_mountain")
# Ensemble members 2 and 3: two RemBERT checkpoints
model_2 = RemBertForTokenClassification.from_pretrained(main_path,
                                                        subfolder="google-rembert-ft_for_multi_ner_v3")
tokenizer_2 = RemBertTokenizerFast.from_pretrained(main_path,
                                                   subfolder="google-rembert-ft_for_multi_ner_v3")
model_3 = RemBertForTokenClassification.from_pretrained(main_path,
                                                        subfolder="google-rembert-ft_for_multi_ner_sky")
tokenizer_3 = RemBertTokenizerFast.from_pretrained(main_path,
                                                   subfolder="google-rembert-ft_for_multi_ner_sky")
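# Optional consistency check (an assumption, not part of the original snippet):
# the weighted vote further below sums logits from all three heads, which only
# makes sense if the three checkpoints share the same label mapping.
assert model_1.config.id2label == model_2.config.id2label == model_3.config.id2label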
def compute_last_layer_probs(model, tokenizer, sentence):
    """Return one logit vector per whitespace-separated word of `sentence`."""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device).eval()
    # Tokenize once to find out how many sub-tokens the sentence produces
    number_of_tokens = tokenizer.encode_plus(sentence, return_tensors='pt')['input_ids'].shape[-1]
    list_of_words = sentence.split()
    inputs = tokenizer(list_of_words, is_split_into_words=True, padding='max_length',
                       max_length=min(number_of_tokens, 512), truncation=True, return_tensors="pt")
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    # 1 marks the first sub-token of every word, -100 everything else
    label_ids = torch.Tensor(align_word_ids(inputs.word_ids()))
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    # Keep only the logits of the first sub-token of each word
    return logits[:, label_ids == 1, :]
# Per-model weights for the ensemble vote (all equal by default)
weights = {'model_1': 1, 'model_2': 1, 'model_3': 1}
def align_word_ids(word_ids, return_word_ids=False, label_all_tokens=False):
    """Mask over sub-tokens: 1 for the first sub-token of each word, -100 otherwise."""
    previous_word_idx = None
    label_ids = []
    index_list = []
    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            # Special tokens and padding have no word id
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            # First sub-token of a new word
            label_ids.append(1)
            index_list.append(idx)
        else:
            # Remaining sub-tokens of the same word
            label_ids.append(1 if label_all_tokens else -100)
        previous_word_idx = word_idx
    if return_word_ids:
        return label_ids, index_list
    return label_ids
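# Quick illustration (an assumption, not part of the original snippet): for a
# short sentence the mask holds -100 at special-token positions and 1 at the
# first sub-token of every word.
_enc = tokenizer_1("mount everest is high".split(), is_split_into_words=True, return_tensors="pt")
print(align_word_ids(_enc.word_ids()))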
def weighted_voting(sentence):
    """Ensemble the three models via a weighted sum of their per-word logits."""
    predictions = []
    for idx, (model, tokenizer) in enumerate([(model_1, tokenizer_1), (model_2, tokenizer_2), (model_3, tokenizer_3)]):
        logits = compute_last_layer_probs(model, tokenizer, sentence)
        predictions.append(logits * weights[f'model_{idx + 1}'])
    final_logits = sum(predictions)
    # Pick the highest-scoring label id for every word, then map ids to label names
    final_predictions = torch.argmax(final_logits, dim=2)
    labels = [model_1.config.id2label[i] for i in final_predictions.tolist()[0]]
    return labels
sent_ex = "Elon Musk 's brother sits on the boards of tesla".lower()
weighted_voting(sent_ex)
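# Illustrative follow-up (an assumption, not part of the original snippet):
# weighted_voting returns one label per whitespace-separated word, so the
# predictions can be paired with the input words like this.
for word, label in zip(sent_ex.split(), weighted_voting(sent_ex)):
    print(word, label)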