---
license: apache-2.0
---
**How to use the model**
To use the model with the `transformers` package, see the example below:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model_name = "Ihor/OpenBioLLM-Text2Graph-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Llama-3-style chat template: wraps each message in role headers and
# appends the assistant header when a generation prompt is requested
tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|end_of_text|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
MESSAGES = [
    {
        "role": "system",
        "content": (
            "You are an advanced assistant trained to process biomedical text for Named Entity Recognition (NER) and Relation Extraction (RE). "
            "Your task is to analyze user-provided text, identify all unique and contextually relevant entities, and infer directed relationships "
            "between these entities based on the context. Ensure that all relations exist only between annotated entities. "
            "Entities and relationships should be human-readable and natural, reflecting real-world concepts and connections. "
            "Output the annotated data in JSON format, structured as follows:\n\n"
            """{"entities": [{"id": 0, "text": "ner_string_0", "type": "ner_type_string_0"}, {"id": 1, "text": "ner_string_1", "type": "ner_type_string_1"}], "relations": [{"head": 0, "tail": 1, "type": "re_type_string_0"}]}"""
            "\n\nEnsure that the output captures all significant entities and their directed relationships in a clear and concise manner."
        ),
    },
    {
        "role": "user",
        "content": (
            'Here is a text input: "Subjects will receive a 100mL dose of IV saline every 6 hours for 24 hours. The first dose will be administered prior to anesthesia induction, approximately 30 minutes before skin incision. A total of 4 doses will be given." '
            "Analyze this text, select and classify the entities, and extract their relationships as per your instructions."
        ),
    },
]
# Build prompt text
chat_prompt = tokenizer.apply_chat_template(
    MESSAGES, tokenize=False, add_generation_prompt=True
)
# Tokenize
inputs = tokenizer(chat_prompt, return_tensors="pt").to(model.device)
# Generate
outputs = model.generate(
    **inputs,
    max_new_tokens=3000,
    do_sample=True,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    return_dict_in_generate=True,
)
# Decode ONLY the new tokens (skip the prompt tokens)
prompt_len = inputs["input_ids"].shape[-1]
generated_ids = outputs.sequences[0][prompt_len:]
response = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(response)
```
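The model replies with the JSON structure described in the system prompt. The sketch below is one way to turn that reply into Python objects; it is an illustration rather than part of the model's API, and it assumes the reply contains a single JSON object, possibly surrounded by stray text:
```python
import json
import re

def parse_graph(response: str) -> dict:
    """Pull the {"entities": [...], "relations": [...]} object out of the reply."""
    match = re.search(r"\{.*\}", response, re.DOTALL)  # outermost {...} span
    if match is None:
        return {"entities": [], "relations": []}
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return {"entities": [], "relations": []}

graph = parse_graph(response)
# Relations reference entities by id, so build a lookup table first
entities_by_id = {ent["id"]: ent for ent in graph["entities"]}
for rel in graph["relations"]:
    head = entities_by_id[rel["head"]]["text"]
    tail = entities_by_id[rel["tail"]]["text"]
    print(f"{head} --[{rel['type']}]--> {tail}")
```
The lookup table keeps the printout robust even if entity ids do not match their list positions.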
To use the model with the `vllm` package, see the example below:
```python
# !pip install vllm
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
MODEL_ID = "Ihor/OpenBioLLM-Text2Graph-8B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
# Same Llama-3-style chat template as in the transformers example above
tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|end_of_text|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
llm = LLM(model=MODEL_ID)

# Greedy decoding (temperature=0.0) with a fixed seed for reproducible output
sampling_params = SamplingParams(
    max_tokens=3000,
    n=1,
    best_of=1,
    presence_penalty=0.0,
    frequency_penalty=0.0,
    repetition_penalty=1.0,
    temperature=0.0,
    top_p=1.0,
    top_k=-1,
    min_p=0.0,
    seed=42,
)
MESSAGES = [
    {
        "role": "system",
        "content": (
            "You are an advanced assistant trained to process biomedical text for Named Entity Recognition (NER) and Relation Extraction (RE). "
            "Your task is to analyze user-provided text, identify all unique and contextually relevant entities, and infer directed relationships "
            "between these entities based on the context. Ensure that all relations exist only between annotated entities. "
            "Entities and relationships should be human-readable and natural, reflecting real-world concepts and connections. "
            "Output the annotated data in JSON format, structured as follows:\n\n"
            """{"entities": [{"id": 0, "text": "ner_string_0", "type": "ner_type_string_0"}, {"id": 1, "text": "ner_string_1", "type": "ner_type_string_1"}], "relations": [{"head": 0, "tail": 1, "type": "re_type_string_0"}]}"""
            "\n\nEnsure that the output captures all significant entities and their directed relationships in a clear and concise manner."
        ),
    },
    {
        "role": "user",
        "content": (
            'Here is a text input: "Subjects will receive a 100mL dose of IV saline every 6 hours for 24 hours. The first dose will be administered prior to anesthesia induction, approximately 30 minutes before skin incision. A total of 4 doses will be given." '
            "Analyze this text, select and classify the entities, and extract their relationships as per your instructions."
        ),
    },
]
chat_prompt = tokenizer.apply_chat_template(
    MESSAGES,
    tokenize=False,
    add_generation_prompt=True,
    add_special_tokens=False,  # the template already prepends bos_token
)
# vLLM accepts a batch of prompts; a single prompt still goes in as a list
outputs = llm.generate([chat_prompt], sampling_params)
response_text = outputs[0].outputs[0].text
print(response_text)
```
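Since relations point between entity ids, the extracted annotation maps directly onto a directed graph. As a purely illustrative sketch (`networkx` is not a dependency of the model), the parsed output can be loaded like this:
```python
# !pip install networkx
import json
import networkx as nx

# Assumes the reply is the bare JSON object from the system prompt;
# for noisier replies, reuse the parse_graph helper shown earlier.
graph_data = json.loads(response_text)

g = nx.DiGraph()
for ent in graph_data["entities"]:
    g.add_node(ent["id"], text=ent["text"], type=ent["type"])
for rel in graph_data["relations"]:
    g.add_edge(rel["head"], rel["tail"], type=rel["type"])

print(f"{g.number_of_nodes()} entities, {g.number_of_edges()} relations")
```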