Update README.md

a34b1b7 verified 2 months ago

5.86 kB

	---
	license: apache-2.0
	---

	How to use the model

	To use the model with the `transformers` package, see the example below:
	```python
	from transformers import AutoModelForCausalLM, AutoTokenizer
	import torch

	model_name = "Ihor/OpenBioLLM-Text2Graph-8B"

	tokenizer = AutoTokenizer.from_pretrained(model_name)
	# Llama-3-style chat template. The special tokens and the `trim` filter must
	# use plain pipes (`<|...|>`, `| trim`): a backslash in front of a pipe would
	# be kept by Python and end up inside the Jinja source, corrupting the
	# special-token strings and making the template fail to parse.
	tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|end_of_text|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"

	model = AutoModelForCausalLM.from_pretrained(
	    model_name,
	    device_map="auto",
	    torch_dtype=torch.bfloat16,
	)

	# Conversation sent to the model: a system prompt that defines the JSON
	# output schema, followed by the user text to annotate.
	MESSAGES = [
	    {
	        "role": "system",
	        "content": (
	            "You are an advanced assistant trained to process biomedical text for Named Entity Recognition (NER) and Relation Extraction (RE). "
	            "Your task is to analyze user-provided text, identify all unique and contextually relevant entities, and infer directed relationships "
	            "between these entities based on the context. Ensure that all relations exist only between annotated entities. "
	            "Entities and relationships should be human-readable and natural, reflecting real-world concepts and connections. "
	            "Output the annotated data in JSON format, structured as follows:\n\n"
	            """{"entities": [{"id": 0, "text": "ner_string_0", "type": "ner_type_string_0"}, {"id": 1, "text": "ner_string_1", "type": "ner_type_string_1"}], "relations": [{"head": 0, "tail": 1, "type": "re_type_string_0"}]}"""
	            "\n\nEnsure that the output captures all significant entities and their directed relationships in a clear and concise manner."
	        ),
	    },
	    {
	        "role": "user",
	        "content": (
	            'Here is a text input: "Subjects will receive a 100mL dose of IV saline every 6 hours for 24 hours. The first dose will be administered prior to anesthesia induction, approximately 30 minutes before skin incision. A total of 4 doses will be given." '
	            "Analyze this text, select and classify the entities, and extract their relationships as per your instructions."
	        ),
	    },
	]

	# Render the conversation to a single prompt string, ending with the
	# assistant header so that the model starts its answer.
	chat_prompt = tokenizer.apply_chat_template(
	    MESSAGES, tokenize=False, add_generation_prompt=True
	)

	# Tokenize. The chat template already prepends the BOS token, so special
	# tokens must not be added a second time here.
	inputs = tokenizer(
	    chat_prompt, return_tensors="pt", add_special_tokens=False
	).to(model.device)

	# Generate. Sampling is enabled, so the output can differ between runs;
	# pass do_sample=False instead for reproducible (greedy) decoding.
	outputs = model.generate(
	    **inputs,
	    max_new_tokens=3000,
	    do_sample=True,
	    eos_token_id=tokenizer.eos_token_id,
	    pad_token_id=tokenizer.eos_token_id,
	    return_dict_in_generate=True,
	)

	# Decode ONLY the new tokens (skip the prompt tokens)
	prompt_len = inputs["input_ids"].shape[-1]
	generated_ids = outputs.sequences[0][prompt_len:]
	response = tokenizer.decode(generated_ids, skip_special_tokens=True)
	print(response)
	```

	To use the model with the `vllm` package, see the example below:
	```python
	# !pip install vllm

	from vllm import LLM, SamplingParams
	from transformers import AutoTokenizer

	MODEL_ID = "Ihor/OpenBioLLM-Text2Graph-8B"

	tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
	# Llama-3-style chat template. The special tokens and the `trim` filter must
	# use plain pipes (`<|...|>`, `| trim`): a backslash in front of a pipe would
	# be kept by Python and end up inside the Jinja source, corrupting the
	# special-token strings and making the template fail to parse.
	tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|end_of_text|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"

	llm = LLM(model=MODEL_ID)

	# Greedy decoding (temperature 0) with a fixed seed for reproducible output.
	# `best_of` is not set: it defaults to `n` and is deprecated in vLLM's V1
	# engine.
	sampling_params = SamplingParams(
	    max_tokens=3000,
	    n=1,
	    presence_penalty=0.0,
	    frequency_penalty=0.0,
	    repetition_penalty=1.0,
	    temperature=0.0,
	    top_p=1.0,
	    top_k=-1,
	    min_p=0.0,
	    seed=42,
	)

	# Conversation sent to the model: a system prompt that defines the JSON
	# output schema, followed by the user text to annotate.
	MESSAGES = [
	    {
	        "role": "system",
	        "content": (
	            "You are an advanced assistant trained to process biomedical text for Named Entity Recognition (NER) and Relation Extraction (RE). "
	            "Your task is to analyze user-provided text, identify all unique and contextually relevant entities, and infer directed relationships "
	            "between these entities based on the context. Ensure that all relations exist only between annotated entities. "
	            "Entities and relationships should be human-readable and natural, reflecting real-world concepts and connections. "
	            "Output the annotated data in JSON format, structured as follows:\n\n"
	            """{"entities": [{"id": 0, "text": "ner_string_0", "type": "ner_type_string_0"}, {"id": 1, "text": "ner_string_1", "type": "ner_type_string_1"}], "relations": [{"head": 0, "tail": 1, "type": "re_type_string_0"}]}"""
	            "\n\nEnsure that the output captures all significant entities and their directed relationships in a clear and concise manner."
	        ),
	    },
	    {
	        "role": "user",
	        "content": (
	            'Here is a text input: "Subjects will receive a 100mL dose of IV saline every 6 hours for 24 hours. The first dose will be administered prior to anesthesia induction, approximately 30 minutes before skin incision. A total of 4 doses will be given." '
	            "Analyze this text, select and classify the entities, and extract their relationships as per your instructions."
	        ),
	    },
	]

	# Render the conversation to a single prompt string, ending with the
	# assistant header so that the model starts its answer.
	chat_prompt = tokenizer.apply_chat_template(
	    MESSAGES,
	    tokenize=False,
	    add_generation_prompt=True,
	)

	# The chat template already prepends the BOS token. Tokenize explicitly
	# without special tokens and hand the token ids to vLLM, so the prompt
	# contains exactly one BOS regardless of how vLLM tokenizes text prompts.
	prompt_token_ids = tokenizer(chat_prompt, add_special_tokens=False)["input_ids"]

	outputs = llm.generate([{"prompt_token_ids": prompt_token_ids}], sampling_params)
	response_text = outputs[0].outputs[0].text
	print(response_text)
	```