#!/usr/bin/env python3
"""
Basic inference example for Isaac Sim Robotics Qwen model.
This script demonstrates how to load and use the fine-tuned model
for Isaac Sim robotics queries.
"""
import argparse
import sys

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
def load_model(model_path, device="auto", load_in_8bit=False):
    """
    Load the Isaac Sim Robotics Qwen model.

    Args:
        model_path (str): Path to the model (local or HuggingFace hub)
        device (str): Device to load model on ("auto", "cpu", "cuda")
        load_in_8bit (bool): Whether to use 8-bit quantization

    Returns:
        tuple: (model, tokenizer)
    """
    print(f"Loading model from: {model_path}")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Set pad token if not present
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model
    if load_in_8bit:
        try:
            # 8-bit loading requires the bitsandbytes package
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                load_in_8bit=True,
                device_map=device,
                torch_dtype=torch.float16,
            )
        except ImportError:
            print("8-bit quantization not available. Install bitsandbytes.")
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                device_map=device,
                torch_dtype=torch.float16,
            )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map=device,
            torch_dtype=torch.float16,
        )

    print("Model loaded successfully!")
    return model, tokenizer
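# Illustrative direct use from Python (a sketch; main() below drives the same
# call from the command line):
#     model, tokenizer = load_model(
#         "TomBombadyl/Qwen2.5-Coder-7B-Instruct-Omni1.1", load_in_8bit=True
#     )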
def generate_response(model, tokenizer, query, max_length=1024, temperature=0.7):
    """
    Generate a response using the model.

    Args:
        model: The loaded model
        tokenizer: The loaded tokenizer
        query (str): The input query
        max_length (int): Maximum total length (prompt + generated tokens)
        temperature (float): Sampling temperature

    Returns:
        str: Generated response
    """
    # Format query using the ChatML template expected by Qwen2.5-Coder
    formatted_query = f"<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"

    # Tokenize input
    inputs = tokenizer(formatted_query, return_tensors="pt")

    # Move to the same device as the model
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens; decoding the full sequence with
    # skip_special_tokens=True would strip the "<|im_start|>assistant" marker,
    # so splitting on it would fail and the prompt would leak into the output.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return response
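# Optional alternative to the manual ChatML string above: a minimal sketch that
# relies on the chat template shipped with the tokenizer (Qwen2.5 tokenizers
# provide one). The helper name is ours and is not used elsewhere in this script.
def format_query_with_chat_template(tokenizer, query):
    """Build the prompt via tokenizer.apply_chat_template instead of manual tags."""
    messages = [{"role": "user", "content": query}]
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )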
def main():
    parser = argparse.ArgumentParser(description="Isaac Sim Robotics Qwen Inference")
    parser.add_argument(
        "--model_path",
        type=str,
        default="TomBombadyl/Qwen2.5-Coder-7B-Instruct-Omni1.1",
        help="Path to model (local or HuggingFace hub)",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="auto",
        choices=["auto", "cpu", "cuda"],
        help="Device to use for inference",
    )
    parser.add_argument(
        "--load_8bit",
        action="store_true",
        help="Use 8-bit quantization to reduce memory usage",
    )
    parser.add_argument(
        "--max_length",
        type=int,
        default=1024,
        help="Maximum total generation length in tokens (prompt + response)",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.7,
        help="Sampling temperature",
    )
    parser.add_argument(
        "--query",
        type=str,
        help="Query to ask (if not provided, will use interactive mode)",
    )
    args = parser.parse_args()

    try:
        # Load model
        model, tokenizer = load_model(
            args.model_path,
            device=args.device,
            load_in_8bit=args.load_8bit,
        )

        if args.query:
            # Single query mode
            response = generate_response(
                model, tokenizer, args.query, args.max_length, args.temperature
            )
            print(f"\nQuery: {args.query}")
            print(f"Response:\n{response}")
        else:
            # Interactive mode
            print("\n=== Isaac Sim Robotics Qwen Interactive Mode ===")
            print("Type 'quit' to exit")
            print("Example queries:")
            print("- How do I create a differential drive robot in Isaac Sim?")
            print("- How to add a depth camera to my robot?")
            print("- What physics parameters should I use for a manipulator?")
            print()

            while True:
                try:
                    query = input("Enter your Isaac Sim question: ").strip()
                    if query.lower() in ["quit", "exit", "q"]:
                        break
                    if not query:
                        continue
                    print("Generating response...")
                    response = generate_response(
                        model, tokenizer, query, args.max_length, args.temperature
                    )
                    print(f"\nResponse:\n{response}\n")
                except KeyboardInterrupt:
                    print("\nExiting...")
                    break
                except Exception as e:
                    print(f"Error generating response: {e}")
    except Exception as e:
        print(f"Error loading model: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()