from typing import Dict, List, Any, Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class EndpointHandler:
    def __init__(self, path: str = ""):
        """
        Initialize the handler for Qwen2.5-Coder-7B-Instruct-Omni1.1.
        Optimized for Isaac Sim robotics code generation.
        """
        logger.info(f"Loading model from {path}")

        # Load tokenizer with proper configuration
        self.tokenizer = AutoTokenizer.from_pretrained(
            path,
            trust_remote_code=True,
            use_fast=False  # Use slow tokenizer to avoid tokenizer.json issues
        )

        # Set pad token if not present
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load model with optimizations for inference
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            attn_implementation="flash_attention_2" if torch.cuda.is_available() else "eager"
        )

        # Set model to evaluation mode
        self.model.eval()

        logger.info("Model loaded successfully")

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Handle inference requests.

        Expected input format:
        {
            "inputs": "Your prompt here",
            "parameters": {
                "max_new_tokens": 512,
                "temperature": 0.7,
                "top_p": 0.9,
                "do_sample": true,
                "repetition_penalty": 1.1
            }
        }
        """
        try:
            # Extract inputs and parameters
            inputs = data.get("inputs", "")
            parameters = data.get("parameters", {})

            # Default generation parameters optimized for code generation
            max_new_tokens = parameters.get("max_new_tokens", 512)
            temperature = parameters.get("temperature", 0.7)
            top_p = parameters.get("top_p", 0.9)
            do_sample = parameters.get("do_sample", True)
            repetition_penalty = parameters.get("repetition_penalty", 1.1)

            # Format input with the Qwen2.5 chat template unless the caller
            # already supplied a formatted prompt. The generation prompt ends
            # with "<|im_start|>assistant\n" so the model continues as the
            # assistant turn.
            if not inputs.startswith("<|im_start|>"):
                formatted_input = f"<|im_start|>user\n{inputs}<|im_end|>\n<|im_start|>assistant\n"
            else:
                formatted_input = inputs

            # Tokenize input
            input_ids = self.tokenizer.encode(
                formatted_input,
                return_tensors="pt",
                truncation=True,
                max_length=2048  # Leave room for generation
            ).to(self.model.device)

            # Generate response
            with torch.no_grad():
                output_ids = self.model.generate(
                    input_ids,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=do_sample,
                    repetition_penalty=repetition_penalty,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    use_cache=True
                )

            # Decode only the new tokens (the response)
            response_ids = output_ids[0][input_ids.shape[1]:]
            response_text = self.tokenizer.decode(
                response_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )

            # Clean up response
            response_text = response_text.strip()

            # Return in the expected format
            return [{
                "generated_text": response_text,
                "generated_tokens": len(response_ids),
                "finish_reason": "stop" if self.tokenizer.eos_token_id in response_ids else "length"
            }]

        except Exception as e:
            logger.error(f"Error during inference: {str(e)}")
            return [{
                "error": f"Inference failed: {str(e)}",
                "generated_text": ""
            }]
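

# ---------------------------------------------------------------------------
# Local smoke test (illustrative sketch only, not part of the Inference
# Endpoints contract): shows how the handler above can be exercised directly
# with the request format documented in __call__. The model path
# "./Qwen2.5-Coder-7B-Instruct-Omni1.1" and the example prompt are
# assumptions; point the path at wherever the model weights actually live.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    handler = EndpointHandler(path="./Qwen2.5-Coder-7B-Instruct-Omni1.1")
    result = handler({
        "inputs": "Write an Isaac Sim script that spawns a Franka Panda robot.",
        "parameters": {
            "max_new_tokens": 256,
            "temperature": 0.2,
            "do_sample": True
        }
    })
    print(result[0].get("generated_text", result[0]))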