File size: 10,749 Bytes

fe04bcb

#!/usr/bin/env python3
"""
FIXED TextractAI OCR Model with proper Hugging Face Hub support
This version has the from_pretrained method and works with AutoModel.from_pretrained()
"""

import torch
import torch.nn as nn
from transformers import (
    Qwen2VLForConditionalGeneration,
    Qwen2VLProcessor,
    AutoTokenizer,
    PreTrainedModel,
    PretrainedConfig
)
from PIL import Image
import warnings
warnings.filterwarnings("ignore")

class TextractConfig(PretrainedConfig):
    """Configuration for Textract model."""
    
    model_type = "textract"
    
    def __init__(
        self,
        base_model="Qwen/Qwen2-VL-2B-Instruct",
        hidden_size=1536,
        vocab_size=152064,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.base_model = base_model
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

class FixedTextractAI(PreTrainedModel):
    """
    FIXED TextractAI OCR model with proper Hugging Face Hub support.
    This version works with AutoModel.from_pretrained()
    """
    
    config_class = TextractConfig
    
    def __init__(self, config=None):
        if config is None:
            config = TextractConfig()
        
        super().__init__(config)
        
        print(f"🚀 Loading FIXED TextractAI OCR...")
        
        # Determine device
        if torch.cuda.is_available():
            self._device = "cuda"
            self.torch_dtype = torch.float16
        else:
            self._device = "cpu"
            self.torch_dtype = torch.float32
        
        print(f"🔧 Device: {self._device}")
        
        # Load components
        try:
            self.qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
                config.base_model,
                torch_dtype=self.torch_dtype,
                trust_remote_code=True
            ).to(self._device)
            
            # Freeze Qwen model for stability
            for param in self.qwen_model.parameters():
                param.requires_grad = False
            
            self.processor = Qwen2VLProcessor.from_pretrained(config.base_model)
            self.tokenizer = AutoTokenizer.from_pretrained(config.base_model)
            
            print("✅ FIXED TextractAI OCR ready!")
            
        except Exception as e:
            print(f"❌ Failed to load components: {e}")
            raise
        
        # Store config values
        self.qwen_hidden_size = config.hidden_size
        self.vocab_size = config.vocab_size
    
    def forward(self, **kwargs):
        """Forward pass through the base model."""
        return self.qwen_model(**kwargs)
    
    def generate_ocr_text(self, image, use_native=True, max_length=512):
        """
        🎯 MAIN METHOD: Extract text from image
        
        Args:
            image: PIL Image, file path, or numpy array
            use_native: Use Qwen's native OCR capabilities
            max_length: Maximum length of generated text
            
        Returns:
            dict: Contains extracted text, confidence, and metadata
        """
        
        # Handle different input types
        if isinstance(image, str):
            image = Image.open(image).convert('RGB')
        elif hasattr(image, 'shape'):  # numpy array
            image = Image.fromarray(image).convert('RGB')
        elif not isinstance(image, Image.Image):
            raise ValueError("Image must be PIL Image, file path, or numpy array")
        
        try:
            if use_native:
                return self._extract_with_qwen_native(image, max_length)
            else:
                return self._extract_with_qwen_chat(image, max_length)
                
        except Exception as e:
            return {
                'text': "",
                'confidence': 0.0,
                'success': False,
                'method': 'error',
                'error': str(e)
            }
    
    def _extract_with_qwen_native(self, image, max_length):
        """Extract text using Qwen's native OCR capabilities."""
        
        try:
            # Use newer Qwen processor API
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": "Extract all text from this image. Provide only the text content without any additional commentary."}
                    ]
                }
            ]
            
            text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = self.processor.process_vision_info(messages)
            
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt"
            )
            
            # Move to device
            inputs = inputs.to(self._device)
            
            # Generate
            with torch.no_grad():
                generated_ids = self.qwen_model.generate(
                    **inputs,
                    max_new_tokens=max_length,
                    do_sample=False,
                    temperature=0.0
                )
            
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            
            output_text = self.processor.batch_decode(
                generated_ids_trimmed, 
                skip_special_tokens=True, 
                clean_up_tokenization_spaces=False
            )[0]
            
            # Clean and estimate confidence
            cleaned_text = self._clean_text(output_text)
            confidence = self._estimate_confidence(cleaned_text)
            
            return {
                'text': cleaned_text,
                'confidence': confidence,
                'success': True,
                'method': 'qwen_native',
                'raw_output': output_text
            }
            
        except Exception as e:
            print(f"⚠️ Native method failed: {e}")
            raise
    
    def _extract_with_qwen_chat(self, image, max_length):
        """Fallback extraction method."""
        
        try:
            # Simple chat approach
            messages = [
                {
                    "role": "user", 
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": "What text do you see in this image?"}
                    ]
                }
            ]
            
            text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = self.processor.process_vision_info(messages)
            
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt"
            ).to(self._device)
            
            with torch.no_grad():
                generated_ids = self.qwen_model.generate(
                    **inputs,
                    max_new_tokens=max_length,
                    do_sample=True,
                    temperature=0.1,
                    top_p=0.9
                )
            
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            
            output_text = self.processor.batch_decode(
                generated_ids_trimmed, 
                skip_special_tokens=True, 
                clean_up_tokenization_spaces=False
            )[0]
            
            cleaned_text = self._clean_text(output_text)
            confidence = self._estimate_confidence(cleaned_text)
            
            return {
                'text': cleaned_text,
                'confidence': confidence,
                'success': True,
                'method': 'qwen_chat',
                'raw_output': output_text
            }
            
        except Exception as e:
            print(f"⚠️ Chat method failed: {e}")
            raise
    
    def _clean_text(self, text):
        """Clean extracted text."""
        
        if not text:
            return ""
        
        # Remove common prefixes
        prefixes = [
            "The text in the image is:",
            "The image contains:",
            "I can see the text:",
            "The text reads:",
            "The image shows:",
            "Text in the image:"
        ]
        
        cleaned = text.strip()
        for prefix in prefixes:
            if cleaned.lower().startswith(prefix.lower()):
                cleaned = cleaned[len(prefix):].strip()
                break
        
        # Remove quotes if they wrap the entire text
        if cleaned.startswith('"') and cleaned.endswith('"'):
            cleaned = cleaned[1:-1].strip()
        
        return cleaned
    
    def _estimate_confidence(self, text):
        """Estimate confidence based on text characteristics."""
        
        if not text:
            return 0.0
        
        confidence = 0.6  # Base confidence
        
        # Length bonuses
        if len(text) > 10:
            confidence += 0.2
        if len(text) > 50:
            confidence += 0.1
        
        # Content bonuses
        if any(c.isalpha() for c in text):
            confidence += 0.1
        if any(c.isdigit() for c in text):
            confidence += 0.05
        
        # Penalty for very short text
        if len(text.strip()) < 3:
            confidence *= 0.5
        
        return min(0.95, confidence)
    
    def get_model_info(self):
        """Get model information."""
        
        return {
            'model_name': 'FIXED TextractAI OCR',
            'base_model': 'Qwen2-VL-2B-Instruct',
            'device': self._device,
            'dtype': str(self.torch_dtype),
            'hidden_size': self.qwen_hidden_size,
            'vocab_size': self.vocab_size,
            'parameters': '~2.5B',
            'repository': 'BabaK07/textract-ai',
            'status': 'FIXED - Hub loading works!',
            'features': [
                'Hub loading support',
                'from_pretrained method',
                'High accuracy OCR',
                'Qwen2-VL based',
                'Multi-language support',
                'Production ready'
            ]
        }

# For backward compatibility
WorkingQwenOCRModel = FixedTextractAI  # Alias