"""
Processor for WAVe model.

This module contains the processor that combines text tokenization and audio feature extraction.
"""

from transformers import ProcessorMixin, AutoTokenizer, AutoFeatureExtractor
from typing import List, Optional, Union
import numpy as np
import torch


class WAVeProcessor(ProcessorMixin):
    """
    Constructs a WAVe processor which wraps a text tokenizer and audio feature extractor into a single processor.

    [`WAVeProcessor`] offers all the functionalities of [`AutoTokenizer`] and [`AutoFeatureExtractor`].
    See the docstring of [`~WAVeProcessor.__call__`] and [`~WAVeProcessor.decode`] for more information.

    Args:
        tokenizer ([`PreTrainedTokenizer`]):
            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
        feature_extractor ([`FeatureExtractionMixin`]):
            An instance of [`FeatureExtractionMixin`]. The feature extractor is a required input.

    Example:
        ```python
        >>> import numpy as np
        >>> from wave_hf import WAVeProcessor
        >>> from transformers import AutoTokenizer, AutoFeatureExtractor
        >>>
        >>> # Create processor from scratch
        >>> tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-roberta-large-v1")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
        >>> processor = WAVeProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
        >>>
        >>> # Or load directly
        >>> processor = WAVeProcessor.from_pretrained("yuriyvnv/wave-portuguese")
        >>>
        >>> # Process single example
        >>> text = "Olá, como você está?"
        >>> audio = np.random.randn(16000)  # 1 second at 16kHz
        >>> inputs = processor(text=text, audio=audio, sampling_rate=16000, return_tensors="pt")
        >>>
        >>> # Process batch
        >>> texts = ["Olá", "Como vai?"]
        >>> audios = [np.random.randn(16000), np.random.randn(24000)]
        >>> inputs = processor(text=texts, audio=audios, sampling_rate=16000, padding=True, return_tensors="pt")
        ```
    """

    attributes = ["tokenizer", "feature_extractor"]
    tokenizer_class = "AutoTokenizer"
    feature_extractor_class = "AutoFeatureExtractor"

    def __init__(self, tokenizer, feature_extractor):
        super().__init__(tokenizer, feature_extractor)
        # The tokenizer is exposed as the "current" sub-processor.
        # NOTE(review): presumably kept for the legacy ProcessorMixin convention - confirm it is still read.
        self.current_processor = self.tokenizer

    @staticmethod
    def _as_numpy(waveform):
        """Return `waveform` as a NumPy array if it is a torch tensor, otherwise return it unchanged."""
        if isinstance(waveform, torch.Tensor):
            # detach() drops autograd tracking and cpu() moves device tensors to host memory;
            # both are required before Tensor.numpy() can succeed.
            return waveform.detach().cpu().numpy()
        return waveform

    @classmethod
    def _prepare_audio(cls, audio):
        """
        Normalise the accepted audio input formats before they reach the feature extractor.

        - A list/tuple containing torch tensors becomes a list with those tensors converted to NumPy
          arrays; lists/tuples without tensors are returned untouched.
        - A single 1-D array/tensor becomes a batch of one waveform.
        - A single 2-D array/tensor is assumed to be `(batch_size, n_samples)` and is split into a
          list of 1-D waveforms (it is NOT interpreted as multi-channel audio).
        - Anything else (including arrays of other ranks) is passed through unchanged so the feature
          extractor can accept or reject it.
        """
        if isinstance(audio, (list, tuple)):
            if any(isinstance(item, torch.Tensor) for item in audio):
                return [cls._as_numpy(item) for item in audio]
            return audio

        audio = cls._as_numpy(audio)
        if isinstance(audio, np.ndarray):
            if audio.ndim == 1:
                return [audio]
            if audio.ndim == 2:
                return list(audio)
        return audio

    def __call__(
        self,
        text: Optional[Union[str, List[str]]] = None,
        audio: Optional[Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]] = None,
        sampling_rate: Optional[int] = 16000,
        return_tensors: Optional[str] = None,
        padding: Union[bool, str] = False,
        max_length: Optional[int] = None,
        truncation: bool = False,
        **kwargs
    ) -> dict:
        """
        Main method to prepare text and audio for the model.

        Args:
            text (`str`, `List[str]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence can be a string.
            audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
                The audio or batch of audios to be prepared. A 1-D array/tensor is treated as a single
                waveform. A 2-D array/tensor is treated as a batch of shape `(batch_size, n_samples)`,
                not as multi-channel audio. PyTorch tensors are converted to NumPy arrays before being
                handed to the feature extractor.
            sampling_rate (`int`, *optional*, defaults to 16000):
                Sampling rate of the audio waveform in Hz.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Activates and controls padding. Forwarded to both the tokenizer and the feature
                extractor. Accepts the following values:
                - `True` or `'longest'`: Pad to the longest sequence in the batch.
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length`.
                - `False` or `'do_not_pad'` (default): No padding.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length. Forwarded to the
                tokenizer only.
            truncation (`bool`, *optional*, defaults to `False`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
                Forwarded to the tokenizer only.
            **kwargs:
                Extra keyword arguments, forwarded unchanged to both the tokenizer and the feature
                extractor.

        Returns:
            `dict`: A plain dictionary with the following fields (depending on the inputs given):
            - **input_ids** -- List of token ids to be fed to the text encoder.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the text encoder.
            - **input_values** -- Audio input values to be fed to the audio encoder.
            - **audio_attention_mask** -- List of indices specifying which audio frames should be attended to
              (only present when the feature extractor returns an attention mask).
            Any other field returned by the tokenizer is included as well.

        Raises:
            ValueError: If neither `text` nor `audio` is provided, or if the feature extractor output
                contains neither `input_values` nor `input_features`.

        Example:
            ```python
            >>> from wave_hf import WAVeProcessor
            >>> import numpy as np
            >>>
            >>> processor = WAVeProcessor.from_pretrained("yuriyvnv/wave-portuguese")
            >>>
            >>> # Single example
            >>> text = "Olá mundo"
            >>> audio = np.random.randn(16000)
            >>> inputs = processor(text=text, audio=audio, sampling_rate=16000, return_tensors="pt")
            >>>
            >>> # Batch
            >>> texts = ["Texto um", "Texto dois"]
            >>> audios = [np.random.randn(16000), np.random.randn(32000)]
            >>> inputs = processor(text=texts, audio=audios, sampling_rate=16000, padding=True, return_tensors="pt")
            ```
        """
        if text is None and audio is None:
            raise ValueError("You must provide either text or audio inputs, or both.")

        encoded_inputs = {}

        # ===== PROCESS TEXT =====
        if text is not None:
            # `audio` and `sampling_rate` are named parameters of this method, so they can never
            # appear in **kwargs; kwargs can be forwarded to the tokenizer as-is.
            text_inputs = self.tokenizer(
                text,
                return_tensors=return_tensors,
                padding=padding,
                max_length=max_length,
                truncation=truncation,
                **kwargs
            )
            encoded_inputs.update(text_inputs)

        # ===== PROCESS AUDIO =====
        if audio is not None:
            # max_length / truncation are intentionally not forwarded here: they are passed to the
            # tokenizer only, matching the documented behaviour of this method.
            audio_inputs = self.feature_extractor(
                self._prepare_audio(audio),
                sampling_rate=sampling_rate,
                return_tensors=return_tensors,
                padding=padding,
                **kwargs
            )

            # Map audio feature extractor output keys to the single key the model expects.
            # Some feature extractors use `input_features` instead of `input_values`.
            if "input_values" in audio_inputs:
                encoded_inputs["input_values"] = audio_inputs["input_values"]
            elif "input_features" in audio_inputs:
                encoded_inputs["input_values"] = audio_inputs["input_features"]
            else:
                # Fail loudly instead of silently returning a result without any audio.
                raise ValueError(
                    "The feature extractor returned neither 'input_values' nor 'input_features'; "
                    f"got keys: {list(audio_inputs.keys())}."
                )

            # Renamed so it does not overwrite the tokenizer's `attention_mask`.
            if "attention_mask" in audio_inputs:
                encoded_inputs["audio_attention_mask"] = audio_inputs["attention_mask"]

        return encoded_inputs

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Instantiate a [`WAVeProcessor`] from a pretrained processor.

        The saved processor files are tried first. If that fails for any reason, the processor is
        rebuilt from the model's `WAVeConfig`: the tokenizer is loaded from
        `config.text_model_name_or_path` and the feature extractor from
        `config.audio_model_name_or_path`. The original failure is logged as a warning.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:
                - A string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
                - A path to a *directory* containing a processor saved using the
                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
            **kwargs:
                Forwarded to the parent loader and, on the fallback path, to the config, tokenizer and
                feature-extractor loaders alike.

        Returns:
            [`WAVeProcessor`]: A WAVeProcessor object.

        Example:
            ```python
            >>> from wave_hf import WAVeProcessor
            >>>
            >>> # Load from HuggingFace Hub
            >>> processor = WAVeProcessor.from_pretrained("yuriyvnv/wave-portuguese")
            >>>
            >>> # Or load from local directory
            >>> processor = WAVeProcessor.from_pretrained("./my_saved_model")
            ```
        """
        try:
            # Try to load from saved processor files
            return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
        except Exception as primary_error:
            # Deliberately broad: any failure of the primary path triggers the config-based
            # fallback. The original error is logged so the fallback is never silent.
            import logging

            logging.getLogger(__name__).warning(
                "Could not load saved processor files from %r (%s: %s); "
                "rebuilding the processor from the model config instead.",
                pretrained_model_name_or_path,
                type(primary_error).__name__,
                primary_error,
            )

            from .configuration_wave import WAVeConfig

            config = WAVeConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

            # NOTE(review): the same kwargs are reused for the text and audio sources named in the
            # config - confirm that source-specific options passed by callers are valid for them too.
            tokenizer = AutoTokenizer.from_pretrained(
                config.text_model_name_or_path,
                **kwargs
            )
            feature_extractor = AutoFeatureExtractor.from_pretrained(
                config.audio_model_name_or_path,
                **kwargs
            )

            return cls(tokenizer=tokenizer, feature_extractor=feature_extractor)