# WAVe-1B-Multimodal-NL / processing_wave.py
# Uploaded by yuriyvnv ("Upload folder using huggingface_hub"), commit 3b6e646 (verified).
"""
Processor for WAVe model.
This module contains the processor that combines text tokenization and audio feature extraction.
"""
from transformers import ProcessorMixin, AutoTokenizer, AutoFeatureExtractor
from typing import List, Optional, Union
import numpy as np
import torch
class WAVeProcessor(ProcessorMixin):
    """
    Constructs a WAVe processor which wraps a text tokenizer and audio feature extractor into a single processor.

    [`WAVeProcessor`] offers all the functionalities of [`AutoTokenizer`] and [`AutoFeatureExtractor`].
    See the docstring of [`~WAVeProcessor.__call__`] and [`~WAVeProcessor.decode`] for more information.

    Args:
        tokenizer ([`PreTrainedTokenizer`]):
            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
        feature_extractor ([`FeatureExtractionMixin`]):
            An instance of [`FeatureExtractionMixin`]. The feature extractor is a required input.

    Example:
        ```python
        >>> from wave_hf import WAVeProcessor
        >>> from transformers import AutoTokenizer, AutoFeatureExtractor
        >>>
        >>> # Create processor from scratch
        >>> tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-roberta-large-v1")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
        >>> processor = WAVeProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
        >>>
        >>> # Or load directly
        >>> processor = WAVeProcessor.from_pretrained("yuriyvnv/wave-portuguese")
        >>>
        >>> # Process single example
        >>> text = "Olá, como você está?"
        >>> audio = np.random.randn(16000)  # 1 second at 16kHz
        >>> inputs = processor(text=text, audio=audio, sampling_rate=16000, return_tensors="pt")
        >>>
        >>> # Process batch
        >>> texts = ["Olá", "Como vai?"]
        >>> audios = [np.random.randn(16000), np.random.randn(24000)]
        >>> inputs = processor(text=texts, audio=audios, sampling_rate=16000, padding=True, return_tensors="pt")
        ```
    """

    attributes = ["tokenizer", "feature_extractor"]
    tokenizer_class = "AutoTokenizer"
    feature_extractor_class = "AutoFeatureExtractor"

    def __init__(self, tokenizer, feature_extractor):
        """Store the tokenizer and feature extractor and make the tokenizer the current processor."""
        super().__init__(tokenizer, feature_extractor)
        self.current_processor = self.tokenizer

    @staticmethod
    def _to_numpy(waveform):
        """Return `waveform` as a NumPy array when it is a torch tensor; anything else is returned unchanged."""
        if isinstance(waveform, torch.Tensor):
            # detach + cpu so tensors that require grad or live on an accelerator can still be converted.
            return waveform.detach().cpu().numpy()
        return waveform

    @classmethod
    def _as_audio_batch(cls, audio):
        """Normalize the accepted audio input formats into what is handed to the feature extractor.

        - list/tuple: returned as a list, with torch tensor elements converted to NumPy arrays.
        - 1-D array/tensor: treated as a single waveform and wrapped in a one-element list.
        - 2-D array/tensor: treated as `(batch_size, n_samples)` and split into a list of rows.
        - anything else (e.g. other ranks): passed through (tensors converted to NumPy) so the
          feature extractor decides how to handle it.
        """
        if isinstance(audio, (list, tuple)):
            return [cls._to_numpy(item) for item in audio]

        audio = cls._to_numpy(audio)
        if isinstance(audio, np.ndarray):
            if audio.ndim == 1:
                return [audio]
            if audio.ndim == 2:
                # Ambiguous between batched and multi-channel input; batched is assumed.
                return list(audio)
        return audio

    def __call__(
        self,
        text: Optional[Union[str, List[str]]] = None,
        audio: Optional[Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]] = None,
        sampling_rate: Optional[int] = 16000,
        return_tensors: Optional[str] = None,
        padding: Union[bool, str] = False,
        max_length: Optional[int] = None,
        truncation: bool = False,
        **kwargs
    ) -> dict:
        """
        Main method to prepare text and audio for the model.

        Args:
            text (`str`, `List[str]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence can be a string.
            audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
                The audio or batch of audios to be prepared. A 1-D array/tensor is treated as a single
                waveform. A 2-D array/tensor is treated as a batch of shape `(batch_size, n_samples)`
                (not as multi-channel audio). PyTorch tensors are converted to NumPy arrays before being
                passed to the feature extractor.
            sampling_rate (`int`, *optional*, defaults to 16000):
                Sampling rate of the audio waveform in Hz.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Activates and controls padding. Forwarded to both the tokenizer and the feature extractor.
                Accepts the following values:
                - `True` or `'longest'`: Pad to the longest sequence in the batch.
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length`.
                - `False` or `'do_not_pad'` (default): No padding.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length. Forwarded to the
                tokenizer only.
            truncation (`bool`, *optional*, defaults to `False`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
                Forwarded to the tokenizer only.
            **kwargs:
                Extra keyword arguments forwarded unchanged to both the tokenizer and the feature extractor.

        Returns:
            `dict`: A dictionary with the following fields:
            - every field returned by the tokenizer (e.g. **input_ids**, **attention_mask**), when `text` is given.
            - **input_values** -- Audio features from the feature extractor (taken from its `input_values`
              or, failing that, `input_features` output), when `audio` is given.
            - **audio_attention_mask** -- The feature extractor's `attention_mask`, when it returns one.

        Raises:
            ValueError: If both `text` and `audio` are `None`.

        Example:
            ```python
            >>> from wave_hf import WAVeProcessor
            >>> import numpy as np
            >>>
            >>> processor = WAVeProcessor.from_pretrained("yuriyvnv/wave-portuguese")
            >>>
            >>> # Single example
            >>> text = "Olá mundo"
            >>> audio = np.random.randn(16000)
            >>> inputs = processor(text=text, audio=audio, sampling_rate=16000, return_tensors="pt")
            >>>
            >>> # Batch
            >>> texts = ["Texto um", "Texto dois"]
            >>> audios = [np.random.randn(16000), np.random.randn(32000)]
            >>> inputs = processor(text=texts, audio=audios, sampling_rate=16000, padding=True, return_tensors="pt")
            ```
        """
        if text is None and audio is None:
            raise ValueError("You must provide either text or audio inputs, or both.")

        encoded_inputs = {}

        # ===== PROCESS TEXT =====
        if text is not None:
            # `audio` and `sampling_rate` are named parameters, so they can never appear in
            # `kwargs`; the extra kwargs are forwarded as-is.
            text_inputs = self.tokenizer(
                text,
                return_tensors=return_tensors,
                padding=padding,
                max_length=max_length,
                truncation=truncation,
                **kwargs
            )
            encoded_inputs.update(text_inputs)

        # ===== PROCESS AUDIO =====
        if audio is not None:
            audio_inputs = self.feature_extractor(
                self._as_audio_batch(audio),
                sampling_rate=sampling_rate,
                return_tensors=return_tensors,
                padding=padding,
                **kwargs
            )

            # Different audio feature extractors name their main output differently;
            # expose it under a single key for the model.
            if "input_values" in audio_inputs:
                encoded_inputs["input_values"] = audio_inputs["input_values"]
            elif "input_features" in audio_inputs:
                encoded_inputs["input_values"] = audio_inputs["input_features"]

            # Renamed so it does not collide with the tokenizer's `attention_mask`.
            if "attention_mask" in audio_inputs:
                encoded_inputs["audio_attention_mask"] = audio_inputs["attention_mask"]

        return encoded_inputs

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Instantiate a [`WAVeProcessor`] from a pretrained processor.

        The saved processor files are tried first. If that fails for any reason, the processor is rebuilt
        from the model config: the tokenizer is loaded from `config.text_model_name_or_path` and the
        feature extractor from `config.audio_model_name_or_path`.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:
                - A string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
                - A path to a *directory* containing a processor saved using the
                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
            **kwargs:
                Forwarded to the parent `from_pretrained`; on fallback, forwarded to the config,
                tokenizer and feature extractor `from_pretrained` calls.

        Returns:
            [`WAVeProcessor`]: A WAVeProcessor object.

        Example:
            ```python
            >>> from wave_hf import WAVeProcessor
            >>>
            >>> # Load from HuggingFace Hub
            >>> processor = WAVeProcessor.from_pretrained("yuriyvnv/wave-portuguese")
            >>>
            >>> # Or load from local directory
            >>> processor = WAVeProcessor.from_pretrained("./my_saved_model")
            ```
        """
        try:
            # Try to load from saved processor files
            return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
        except Exception:
            # Deliberate best-effort fallback: rebuild the processor from the model config.
            # If the fallback itself fails, Python's implicit exception chaining keeps the
            # original loading error visible in the traceback.
            from .configuration_wave import WAVeConfig

            config = WAVeConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
            tokenizer = AutoTokenizer.from_pretrained(
                config.text_model_name_or_path,
                **kwargs
            )
            feature_extractor = AutoFeatureExtractor.from_pretrained(
                config.audio_model_name_or_path,
                **kwargs
            )
            return cls(tokenizer=tokenizer, feature_extractor=feature_extractor)
# (Web-page residue, not part of the module: "Free AI Image Generator No sign-up. Instant results. Open Now")