WAVe-1B-Multimodal-NL / processing_wave.py

Upload folder using huggingface_hub

3b6e646 verified 3 months ago

10.9 kB

	"""
	Processor for WAVe model.

	This module contains the processor that combines text tokenization and audio feature extraction.
	"""

	from transformers import ProcessorMixin, AutoTokenizer, AutoFeatureExtractor
	from typing import List, Optional, Union
	import numpy as np
	import torch


	class WAVeProcessor(ProcessorMixin):
	    """
	    Constructs a WAVe processor which wraps a text tokenizer and audio feature extractor into a single processor.

	    [`WAVeProcessor`] offers all the functionalities of [`AutoTokenizer`] and [`AutoFeatureExtractor`].
	    See the docstring of [`~WAVeProcessor.__call__`] and [`~WAVeProcessor.decode`] for more information.

	    Args:
	        tokenizer ([`PreTrainedTokenizer`]):
	            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
	        feature_extractor ([`FeatureExtractionMixin`]):
	            An instance of [`FeatureExtractionMixin`]. The feature extractor is a required input.

	    Example:
	        ```python
	        >>> from wave_hf import WAVeProcessor
	        >>> from transformers import AutoTokenizer, AutoFeatureExtractor
	        >>>
	        >>> # Create processor from scratch
	        >>> tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-roberta-large-v1")
	        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
	        >>> processor = WAVeProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
	        >>>
	        >>> # Or load directly
	        >>> processor = WAVeProcessor.from_pretrained("yuriyvnv/wave-portuguese")
	        >>>
	        >>> # Process single example
	        >>> text = "Olá, como você está?"
	        >>> audio = np.random.randn(16000)  # 1 second at 16kHz
	        >>> inputs = processor(text=text, audio=audio, sampling_rate=16000, return_tensors="pt")
	        >>>
	        >>> # Process batch
	        >>> texts = ["Olá", "Como vai?"]
	        >>> audios = [np.random.randn(16000), np.random.randn(24000)]
	        >>> inputs = processor(text=texts, audio=audios, sampling_rate=16000, padding=True, return_tensors="pt")
	        ```
	    """

	    # Names/classes consumed by `ProcessorMixin` to build, save and reload the processor.
	    attributes = ["tokenizer", "feature_extractor"]
	    tokenizer_class = "AutoTokenizer"
	    feature_extractor_class = "AutoFeatureExtractor"

	    def __init__(self, tokenizer, feature_extractor):
	        super().__init__(tokenizer, feature_extractor)
	        self.current_processor = self.tokenizer

	    @staticmethod
	    def _as_numpy(waveform):
	        """Return `waveform` as a NumPy array if it is a torch tensor, otherwise unchanged."""
	        if isinstance(waveform, torch.Tensor):
	            # detach() drops autograd tracking and cpu() moves off accelerators,
	            # both of which are required before .numpy() can succeed.
	            return waveform.detach().cpu().numpy()
	        return waveform

	    @classmethod
	    def _normalize_audio(cls, audio):
	        """Turn the accepted audio input forms into what is handed to the feature extractor.

	        - A 1-D array/tensor becomes a batch of one waveform.
	        - A 2-D array/tensor is assumed to be `(batch_size, n_samples)` and is split per row.
	        - A list/tuple has any torch tensors inside converted to NumPy arrays.
	        - Anything else is passed through untouched.
	        """
	        if isinstance(audio, (np.ndarray, torch.Tensor)):
	            audio = cls._as_numpy(audio)
	            if audio.ndim == 1:
	                return [audio]
	            if audio.ndim == 2:
	                # Ambiguous between batched and multi-channel input; batched is assumed.
	                return list(audio)
	            return audio
	        if isinstance(audio, (list, tuple)):
	            return [cls._as_numpy(item) for item in audio]
	        return audio

	    def __call__(
	        self,
	        text: Optional[Union[str, List[str]]] = None,
	        audio: Optional[Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]] = None,
	        sampling_rate: Optional[int] = 16000,
	        return_tensors: Optional[str] = None,
	        padding: Union[bool, str] = False,
	        max_length: Optional[int] = None,
	        truncation: bool = False,
	        **kwargs
	    ) -> dict:
	        """
	        Main method to prepare text and audio for the model.

	        Args:
	            text (`str`, `List[str]`, optional):
	                The sequence or batch of sequences to be encoded. Each sequence can be a string.
	            audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`, optional):
	                The audio or batch of audios to be prepared. A 1-D array/tensor is treated as a single
	                waveform. A 2-D array/tensor is treated as a batch of shape `(batch_size, n_samples)`
	                (not as multi-channel audio). Torch tensors are converted to NumPy arrays before being
	                handed to the feature extractor.
	            sampling_rate (`int`, optional, defaults to 16000):
	                Sampling rate of the audio waveform in Hz.
	            return_tensors (`str` or [`~utils.TensorType`], optional):
	                If set, will return tensors of a particular framework. Acceptable values are:
	                - `'pt'`: Return PyTorch `torch.Tensor` objects.
	                - `'np'`: Return NumPy `np.ndarray` objects.
	            padding (`bool`, `str` or [`~utils.PaddingStrategy`], optional, defaults to `False`):
	                Activates and controls padding. Forwarded to both the tokenizer and the feature
	                extractor. Accepts the following values:
	                - `True` or `'longest'`: Pad to the longest sequence in the batch.
	                - `'max_length'`: Pad to a maximum length specified with the argument `max_length`.
	                - `False` or `'do_not_pad'` (default): No padding.
	            max_length (`int`, optional):
	                Maximum length of the returned list and optionally padding length. Forwarded to the
	                tokenizer only.
	            truncation (`bool`, optional, defaults to `False`):
	                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
	                Forwarded to the tokenizer only.
	            **kwargs:
	                Extra keyword arguments, forwarded unchanged to both the tokenizer (when `text` is
	                given) and the feature extractor (when `audio` is given).

	        Returns:
	            `dict`: A plain dictionary that may contain:
	                - every field returned by the tokenizer (typically `input_ids` and `attention_mask`);
	                - input_values -- the feature extractor's `input_values`, or its `input_features` when
	                  `input_values` is absent;
	                - audio_attention_mask -- the feature extractor's `attention_mask`, when it returns one.

	        Raises:
	            ValueError: If both `text` and `audio` are `None`.

	        Example:
	            ```python
	            >>> from wave_hf import WAVeProcessor
	            >>> import numpy as np
	            >>>
	            >>> processor = WAVeProcessor.from_pretrained("yuriyvnv/wave-portuguese")
	            >>>
	            >>> # Single example
	            >>> text = "Olá mundo"
	            >>> audio = np.random.randn(16000)
	            >>> inputs = processor(text=text, audio=audio, sampling_rate=16000, return_tensors="pt")
	            >>>
	            >>> # Batch
	            >>> texts = ["Texto um", "Texto dois"]
	            >>> audios = [np.random.randn(16000), np.random.randn(32000)]
	            >>> inputs = processor(text=texts, audio=audios, sampling_rate=16000, padding=True, return_tensors="pt")
	            ```
	        """
	        if text is None and audio is None:
	            raise ValueError("You must provide either text or audio inputs, or both.")

	        encoded_inputs = {}

	        # ===== PROCESS TEXT =====
	        if text is not None:
	            # `audio` and `sampling_rate` are named parameters of this method, so they can
	            # never appear in `kwargs`; no filtering is needed before forwarding.
	            text_inputs = self.tokenizer(
	                text,
	                return_tensors=return_tensors,
	                padding=padding,
	                max_length=max_length,
	                truncation=truncation,
	                **kwargs
	            )
	            encoded_inputs.update(text_inputs)

	        # ===== PROCESS AUDIO =====
	        if audio is not None:
	            audio_inputs = self.feature_extractor(
	                self._normalize_audio(audio),
	                sampling_rate=sampling_rate,
	                return_tensors=return_tensors,
	                padding=padding,
	                **kwargs
	            )

	            # Feature extractors disagree on the output key name; expose a single one.
	            if "input_values" in audio_inputs:
	                encoded_inputs["input_values"] = audio_inputs["input_values"]
	            elif "input_features" in audio_inputs:
	                encoded_inputs["input_values"] = audio_inputs["input_features"]

	            # Stored under a distinct key so it does not overwrite the text attention mask.
	            if "attention_mask" in audio_inputs:
	                encoded_inputs["audio_attention_mask"] = audio_inputs["attention_mask"]

	        return encoded_inputs

	    def batch_decode(self, *args, **kwargs):
	        """
	        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
	        refer to the docstring of this method for more information.
	        """
	        return self.tokenizer.batch_decode(*args, **kwargs)

	    def decode(self, *args, **kwargs):
	        """
	        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
	        the docstring of this method for more information.
	        """
	        return self.tokenizer.decode(*args, **kwargs)

	    @classmethod
	    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
	        """
	        Instantiate a [`WAVeProcessor`] from a pretrained processor.

	        The standard `ProcessorMixin` loading path is tried first. If it raises, the processor is rebuilt
	        from the model configuration instead: the tokenizer is loaded from `config.text_model_name_or_path`
	        and the feature extractor from `config.audio_model_name_or_path`.

	        Args:
	            pretrained_model_name_or_path (`str` or `os.PathLike`):
	                This can be either:
	                - A string, the model id of a pretrained feature_extractor hosted inside a model repo on
	                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
	                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
	                - A path to a directory containing a processor saved using the
	                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
	            **kwargs:
	                Forwarded to the parent loader and, on the fallback path, to the config, tokenizer and
	                feature extractor loaders alike.

	        Returns:
	            [`WAVeProcessor`]: A WAVeProcessor object.

	        Example:
	            ```python
	            >>> from wave_hf import WAVeProcessor
	            >>>
	            >>> # Load from HuggingFace Hub
	            >>> processor = WAVeProcessor.from_pretrained("yuriyvnv/wave-portuguese")
	            >>>
	            >>> # Or load from local directory
	            >>> processor = WAVeProcessor.from_pretrained("./my_saved_model")
	            ```
	        """
	        try:
	            # Try to load from saved processor files
	            return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
	        except Exception:
	            # Deliberate best-effort fallback: any failure of the standard path falls back to
	            # rebuilding from the config. If this also fails, the error raised here carries the
	            # original one as its context.
	            from .configuration_wave import WAVeConfig

	            config = WAVeConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

	            tokenizer = AutoTokenizer.from_pretrained(
	                config.text_model_name_or_path,
	                **kwargs
	            )
	            feature_extractor = AutoFeatureExtractor.from_pretrained(
	                config.audio_model_name_or_path,
	                **kwargs
	            )

	            return cls(tokenizer=tokenizer, feature_extractor=feature_extractor)