|
|
|
|
|
|
|
from transformers import AutoModel, AutoConfig |
|
from huggingface_hub import hf_hub_download |
|
import torch |
|
import numpy as np |
|
import importlib.util |
|
|
|
def load_model_and_collator():
    """Load the seamless-basic model and its custom data collator.

    Downloads the model weights from the HuggingFace Hub, then fetches the
    repo's bundled ``data_collator.py`` and imports it dynamically so the
    repo-specific ``DataCollatorSimpleSeamless`` class can be instantiated.

    Returns:
        tuple: ``(model, data_collator)`` where ``model`` is the pretrained
        ``AutoModel`` and ``data_collator`` is a callable that turns raw
        audio/text samples into a model-ready batch.

    Note:
        Requires network access (or a warm HF cache) on first call.
    """
    model = AutoModel.from_pretrained("videoloc/seamless-basic")

    # The collator is shipped as a loose .py file in the model repo, not as
    # an installable package, so we download it and import it by file path.
    # NOTE(review): executing downloaded code — acceptable for a trusted
    # repo, but worth flagging for untrusted sources.
    collator_file = hf_hub_download(repo_id="videoloc/seamless-basic", filename="data_collator.py")
    spec = importlib.util.spec_from_file_location("data_collator", collator_file)
    collator_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(collator_module)

    data_collator = collator_module.DataCollatorSimpleSeamless(
        processor="facebook/hf-seamless-m4t-medium",
        max_audio_length_sec=8.0,
        max_text_length=256
    )

    return model, data_collator
|
|
|
def example_inference():
    """Run a single demo prediction on synthetic input.

    Builds one sample of random 3-second audio (16 kHz) plus a short text
    string, collates it into a batch, runs the model in eval mode, and
    prints the scalar Time-To-Edit (TTE) prediction.

    Returns:
        float: the predicted TTE in seconds.
    """
    model, collator = load_model_and_collator()

    # One synthetic sample: 3 seconds of random audio at 16 kHz + a caption.
    sample = {
        'raw_audio': np.random.randn(16000 * 3),
        'raw_text': "Hello, welcome to our presentation today.",
    }

    batch = collator([sample])

    # Inference only — disable dropout/batchnorm updates and grad tracking.
    model.eval()
    with torch.no_grad():
        outputs = model(**batch)

    tte_prediction = outputs.logits.item()
    print(f"Predicted Time To Edit (TTE): {tte_prediction:.2f} seconds")
    return tte_prediction
|
|
|
if __name__ == "__main__": |
|
example_inference() |
|
|