#!/usr/bin/env python3
"""Example usage for the videoloc/seamless-basic model.

Downloads the pretrained model and its bundled data collator from the
Hugging Face Hub, then runs a single dummy forward pass that predicts the
Time To Edit (TTE, in seconds) for one audio segment + subtitle text pair.
"""

from transformers import AutoModel, AutoConfig
from huggingface_hub import hf_hub_download
import torch
import numpy as np
import importlib.util


def load_model_and_collator():
    """Load the pretrained model and its repository-bundled data collator.

    Returns:
        tuple: ``(model, data_collator)`` ready for inference.
    """
    # The model architecture ships inside the repository, so AutoModel can
    # resolve and instantiate it directly from the hub.
    model = AutoModel.from_pretrained("videoloc/seamless-basic")

    # The data collator is distributed as a plain .py file in the same repo;
    # fetch it and import it dynamically as a module.
    # NOTE(review): this executes code downloaded from the hub — fine for a
    # trusted repo_id, but never point it at an untrusted repository.
    collator_file = hf_hub_download(
        repo_id="videoloc/seamless-basic",
        filename="data_collator.py",
    )
    spec = importlib.util.spec_from_file_location("data_collator", collator_file)
    collator_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(collator_module)

    data_collator = collator_module.DataCollatorSimpleSeamless(
        processor="facebook/hf-seamless-m4t-medium",
        max_audio_length_sec=8.0,
        max_text_length=256,
    )
    return model, data_collator


def example_inference():
    """Run one dummy inference and print the predicted TTE.

    Returns:
        float: predicted Time To Edit in seconds.
    """
    model, collator = load_model_and_collator()

    # Example data: one audio segment plus the subtitle text whose editing
    # time we want to predict.
    data = [{
        'raw_audio': np.random.randn(16000 * 3),  # 3 seconds at 16 kHz
        'raw_text': "Hello, welcome to our presentation today.",
    }]

    batch = collator(data)
    model.eval()
    with torch.no_grad():
        outputs = model(**batch)

    # The model emits a single regression logit: the TTE estimate in seconds.
    tte_prediction = outputs.logits.item()
    print(f"Predicted Time To Edit (TTE): {tte_prediction:.2f} seconds")

    return tte_prediction


if __name__ == "__main__":
    example_inference()