#!/usr/bin/env python3
# Example usage for videoloc/seamless-basic

import importlib.util

import numpy as np
import torch
from huggingface_hub import hf_hub_download


def load_model_and_collator():
    # The model uses a custom architecture, so its class must be imported
    # from the modeling file shipped with the repository.
    model_file = hf_hub_download(
        repo_id="videoloc/seamless-basic",
        filename="modeling_seamless_basic.py",
    )
    spec = importlib.util.spec_from_file_location("modeling_seamless_basic", model_file)
    modeling_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(modeling_module)

    # Load the config (optional, for inspection) and the model via the custom classes
    config = modeling_module.SeamlessBasicConfig.from_pretrained("videoloc/seamless-basic")
    model = modeling_module.HFSeamlessBasic.from_pretrained("videoloc/seamless-basic")

    # Load the data collator the same way
    collator_file = hf_hub_download(
        repo_id="videoloc/seamless-basic",
        filename="data_collator.py",
    )
    spec = importlib.util.spec_from_file_location("data_collator", collator_file)
    collator_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(collator_module)

    data_collator = collator_module.DataCollatorSimpleSeamless(
        processor="facebook/hf-seamless-m4t-medium",
        max_audio_length_sec=8.0,
        max_text_length=256,
    )
    return model, data_collator


def example_inference():
    model, collator = load_model_and_collator()

    # Example data: audio segment + subtitle text to predict editing time
    data = [{
        'raw_audio': np.random.randn(16000 * 3),  # 3 seconds at 16 kHz
        'raw_text': "Hello, welcome to our presentation today.",
    }]

    batch = collator(data)
    model.eval()
    with torch.no_grad():
        outputs = model(**batch)

    tte_prediction = outputs.logits.item()
    print(f"Predicted Time To Edit (TTE): {tte_prediction:.2f} seconds")
    return tte_prediction


if __name__ == "__main__":
    example_inference()
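

# Optional sketch (not part of the original example): feeding a real audio
# file instead of random noise. It assumes the `soundfile` package is
# installed and that `wav_path` points to an existing file; the collator is
# fed raw mono waveforms at 16 kHz (as in the example above), so audio at
# any other sample rate would need resampling first.
def example_inference_from_file(wav_path, subtitle_text):
    import soundfile as sf  # extra dependency, not imported above

    model, collator = load_model_and_collator()

    audio, sample_rate = sf.read(wav_path, dtype="float32")
    if audio.ndim > 1:
        audio = audio.mean(axis=1)  # down-mix multi-channel audio to mono
    assert sample_rate == 16000, "expected 16 kHz audio; resample beforehand otherwise"

    data = [{
        'raw_audio': audio,
        'raw_text': subtitle_text,
    }]

    batch = collator(data)
    model.eval()
    with torch.no_grad():
        outputs = model(**batch)
    return outputs.logits.item()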