# seamless-basic / example_usage.py
# Hub page metadata: giuseppe-tanzi, "Upload folder using huggingface_hub",
# commit d848202 (verified), 1.6 kB.
#!/usr/bin/env python3
# Example usage for videoloc/seamless-basic
from transformers import AutoModel, AutoConfig
from huggingface_hub import hf_hub_download
import torch
import numpy as np
import importlib.util
def load_model_and_collator():
    """Load the pretrained model and build its data collator.

    The model is loaded with ``AutoModel.from_pretrained``. The collator
    class is defined in ``data_collator.py`` inside the same Hub repository,
    so that file is downloaded and imported dynamically from its local path.

    Returns:
        tuple: ``(model, data_collator)``.

    Raises:
        ImportError: If no import spec or loader can be created for the
            downloaded ``data_collator.py``.
    """
    repo_id = "videoloc/seamless-basic"

    # Load model - architecture is included in the repository.
    # NOTE(review): if that architecture is custom modeling code,
    # `trust_remote_code=True` may be required here - confirm.
    model = AutoModel.from_pretrained(repo_id)

    # Download the collator source and import it as a standalone module.
    collator_file = hf_hub_download(repo_id=repo_id, filename="data_collator.py")
    spec = importlib.util.spec_from_file_location("data_collator", collator_file)
    if spec is None or spec.loader is None:
        # spec_from_file_location may return None; fail with a clear message
        # instead of an AttributeError on the next line.
        raise ImportError(
            f"Cannot load data collator module from {collator_file!r}"
        )
    collator_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(collator_module)

    data_collator = collator_module.DataCollatorSimpleSeamless(
        processor="facebook/hf-seamless-m4t-medium",
        max_audio_length_sec=8.0,
        max_text_length=256,
    )
    return model, data_collator
def example_inference():
    """Run a single demo prediction on synthetic input and return it.

    One sample is built from random values standing in for an audio segment
    plus a subtitle string. The sample is collated, passed through the model
    with gradient tracking disabled, and the scalar taken from
    ``outputs.logits`` is printed and returned.
    """
    tte_model, collate_fn = load_model_and_collator()

    # Example data: audio segment + subtitle text to predict editing time.
    sample = {
        "raw_audio": np.random.randn(16000 * 3),  # 3 seconds at 16kHz
        "raw_text": "Hello, welcome to our presentation today.",
    }
    model_inputs = collate_fn([sample])

    tte_model.eval()
    with torch.no_grad():
        tte_prediction = tte_model(**model_inputs).logits.item()

    print(f"Predicted Time To Edit (TTE): {tte_prediction:.2f} seconds")
    return tte_prediction
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    example_inference()