# NOTE: removed non-Python extraction artifacts (file-size header, commit
# hashes, line-number gutter) so this file parses as a Python script.
#!/usr/bin/env python3
# Example usage for videoloc/seamless-basic
from transformers import AutoModel, AutoConfig
from huggingface_hub import hf_hub_download
import torch
import numpy as np
import importlib.util
def load_model_and_collator(repo_id="videoloc/seamless-basic"):
    """Load the pretrained model and its custom data collator from the Hub.

    The repository ships its own ``data_collator.py``; that file is downloaded
    and imported dynamically so the collator class travels with the model.

    Args:
        repo_id: Hugging Face Hub repository to load from. Defaults to the
            original ``"videoloc/seamless-basic"`` checkpoint, so existing
            callers are unaffected.

    Returns:
        tuple: ``(model, data_collator)`` ready for inference.
    """
    # Load model - architecture is included in the repository.
    # (The original also fetched AutoConfig here, but the result was never
    # used; from_pretrained resolves the config itself.)
    model = AutoModel.from_pretrained(repo_id)

    # Fetch the repo's collator module and import it from the downloaded path.
    collator_file = hf_hub_download(repo_id=repo_id, filename="data_collator.py")
    spec = importlib.util.spec_from_file_location("data_collator", collator_file)
    collator_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(collator_module)

    data_collator = collator_module.DataCollatorSimpleSeamless(
        processor="facebook/hf-seamless-m4t-medium",
        max_audio_length_sec=8.0,
        max_text_length=256,
    )
    return model, data_collator
def example_inference():
    """Run one example prediction and print the estimated edit time.

    Builds a single synthetic (audio, subtitle) pair, collates it into a
    batch, and returns the model's scalar TTE prediction in seconds.
    """
    model, collator = load_model_and_collator()

    # Example data: audio segment + subtitle text to predict editing time
    sample = {
        'raw_audio': np.random.randn(16000 * 3),  # 3 seconds at 16kHz
        'raw_text': "Hello, welcome to our presentation today.",
    }
    batch = collator([sample])

    model.eval()
    with torch.no_grad():
        outputs = model(**batch)

    tte_prediction = outputs.logits.item()
    print(f"Predicted Time To Edit (TTE): {tte_prediction:.2f} seconds")
    return tte_prediction
if __name__ == "__main__":
example_inference()