|
|
|
|
|
|
|
from transformers import AutoModel, AutoConfig |
|
from huggingface_hub import hf_hub_download |
|
import torch |
|
import numpy as np |
|
import importlib.util |
|
|
|
def load_model_and_collator():
    """Load the seamless-basic model and its custom data collator.

    Downloads the model weights from the HuggingFace Hub, then fetches the
    repo's bundled ``data_collator.py`` and imports it dynamically so the
    repo-specific ``DataCollatorSimpleSeamless`` class can be instantiated.

    Returns:
        tuple: ``(model, data_collator)`` where ``model`` is the pretrained
        ``AutoModel`` and ``data_collator`` is a callable that turns raw
        audio/text samples into a model-ready batch.

    Note:
        Requires network access (or a warm HF cache) on first call.
    """
    model = AutoModel.from_pretrained("videoloc/seamless-basic")

    # The collator is shipped as a loose .py file in the model repo, not as
    # an installable package, so we download it and import it by file path.
    # NOTE(review): executing downloaded code — acceptable for a trusted
    # repo, but worth flagging for untrusted sources.
    collator_file = hf_hub_download(repo_id="videoloc/seamless-basic", filename="data_collator.py")
    spec = importlib.util.spec_from_file_location("data_collator", collator_file)
    collator_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(collator_module)

    data_collator = collator_module.DataCollatorSimpleSeamless(
        processor="facebook/hf-seamless-m4t-medium",
        max_audio_length_sec=8.0,
        max_text_length=256
    )

    return model, data_collator
|
|
|
def example_inference():
    """Run a single demo prediction on synthetic input.

    Builds one sample of random 3-second audio (16 kHz) plus a short text
    string, collates it into a batch, runs the model in eval mode, and
    prints the scalar Time-To-Edit (TTE) prediction.

    Returns:
        float: the predicted TTE in seconds.
    """
    model, collator = load_model_and_collator()

    # One synthetic sample: 3 seconds of random audio at 16 kHz + a caption.
    sample = {
        'raw_audio': np.random.randn(16000 * 3),
        'raw_text': "Hello, welcome to our presentation today.",
    }

    batch = collator([sample])

    # Inference only — disable dropout/batchnorm updates and grad tracking.
    model.eval()
    with torch.no_grad():
        outputs = model(**batch)

    tte_prediction = outputs.logits.item()
    print(f"Predicted Time To Edit (TTE): {tte_prediction:.2f} seconds")
    return tte_prediction
|
|
|
if __name__ == "__main__": |
|
example_inference() |
|
|