#!/usr/bin/env python3
# Example usage for videoloc/seamless-basic

import importlib.util

import numpy as np
import torch
from huggingface_hub import hf_hub_download


def load_model_and_collator():
    # The model uses a custom architecture, so its class must be imported
    # from the modeling file shipped with the repository.
    model_file = hf_hub_download(
        repo_id="videoloc/seamless-basic",
        filename="modeling_seamless_basic.py",
    )
    spec = importlib.util.spec_from_file_location("modeling_seamless_basic", model_file)
    modeling_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(modeling_module)

    # Load the config (optional, for inspection) and the model via the custom classes
    config = modeling_module.SeamlessBasicConfig.from_pretrained("videoloc/seamless-basic")
    model = modeling_module.HFSeamlessBasic.from_pretrained("videoloc/seamless-basic")

    # Load the data collator the same way
    collator_file = hf_hub_download(
        repo_id="videoloc/seamless-basic",
        filename="data_collator.py",
    )
    spec = importlib.util.spec_from_file_location("data_collator", collator_file)
    collator_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(collator_module)

    data_collator = collator_module.DataCollatorSimpleSeamless(
        processor="facebook/hf-seamless-m4t-medium",
        max_audio_length_sec=8.0,
        max_text_length=256,
    )
    return model, data_collator


def example_inference():
    model, collator = load_model_and_collator()

    # Example data: audio segment + subtitle text to predict editing time
    data = [{
        'raw_audio': np.random.randn(16000 * 3),  # 3 seconds at 16 kHz
        'raw_text': "Hello, welcome to our presentation today.",
    }]

    batch = collator(data)
    model.eval()
    with torch.no_grad():
        outputs = model(**batch)

    tte_prediction = outputs.logits.item()
    print(f"Predicted Time To Edit (TTE): {tte_prediction:.2f} seconds")
    return tte_prediction


if __name__ == "__main__":
    example_inference()
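

# Optional sketch (not part of the original example): feeding a real audio
# file instead of random noise. It assumes the `soundfile` package is
# installed and that `wav_path` points to an existing file; the collator is
# fed raw mono waveforms at 16 kHz (as in the example above), so audio at
# any other sample rate would need resampling first.
def example_inference_from_file(wav_path, subtitle_text):
    import soundfile as sf  # extra dependency, not imported above

    model, collator = load_model_and_collator()

    audio, sample_rate = sf.read(wav_path, dtype="float32")
    if audio.ndim > 1:
        audio = audio.mean(axis=1)  # down-mix multi-channel audio to mono
    assert sample_rate == 16000, "expected 16 kHz audio; resample beforehand otherwise"

    data = [{
        'raw_audio': audio,
        'raw_text': subtitle_text,
    }]

    batch = collator(data)
    model.eval()
    with torch.no_grad():
        outputs = model(**batch)
    return outputs.logits.item()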