def generate_video(text, steps=1000):
    """Sample a video tensor from Gaussian noise, conditioned on *text*.

    Runs *steps* reverse-diffusion iterations, predicting and removing
    noise at each step, then clamps the result to the model's [-1, 1]
    pixel range.

    Args:
        text: Natural-language prompt; tokenized and fed to the model
            as conditioning.
        steps: Number of denoising iterations (default 1000).

    Returns:
        Tensor of shape (1, 3, FRAMES, H, W) on *device*, values in [-1, 1].

    Relies on module-level globals: ``model``, ``tokenizer``, ``device``,
    ``FRAMES``, ``H``, ``W``.
    """
    model.eval()
    text_enc = tokenizer(text, return_tensors="pt").input_ids.to(device)
    x = torch.randn(1, 3, FRAMES, H, W).to(device)
    # Inference only: without no_grad() the loop would accumulate an
    # autograd graph across all `steps` iterations and exhaust memory.
    with torch.no_grad():
        for t in range(steps, 0, -1):
            t_tensor = torch.tensor([[t / steps]]).to(device)
            pred_noise = model(x, t_tensor, text_enc)
            # alpha_t = 1 - t/steps. On the first iteration (t == steps)
            # this is exactly 0, so dividing by sqrt(alpha_t) produced
            # inf/NaN and corrupted the entire sample; clamp away from 0.
            # NOTE(review): this linear alpha schedule and update rule do
            # not match the standard DDPM posterior step — confirm against
            # the training-time noise schedule.
            alpha_t = (1 - t_tensor).clamp(min=1e-5).view(-1, 1, 1, 1, 1)
            sqrt_alpha = torch.sqrt(alpha_t)
            x = (x - (1 - alpha_t) / sqrt_alpha * pred_noise) / sqrt_alpha
    return x.clamp(-1, 1)
# Run the expensive sampling loop only when executed as a script, not on
# import. NOTE(review): replace the placeholder prompt before running.
if __name__ == "__main__":
    video = generate_video("YOUR_PROMPT_HERE")