DeepVerse: 4D Autoregressive Video Generation as a World Model
Paper: arXiv:2506.01103
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import load_image, export_to_video
# switch to "mps" on Apple devices
pipe = DiffusionPipeline.from_pretrained("SOTAMak1r/DeepVerse1.1", torch_dtype=torch.bfloat16)
pipe.to("cuda")
prompt = "A man with short gray hair plays a red electric guitar."
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/guitar-man.png"
)
output = pipe(image=image, prompt=prompt).frames[0]
export_to_video(output, "output.mp4")

Please follow the instructions in the GitHub repo for further usage details.
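If GPU memory is limited, diffusers' standard CPU offloading hook can be tried in place of pipe.to("cuda"). Whether the DeepVerse1.1 pipeline is compatible with this hook is an assumption here, so treat this as a minimal sketch rather than a verified recipe:

import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("SOTAMak1r/DeepVerse1.1", torch_dtype=torch.bfloat16)
# Offload submodules to CPU between forward passes to reduce peak VRAM.
# Assumes this pipeline supports the standard diffusers offload mechanism.
pipe.enable_model_cpu_offload()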
@article{chen2025deepverse,
  title={DeepVerse: 4D Autoregressive Video Generation as a World Model},
  author={Chen, Junyi and Zhu, Haoyi and He, Xianglong and Wang, Yifan and Zhou, Jianjun and Chang, Wenzheng and Zhou, Yang and Li, Zizun and Fu, Zhoujie and Pang, Jiangmiao and others},
  journal={arXiv preprint arXiv:2506.01103},
  year={2025}
}