TheDenk committed
Commit ba4c939 · 1 Parent(s): e15bc27
Files changed (3)
  1. README.md +104 -0
  2. config.json +25 -0
  3. diffusion_pytorch_model.safetensors +3 -0
README.md CHANGED
@@ -1,3 +1,107 @@
  ---
  license: apache-2.0
+ language:
+ - en
+ tags:
+ - video
+ - video-generation
+ - video-to-video
+ - controlnet
+ - diffusers
+ - wan2.2
  ---
+ # Controlnet for Wan2.2 (hed)
+
+ This repo contains the controlnet module for Wan2.2. See the <a href="https://github.com/TheDenk/wan2.2-controlnet">GitHub code</a>.
+ It uses the same approach as the controlnet for [Wan2.1](https://github.com/TheDenk/wan2.1-dilated-controlnet).
+
+ ### Inference examples
+ #### Simple inference with CLI
+ ```bash
+ python -m inference.cli_demo \
+     --video_path "resources/bubble.mp4" \
+     --prompt "Close-up shot with soft lighting, focusing sharply on the lower half of a young woman's face. Her lips are slightly parted as she blows an enormous bubblegum bubble. The bubble is semi-transparent, shimmering gently under the light, and surprisingly contains a miniature aquarium inside, where two orange-and-white goldfish slowly swim, their fins delicately fluttering as if in an aquatic universe. The background is a pure light blue color." \
+     --controlnet_type "hed" \
+     --base_model_path Wan-AI/Wan2.2-TI2V-5B-Diffusers \
+     --controlnet_model_path TheDenk/wan2.2-ti2v-5b-controlnet-hed-v1
+ ```
+ #### Minimal code example
+ ```python
+ import os
+ os.environ['CUDA_VISIBLE_DEVICES'] = "0"
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ import torch
+ from diffusers.utils import load_video, export_to_video
+ from diffusers import AutoencoderKLWan, UniPCMultistepScheduler
+ from controlnet_aux import HEDdetector
+
+ # Modules from https://github.com/TheDenk/wan2.2-controlnet
+ from wan_controlnet import WanControlnet
+ from wan_transformer import CustomWanTransformer3DModel
+ from wan_t2v_controlnet_pipeline import WanTextToVideoControlnetPipeline
+
+ base_model_path = "Wan-AI/Wan2.2-TI2V-5B-Diffusers"
+ controlnet_model_path = "TheDenk/wan2.2-ti2v-5b-controlnet-hed-v1"
+ vae = AutoencoderKLWan.from_pretrained(base_model_path, subfolder="vae", torch_dtype=torch.float32)
+ transformer = CustomWanTransformer3DModel.from_pretrained(base_model_path, subfolder="transformer", torch_dtype=torch.bfloat16)
+ controlnet = WanControlnet.from_pretrained(controlnet_model_path, torch_dtype=torch.bfloat16)
+ pipe = WanTextToVideoControlnetPipeline.from_pretrained(
+     pretrained_model_name_or_path=base_model_path,
+     controlnet=controlnet,
+     transformer=transformer,
+     vae=vae,
+     torch_dtype=torch.bfloat16
+ )
+ pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=5.0)
+ pipe.enable_model_cpu_offload()
+
+ # Extract HED edge maps from the input video to condition the controlnet.
+ controlnet_processor = HEDdetector.from_pretrained('lllyasviel/Annotators')
+ img_h = 704  # e.g. 704 or 480
+ img_w = 1280  # e.g. 1280 or 832
+ num_frames = 121  # e.g. 121, 81 or 49
+
+ video_path = 'bubble.mp4'
+ video_frames = load_video(video_path)[:num_frames]
+ video_frames = [x.resize((img_w, img_h)) for x in video_frames]
+ controlnet_frames = [controlnet_processor(x) for x in video_frames]
+
+ prompt = "Close-up shot with soft lighting, focusing sharply on the lower half of a young woman's face. Her lips are slightly parted as she blows an enormous bubblegum bubble. The bubble is semi-transparent, shimmering gently under the light, and surprisingly contains a miniature aquarium inside, where two orange-and-white goldfish slowly swim, their fins delicately fluttering as if in an aquatic universe. The background is a pure light blue color."
+ negative_prompt = "bad quality, worst quality"
+
+ output = pipe(
+     prompt=prompt,
+     negative_prompt=negative_prompt,
+     height=img_h,
+     width=img_w,
+     num_frames=num_frames,
+     guidance_scale=5,
+     generator=torch.Generator(device="cuda").manual_seed(42),
+     output_type="pil",
+
+     controlnet_frames=controlnet_frames,
+     controlnet_guidance_start=0.0,
+     controlnet_guidance_end=0.8,
+     controlnet_weight=0.8,
+
+     teacache_treshold=0.6,
+ ).frames[0]
+
+ export_to_video(output, "output.mp4", fps=16)
+ ```
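+ Before running the full pipeline, the extracted HED maps can also be written to disk as a quick sanity check of the preprocessing step (a minimal sketch reusing the variables from the example above; the preview filename is arbitrary):
+ ```python
+ # Optional: export the HED control maps to visually verify the conditioning signal.
+ export_to_video(controlnet_frames, "controlnet_hed_preview.mp4", fps=16)
+ ```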
+ ## Acknowledgements
+ Original code and models: [Wan2.2](https://github.com/Wan-Video/Wan2.2).
+
+
+ ## Citations
+ ```
+ @misc{TheDenk,
+     title={Wan2.2 Controlnet},
+     author={Karachev Denis},
+     url={https://github.com/TheDenk/wan2.2-controlnet},
+     publisher={Github},
+     year={2025}
+ }
+ ```
+
+ ## Contacts
+ <p>Issues should be raised directly in the repository. For professional support and recommendations, please contact <a>[email protected]</a>.</p>
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_class_name": "WanControlnet",
+   "_diffusers_version": "0.35.0.dev0",
+   "added_kv_proj_dim": null,
+   "attention_head_dim": 128,
+   "cross_attn_norm": true,
+   "downscale_coef": 16,
+   "eps": 1e-06,
+   "ffn_dim": 8960,
+   "freq_dim": 256,
+   "image_dim": null,
+   "in_channels": 3,
+   "num_attention_heads": 12,
+   "num_layers": 6,
+   "out_proj_dim": 3072,
+   "patch_size": [
+     1,
+     2,
+     2
+   ],
+   "qk_norm": "rms_norm_across_heads",
+   "rope_max_seq_len": 1024,
+   "text_dim": 4096,
+   "vae_channels": 48
+ }
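For quick reference, this config can be fetched and inspected without downloading the full weights. A minimal sketch, assuming the `huggingface_hub` client is installed and that this repo's hub id follows the naming of the depth variant cited in the README (`TheDenk/wan2.2-ti2v-5b-controlnet-hed-v1`):

```python
import json
from huggingface_hub import hf_hub_download

# Assumed repo id for this model; adjust if the actual hub id differs.
repo_id = "TheDenk/wan2.2-ti2v-5b-controlnet-hed-v1"

# Download only config.json and print the main architecture fields.
config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
with open(config_path) as f:
    cfg = json.load(f)

print(cfg["num_layers"])                                       # 6 controlnet blocks
print(cfg["num_attention_heads"] * cfg["attention_head_dim"])  # 1536 inner attention dimension
print(cfg["out_proj_dim"])                                     # 3072 output projection size, presumably the base transformer hidden size
```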
diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:adff10028ef3462f9df9afe57d12cebba85003b553df9c862776716c7af4c054
+ size 691979056