| | model_config: |
| | model_name: HunyuanVideo-Foley-XXL |
| | model_type: 1d |
| | model_precision: bf16 |
| | model_kwargs: |
| | depth_triple_blocks: 18 |
| | depth_single_blocks: 36 |
| | hidden_size: 1536 |
| | num_heads: 12 |
| | mlp_ratio: 4 |
| | mlp_act_type: "gelu_tanh" |
| | qkv_bias: True |
| | qk_norm: True |
| | qk_norm_type: "rms" |
| | attn_mode: "torch" |
| | embedder_type: "default" |
| | interleaved_audio_visual_rope: True |
| | enable_learnable_empty_visual_feat: True |
| | sync_modulation: False |
| | add_sync_feat_to_audio: True |
| | cross_attention: True |
| | use_attention_mask: False |
| | condition_projection: "linear" |
| | sync_feat_dim: 768 |
| | condition_dim: 768 |
| | clip_dim: 768 |
| | audio_vae_latent_dim: 128 |
| | audio_frame_rate: 50 |
| | patch_size: 1 |
| | rope_dim_list: null |
| | rope_theta: 10000 |
| | text_length: 77 |
| | clip_length: 64 |
| | sync_length: 192 |
| | use_mmaudio_singleblock: True |
| | depth_triple_ssl_encoder: null |
| | depth_single_ssl_encoder: 8 |
| | use_repa_with_audiossl: True |
| |
|
| | diffusion_config: |
| | denoise_type: "flow" |
| | flow_path_type: "linear" |
| | flow_predict_type: "velocity" |
| | flow_reverse: True |
| | flow_solver: "euler" |
| | sample_flow_shift: 1.0 |
| | sample_use_flux_shift: False |
| | flux_base_shift: 0.5 |
| | flux_max_shift: 1.15 |
| |
|