Commit · 712663f
Parent(s): initial commit

Files changed:
- .gitattributes +48 -0
- LICENSE +1 -0
- README.md +99 -0
- asset/examples/1.gif +3 -0
- asset/examples/2.gif +3 -0
- asset/examples/3.gif +3 -0
- asset/examples/4.gif +3 -0
- asset/examples/5.gif +3 -0
- asset/examples/6.gif +3 -0
- asset/examples/7.gif +3 -0
- asset/examples/8.gif +3 -0
- asset/examples/IITF.jpg +3 -0
- asset/examples/framework.jpg +3 -0
- configuration.json +1 -0
- model_index.json +24 -0
- scheduler/scheduler_config.json +18 -0
- transformer/config.json +33 -0
- transformer/diffusion_pytorch_model-00001-of-00002.safetensors +3 -0
- transformer/diffusion_pytorch_model-00002-of-00002.safetensors +3 -0
- transformer/diffusion_pytorch_model.safetensors.index.json +0 -0
.gitattributes
ADDED
@@ -0,0 +1,48 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+transformer/diffusion_pytorch_model-00001-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
+transformer/diffusion_pytorch_model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
+asset/examples/3.gif filter=lfs diff=lfs merge=lfs -text
+asset/examples/4.gif filter=lfs diff=lfs merge=lfs -text
+asset/examples/2.gif filter=lfs diff=lfs merge=lfs -text
+asset/examples/ filter=lfs diff=lfs merge=lfs -text
+asset/examples/5.gif filter=lfs diff=lfs merge=lfs -text
+asset/examples/6.gif filter=lfs diff=lfs merge=lfs -text
+asset/examples/7.gif filter=lfs diff=lfs merge=lfs -text
+asset/examples/8.gif filter=lfs diff=lfs merge=lfs -text
+asset/examples/framework.jpg filter=lfs diff=lfs merge=lfs -text
+asset/examples/IITF.jpg filter=lfs diff=lfs merge=lfs -text
+asset/examples/1.gif filter=lfs diff=lfs merge=lfs -text
LICENSE
ADDED
@@ -0,0 +1 @@
+The model weights of EchoVideo are licensed under CC BY NC 4.0.
README.md
ADDED
@@ -0,0 +1,99 @@
+---
+license: other
+license_link: https://huggingface.co/bytedance-research/EchoVideo/blob/main/LICENSE
+language:
+- en
+tags:
+- EchoVideo
+- video-generation
+- id-preserving
+---
+
+# EchoVideo: Identity-Preserving Human Video Generation by Multimodal Feature Fusion
+
+This repo contains PyTorch model definitions, pre-trained weights, and inference code for our video generation model, EchoVideo.
+> [**EchoVideo: Identity-Preserving Human Video Generation by Multimodal Feature Fusion**](https://arxiv.org/abs/2501.13452) <br>
+
+# News
+
+**[2025.02.27]** We release the inference code and model weights of EchoVideo.
+
+# Introduction
+
+EchoVideo generates a personalized video from a single photo and a text description. It excels at addressing the "semantic conflict" and "copy-paste" problems, and it demonstrates state-of-the-art performance.
+
+
+# Gallery
+## 1. Text-to-Video Generation
+| Face-ID Preserving | Full-Body Preserving |
+| ---- | ---- |
+| <img height="300" src="asset/examples/3.gif" > | <img height="300" src="asset/examples/4.gif" > |
+
+## 2. Comparisons
+| EchoVideo | ConsisID | IDAnimator |
+| ---- | ---- | ---- |
+| <img height="240" src="asset/examples/2.gif" > | <img height="240" src="asset/examples/5.gif" > | <img height="240" src="asset/examples/6.gif" > |
+| <img height="240" src="asset/examples/1.gif" > | <img height="240" src="asset/examples/7.gif" > | <img height="240" src="asset/examples/8.gif" > |
+
+
+# Usage
+**Python 3.10 through 3.12 (inclusive) is required. Both GPU and NPU are supported.**
+
+## Clone the repository
+```shell
+git clone https://github.com/bytedance/EchoVideo
+cd EchoVideo
+```
+
+## Installation
+```shell
+pip install -r requirements.txt
+```
+## Download Pretrained Weights
+Details on downloading the pretrained models are given [here](https://github.com/bytedance/EchoVideo/ckpts/README.md).
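If you prefer to pull the released weights straight from the Hugging Face Hub rather than follow the ckpts README, a minimal sketch with `huggingface_hub` could look like this; the repo id is taken from the `license_link` in the front matter, and the destination folder is an arbitrary assumption.

```python
# Minimal sketch: download this repository (including the LFS-tracked weight shards)
# with huggingface_hub. The repo id comes from the license_link above; the
# destination directory is a hypothetical choice.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="bytedance-research/EchoVideo",
    local_dir="ckpts/EchoVideo",  # hypothetical destination
)
print("Weights downloaded to:", local_dir)
```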
+## Run Demo
+```shell
+# multi-resolution video generation [(480, 640), (480, 848), (480, 480), (848, 480), (640, 480)]
+python infer.py
+```
+
+# Methods
+## **Overall Architecture**
+<p align="center">
+  <img src="asset/examples/framework.jpg" height=350>
+</p>
+
+Overall architecture of EchoVideo. By employing a meticulously designed IITF module and mitigating over-reliance on the input image, our model effectively unifies the semantic information between the input facial image and the textual prompt. This integration enables the generation of consistent characters with multi-view facial coherence, ensuring that the synthesized outputs maintain both visual and semantic fidelity across diverse perspectives.
+
+## **Key Features**
+<p align="center">
+  <img src="asset/examples/IITF.jpg" height=350>
+</p>
+
+
+Illustration of facial information injection methods. (a) IITF: facial and textual information are fused so that they provide consistent guidance throughout the generation process. IITF establishes a semantic bridge between the facial and textual modalities and coordinates the influence of each on character features, thereby ensuring the consistency of the generated characters; it consists of two core components, facial feature alignment and conditional feature alignment. (b) Dual branch: facial and textual information are injected independently through cross-attention, providing separate guidance for the generation process.
+
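To make the contrast between (a) and (b) concrete, the following is a toy, hypothetical PyTorch sketch of the two injection styles. It is not the released IITF implementation; the module names are invented, and the only values borrowed from this commit are the embedding sizes in `transformer/config.json` (`face_embed_dim` 768, `text_embed_dim` 4096, `max_text_seq_length` 226, `face_seq_length` 196). The inner width of 512 is a toy size chosen to keep the example small.

```python
import torch
import torch.nn as nn

# Dims taken from transformer/config.json; HIDDEN is a toy width (the real DiT is wider).
FACE_DIM, TEXT_DIM, HIDDEN = 768, 4096, 512

class FusedInjection(nn.Module):
    """(a) IITF-style sketch: align face tokens to the text token space, then guide
    the video tokens with one fused conditioning sequence."""
    def __init__(self):
        super().__init__()
        self.face_proj = nn.Linear(FACE_DIM, TEXT_DIM)  # stand-in for facial feature alignment
        self.cross_attn = nn.MultiheadAttention(HIDDEN, 8, kdim=TEXT_DIM, vdim=TEXT_DIM,
                                                batch_first=True)

    def forward(self, video_tokens, text_tokens, face_tokens):
        cond = torch.cat([text_tokens, self.face_proj(face_tokens)], dim=1)  # fused guidance
        out, _ = self.cross_attn(video_tokens, cond, cond)
        return video_tokens + out

class DualBranchInjection(nn.Module):
    """(b) Dual-branch sketch: text and face guide the video tokens independently."""
    def __init__(self):
        super().__init__()
        self.text_attn = nn.MultiheadAttention(HIDDEN, 8, kdim=TEXT_DIM, vdim=TEXT_DIM,
                                               batch_first=True)
        self.face_attn = nn.MultiheadAttention(HIDDEN, 8, kdim=FACE_DIM, vdim=FACE_DIM,
                                               batch_first=True)

    def forward(self, video_tokens, text_tokens, face_tokens):
        t, _ = self.text_attn(video_tokens, text_tokens, text_tokens)
        f, _ = self.face_attn(video_tokens, face_tokens, face_tokens)
        return video_tokens + t + f  # two separate guidance signals

if __name__ == "__main__":
    video = torch.randn(1, 1350, HIDDEN)   # toy latent video tokens
    text = torch.randn(1, 226, TEXT_DIM)   # max_text_seq_length = 226
    face = torch.randn(1, 196, FACE_DIM)   # face_seq_length = 196
    print(FusedInjection()(video, text, face).shape)       # torch.Size([1, 1350, 512])
    print(DualBranchInjection()(video, text, face).shape)  # torch.Size([1, 1350, 512])
```

The point of (a) is that face tokens are first aligned into the text token space, so a single conditioning sequence guides generation, whereas (b) lets each modality pull on the video tokens independently.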
+## Benchmark
+
+| Model | Identity Average↑ | Identity Variation↓ | Inception Distance↓ | Dynamic Degree↑ |
+| -- | -- | -- | -- | -- |
+| IDAnimator | 0.349 | **0.032** | **159.11** | 0.280 |
+| ConsisID | <u>0.414</u> | 0.094 | 200.40 | 0.871 |
+| Pika | 0.329 | 0.091 | 268.35 | <u>0.954</u> |
+| Ours | **0.516** | <u>0.075</u> | <u>176.53</u> | **0.955** |
+
+# Acknowledgements
+* [CogVideo](https://huggingface.co/THUDM/CogVideoX-5b): the DiT module we adapted and the VAE module we used. [MODEL LICENSE](https://huggingface.co/THUDM/CogVideoX-5b/blob/main/LICENSE)
+* [SigLIP](https://huggingface.co/google/siglip-base-patch16-224): the vision encoder we used.
+
+
+# BibTeX
+If you find our work useful in your research, please consider citing the paper:
+```bibtex
+@article{wei2025echovideo,
+  title={EchoVideo: Identity-Preserving Human Video Generation by Multimodal Feature Fusion},
+  author={Wei, Jiangchuan and Yan, Shiyue and Lin, Wenfeng and Liu, Boyuan and Chen, Renjie and Guo, Mingyu},
+  journal={arXiv preprint arXiv:2501.13452},
+  year={2025}
+}
+```
asset/examples/1.gif
ADDED
asset/examples/2.gif
ADDED
asset/examples/3.gif
ADDED
asset/examples/4.gif
ADDED
asset/examples/5.gif
ADDED
asset/examples/6.gif
ADDED
asset/examples/7.gif
ADDED
asset/examples/8.gif
ADDED
asset/examples/IITF.jpg
ADDED
asset/examples/framework.jpg
ADDED
configuration.json
ADDED
@@ -0,0 +1 @@
+{"framework":"Pytorch","task":"image-to-video"}
model_index.json
ADDED
@@ -0,0 +1,24 @@
+{
+  "_class_name": "EchoVideoPipeline",
+  "_diffusers_version": "0.31.0.dev0",
+  "scheduler": [
+    "diffusers",
+    "CogVideoXDPMScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "T5EncoderModel"
+  ],
+  "tokenizer": [
+    "transformers",
+    "T5Tokenizer"
+  ],
+  "transformer": [
+    "models.echovideo_transformer_3d",
+    "EchoVideoLDM"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKLCogVideoX"
+  ]
+}
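model_index.json maps each pipeline component to the (library, class) pair used to load it. `EchoVideoPipeline` and `EchoVideoLDM` are custom classes defined in the GitHub repo (`models.echovideo_transformer_3d`), so they are not importable from `diffusers`; the remaining components are standard `diffusers`/`transformers` classes. A hedged loading sketch for those standard parts follows; note that this commit only adds the `scheduler` and `transformer` folders, so the tokenizer, text encoder, and VAE are assumed here to come from another checkpoint such as CogVideoX-5b, named in the acknowledgements.

```python
# Sketch only: load the standard components named in model_index.json.
# EchoVideoPipeline / EchoVideoLDM themselves come from the GitHub code, not diffusers.
from diffusers import AutoencoderKLCogVideoX, CogVideoXDPMScheduler
from transformers import T5EncoderModel, T5Tokenizer

echo_repo = "bytedance-research/EchoVideo"  # repo id from the README's license_link
base_repo = "THUDM/CogVideoX-5b"            # assumption: source of the T5 and VAE weights

scheduler = CogVideoXDPMScheduler.from_pretrained(echo_repo, subfolder="scheduler")
tokenizer = T5Tokenizer.from_pretrained(base_repo, subfolder="tokenizer")
text_encoder = T5EncoderModel.from_pretrained(base_repo, subfolder="text_encoder")
vae = AutoencoderKLCogVideoX.from_pretrained(base_repo, subfolder="vae")
```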
scheduler/scheduler_config.json
ADDED
@@ -0,0 +1,18 @@
+{
+  "_class_name": "CogVideoXDPMScheduler",
+  "_diffusers_version": "0.31.0.dev0",
+  "beta_end": 0.012,
+  "beta_schedule": "scaled_linear",
+  "beta_start": 0.00085,
+  "clip_sample": false,
+  "clip_sample_range": 1.0,
+  "num_train_timesteps": 1000,
+  "prediction_type": "v_prediction",
+  "rescale_betas_zero_snr": true,
+  "sample_max_value": 1.0,
+  "set_alpha_to_one": true,
+  "snr_shift_scale": 1.0,
+  "steps_offset": 0,
+  "timestep_spacing": "trailing",
+  "trained_betas": null
+}
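The settings above are standard CogVideoX choices: v-prediction, zero-terminal-SNR beta rescaling, and trailing timestep spacing. As a sanity check, the scheduler can be rebuilt offline from this exact JSON with `from_config`; a small sketch, assuming the file is saved locally at `scheduler/scheduler_config.json`:

```python
import json
from diffusers import CogVideoXDPMScheduler

# Rebuild the scheduler directly from the config shown above (local path is an assumption).
with open("scheduler/scheduler_config.json") as f:
    cfg = json.load(f)

scheduler = CogVideoXDPMScheduler.from_config(cfg)
scheduler.set_timesteps(50)                  # e.g. 50 denoising steps
print(scheduler.config.prediction_type)      # "v_prediction"
print(scheduler.timesteps[:5])
```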
transformer/config.json
ADDED
@@ -0,0 +1,33 @@
+{
+  "_class_name": "EchoVideoLDM",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "EchoVideo/ckpts",
+  "activation_fn": "gelu-approximate",
+  "attention_bias": true,
+  "attention_head_dim": 64,
+  "dropout": 0.0,
+  "face_embed_dim": 768,
+  "face_features_embed_dim": 512,
+  "face_features_seq_length": 144,
+  "face_seq_length": 196,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 32,
+  "max_text_seq_length": 226,
+  "norm_elementwise_affine": true,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 48,
+  "num_layers": 42,
+  "out_channels": 16,
+  "patch_size": 2,
+  "sample_frames": 49,
+  "sample_height": 60,
+  "sample_width": 90,
+  "spatial_interpolation_scale": 1.875,
+  "temporal_compression_ratio": 4,
+  "temporal_interpolation_scale": 1.0,
+  "text_embed_dim": 4096,
+  "time_embed_dim": 512,
+  "timestep_activation_fn": "silu",
+  "use_rotary_positional_embeddings": true
+}
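A few quantities that matter for sequence length and memory follow directly from this config, assuming the same latent packing as CogVideoX (whose VAE this model reuses). A short worked example in plain Python:

```python
# Worked example: quantities derived from transformer/config.json (plain arithmetic).
cfg = {
    "attention_head_dim": 64, "num_attention_heads": 48, "num_layers": 42,
    "sample_frames": 49, "temporal_compression_ratio": 4,
    "sample_height": 60, "sample_width": 90, "patch_size": 2,
    "max_text_seq_length": 226,
}

hidden_dim = cfg["num_attention_heads"] * cfg["attention_head_dim"]                  # 48 * 64 = 3072
latent_frames = (cfg["sample_frames"] - 1) // cfg["temporal_compression_ratio"] + 1  # (49 - 1) / 4 + 1 = 13
patches_per_frame = (cfg["sample_height"] // cfg["patch_size"]) * \
                    (cfg["sample_width"] // cfg["patch_size"])                        # 30 * 45 = 1350
video_tokens = latent_frames * patches_per_frame                                      # 13 * 1350 = 17550

print(hidden_dim, latent_frames, patches_per_frame, video_tokens)                    # 3072 13 1350 17550
```

So a 49-frame sample is represented as 13 latent frames of 30 by 45 patches, roughly 17.5k video tokens attending alongside up to 226 text tokens in a 3072-wide, 42-layer DiT.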
transformer/diffusion_pytorch_model-00001-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6d0c109ec3c1d8a11f096a79c1b9cc06442f510b6f2fd65ab097dbecc8c78bd
+size 9925735424
transformer/diffusion_pytorch_model-00002-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d2bb9a5d79cb17533bb26d0d686d6d38471c039ae2c298026e1abb4b7001c4c
+size 1316905404
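The two entries above are Git LFS pointer files rather than the weights themselves: `oid` is the SHA-256 of the actual shard and `size` its byte count. Once the shards have been downloaded, a quick integrity check could look like the following sketch (the relative paths assume the repository layout shown in this commit).

```python
import hashlib
from pathlib import Path

# Expected digests and sizes copied from the LFS pointer files in this commit.
EXPECTED = {
    "transformer/diffusion_pytorch_model-00001-of-00002.safetensors":
        ("a6d0c109ec3c1d8a11f096a79c1b9cc06442f510b6f2fd65ab097dbecc8c78bd", 9925735424),
    "transformer/diffusion_pytorch_model-00002-of-00002.safetensors":
        ("9d2bb9a5d79cb17533bb26d0d686d6d38471c039ae2c298026e1abb4b7001c4c", 1316905404),
}

for rel_path, (oid, size) in EXPECTED.items():
    p = Path(rel_path)  # assumes the repo was downloaded into the current directory
    assert p.stat().st_size == size, f"size mismatch for {rel_path}"
    h = hashlib.sha256()
    with p.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)
    assert h.hexdigest() == oid, f"sha256 mismatch for {rel_path}"
    print(f"{rel_path}: OK")
```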
transformer/diffusion_pytorch_model.safetensors.index.json
ADDED
The diff for this file is too large to render.
See raw diff