---
license: mit
language:
- en
base_model:
- THUDM/CogVideoX-2b
- Fudan-FUXI/LiFT-Critic-40b-lora
pipeline_tag: text-to-video
---

# LiFT: Leveraging Human Feedback for Text-to-Video Model Alignment

CogVideoX-2B-LiFT is a fine-tuned version of CogVideoX-2B, aligned with human preferences via our reward-weighted learning method.
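The training code is not part of this card; as a rough, hypothetical sketch of what reward-weighted learning means here, the snippet below scales each sample's diffusion (denoising) loss by a normalized reward score, such as one produced by a reward model like LiFT-Critic. The function name, the softmax normalization, and the tensor shapes are illustrative assumptions, not the authors' implementation.

```python
import torch

def reward_weighted_loss(denoise_loss: torch.Tensor, rewards: torch.Tensor) -> torch.Tensor:
    """Hypothetical reward-weighted objective (illustration only).

    denoise_loss: (batch,) per-sample diffusion loss (e.g., MSE on predicted noise).
    rewards:      (batch,) scalar scores from a reward model; higher means better.
    """
    # Turn rewards into positive weights that sum to 1 over the batch,
    # so samples the critic prefers contribute more to the gradient.
    weights = torch.softmax(rewards, dim=0)
    return (weights * denoise_loss).sum()

# Toy usage with random numbers (no real model involved).
loss = reward_weighted_loss(torch.rand(4), torch.randn(4))
print(loss)
```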
## 🚀 Quick Start

We provide `cli_demo.py` so you can get started quickly.
```python
import argparse
from typing import Literal, Optional

import torch
from diffusers import CogVideoXDPMScheduler, CogVideoXPipeline
from diffusers.utils import export_to_video


def generate_video(
    prompt: str,
    model_path: str,
    lora_path: Optional[str] = None,
    lora_rank: int = 128,
    output_path: str = "./output.mp4",
    image_or_video_path: str = "",  # unused in this text-to-video-only demo
    num_inference_steps: int = 50,
    guidance_scale: float = 6.0,
    num_videos_per_prompt: int = 1,
    dtype: torch.dtype = torch.bfloat16,
    generate_type: Literal["t2v", "i2v", "v2v"] = "t2v",
    seed: int = 42,
):
    # Load the text-to-video pipeline with the requested precision.
    pipe = CogVideoXPipeline.from_pretrained(model_path, torch_dtype=dtype)

    # Optionally load LoRA weights and fuse them into the transformer.
    if lora_path:
        pipe.load_lora_weights(lora_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="test")
        pipe.fuse_lora(lora_scale=1 / lora_rank, components=["transformer"])

    # Use the DPM scheduler with trailing timestep spacing.
    pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")

    pipe.to("cuda")

    # Run inference; the pipeline returns a list of frame sequences, one per video.
    video_generate = pipe(
        prompt=prompt,
        num_videos_per_prompt=num_videos_per_prompt,
        num_inference_steps=num_inference_steps,
        num_frames=49,
        use_dynamic_cfg=True,
        guidance_scale=guidance_scale,
        generator=torch.Generator().manual_seed(seed),
    ).frames[0]

    # Save the generated frames as an MP4 file.
    export_to_video(video_generate, output_path, fps=8)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate a video from a text prompt using CogVideoX")
    parser.add_argument(
        "--model_path", type=str, default="Fudan-FUXI/CogVideoX-2B-LiFT", help="The path of the pre-trained model to be used"
    )
    parser.add_argument(
        "--prompt", type=str, default="A girl riding a bike.", help="The description of the video to be generated"
    )
    parser.add_argument(
        "--output_path", type=str, default="./output.mp4", help="The path where the generated video will be saved"
    )
    parser.add_argument(
        "--num_inference_steps", type=int, default=50, help="Number of steps for the inference process"
    )
    parser.add_argument(
        "--dtype", type=str, default="float16", help="The data type for computation (e.g., 'float16' or 'bfloat16')"
    )
    parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility")

    args = parser.parse_args()
    dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
    generate_video(
        prompt=args.prompt,
        model_path=args.model_path,
        output_path=args.output_path,
        num_inference_steps=args.num_inference_steps,
        dtype=dtype,
        generate_type="t2v",
        seed=args.seed,
    )
```
Running the script:
```bash
python cli_demo.py --prompt "A girl riding a bike." --model_path Fudan-FUXI/CogVideoX-2B-LiFT
```
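If you run into GPU out-of-memory errors, diffusers provides optional offloading and VAE slicing/tiling helpers that are commonly used with CogVideoX pipelines. This is not part of the original demo and the exact calls depend on your diffusers version; a minimal sketch:

```python
import torch
from diffusers import CogVideoXPipeline

pipe = CogVideoXPipeline.from_pretrained("Fudan-FUXI/CogVideoX-2B-LiFT", torch_dtype=torch.float16)

# With sequential CPU offload enabled, skip `pipe.to("cuda")`:
# submodules are streamed to the GPU on demand, trading speed for memory.
pipe.enable_sequential_cpu_offload()
pipe.vae.enable_slicing()  # decode the latent video in slices
pipe.vae.enable_tiling()   # decode frames in tiles to cap peak VAE memory
```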
## 🖊️ Citation

If you find our work helpful, please cite our paper.
```bibtex
@article{LiFT,
  title={LiFT: Leveraging Human Feedback for Text-to-Video Model Alignment},
  author={Wang, Yibin and Tan, Zhiyu and Wang, Junyan and Yang, Xiaomeng and Jin, Cheng and Li, Hao},
  journal={arXiv preprint arXiv:2412.04814},
  year={2024}
}
```