beomgyu-kim committed
Commit 6cd6a16 · 1 Parent(s): 0e812de

Add MotifVision model for text-to-image generation

- Implemented the MotifVision class, combining a diffusion transformer (DiT) with a rectified flow loss and multiple text encoders.
- Integrated a Variational Autoencoder (VAE) for image encoding and decoding.
- Added methods for tokenization, text encoding, and sampling images with flow matching.
- Included support for multiple text encoders: T5 and CLIP models.
- Implemented handling of LoRA parameters during state-dict loading.
- Added an evaluation method to compute the loss during inference.
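For orientation, a hedged end-to-end sketch of how the pieces added in this commit fit together (the config path, batch size, and prompts are illustrative assumptions; a CUDA device is assumed because several modules call .cuda() internally):

# Sketch only: mirrors the intended training-time usage of MotifVision described above.
import torch
from configs.configuration_mmdit import MMDiTConfig
from models.modeling_motif_vision import MotifVision

config = MMDiTConfig.from_json_file("configs/mmdit_xlarge_hq.json")
model = MotifVision(config).cuda()  # also downloads and freezes the VAE, T5, CLIP-L and CLIP-G encoders

images = torch.rand(2, 3, config.height, config.width).cuda()  # [0, 1]-ranged training images
captions = ["a photo of a red fox", "a watercolor lighthouse at dusk"]

loss = model(images, captions)[0]  # forward() returns a single-element list with the rectified flow loss
loss.backward()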

configs/configuration_mmdit.py ADDED
@@ -0,0 +1,89 @@
1
+ import json
2
+ from dataclasses import dataclass
3
+
4
+ ENCODED_TEXT_DIM = 4096
5
+ POOLED_TEXT_DIM = 2048
6
+ VAE_COMPRESSION_RATIO = 8
7
+
8
+
9
+ @dataclass
10
+ class MMDiTConfig:
11
+ # General
12
+ num_layers: int = 12
13
+ hidden_dim: int = 768 # common hidden dimension for the transformer arch
14
+ patch_size: int = 2
15
+ image_dim: int = 224
16
+ in_channel: int = 4
17
+ out_channel: int = 4
18
+ modulation_dim: int = ENCODED_TEXT_DIM # input dimension for modulation layer (shifting and scaling)
19
+ height: int = 1024
20
+ width: int = 1024
21
+ vae_compression: int = VAE_COMPRESSION_RATIO # reducing resolution with the VAE
22
+ vae_type: str = "SD3" # SDXL or SD3
23
+ pos_emb_size: int = None
24
+ conv_header: bool = False
25
+
26
+ # Outside of the MMDiT block
27
+ time_embed_dim: int = 2048 # Initial projection (discrete_time embedding) output dim
28
+ pooled_text_dim: int = POOLED_TEXT_DIM
29
+ text_emb_dim: int = 768
30
+
31
+ # MMDiTBlock
32
+ t_emb_dim: int = 256
33
+ attn_embed_dim: int = 768 # hidden dimension during the attention
34
+ mlp_hidden_dim: int = 2048
35
+ attn_mode: str = None # {'flash', 'sdpa', None}
36
+ use_final_layer_norm: bool = False
37
+ use_time_token_in_attn: bool = False
38
+
39
+ # GroupedQueryAttention
40
+ num_attention_heads: int = 12
41
+ num_key_value_heads: int = 6
42
+ use_scaled_dot_product_attention: bool = True
43
+ dropout: float = 0.0
44
+
45
+ # Modulation
46
+ use_modulation: bool = True
47
+ modulation_type: str = "film" # Choose from 'film', 'adain', or 'spade'
48
+
49
+ # Register tokens
50
+ register_token_num: int = 4
51
+ additional_register_token_num: int = 12
52
+
53
+ # use dinov2 feature-align loss
54
+ dinov2_feature_align_loss: bool = False
55
+ feature_align_loss_weight: float = 0.5
56
+ num_feature_align_layers: int = 8 # number of transformer layers to calculate feature-align loss
57
+
58
+ # Personalization related
59
+ image_encoder_name: str = None # if set, the personalized image encoder will be loaded
60
+ freeze_dit_backbone: bool = False
61
+
62
+ # Preference optimization
63
+ preference_train: bool = False
64
+ lora_rank: int = 64
65
+ lora_alpha: int = 8
66
+
67
+ skip_register_token_num: int = 0
68
+
69
+ @classmethod
70
+ def from_json_file(cls, json_file):
71
+ """
72
+ Instantiates an [`MMDiTConfig`] from the path to a JSON file of parameters.
73
+
74
+ Args:
75
+ json_file (`str` or `os.PathLike`):
76
+ Path to the JSON file containing the parameters.
77
+
78
+ Returns:
79
+ [`MMDiTConfig`]: The configuration object instantiated from that JSON file.
80
+
81
+ """
82
+ config_dict = cls._dict_from_json_file(json_file)
83
+ return cls(**config_dict)
84
+
85
+ @classmethod
86
+ def _dict_from_json_file(cls, json_file):
87
+ with open(json_file, "r", encoding="utf-8") as reader:
88
+ text = reader.read()
89
+ return json.loads(text)
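
A short usage sketch for this config class (the JSON path points at the file added below; dataclasses.replace is standard-library behavior, not something defined in this commit):

from dataclasses import replace

from configs.configuration_mmdit import MMDiTConfig

cfg = MMDiTConfig.from_json_file("configs/mmdit_xlarge_hq.json")  # keys absent from the JSON keep their dataclass defaults
cfg_512 = replace(cfg, height=512, width=512)                     # derive a lower-resolution variant without mutating cfg
print(cfg_512.num_layers, cfg_512.height, cfg_512.vae_compression)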
configs/mmdit_xlarge_hq.json ADDED
@@ -0,0 +1,26 @@
1
+ {
2
+ "num_layers": 30,
3
+ "hidden_dim": 1920,
4
+ "patch_size": 2,
5
+ "in_channel": 4,
6
+ "out_channel": 4,
7
+ "time_embed_dim": 4096,
8
+ "attn_embed_dim": 4096,
9
+ "num_attention_heads": 30,
10
+ "num_key_value_heads": 30,
11
+ "use_scaled_dot_product_attention": true,
12
+ "dropout": 0.0,
13
+ "mlp_hidden_dim": 7680,
14
+ "use_modulation": true,
15
+ "modulation_type": "film",
16
+ "register_token_num": 4,
17
+ "additional_register_token_num": 0,
18
+ "skip_register_token_num": 0,
19
+ "height": 1024,
20
+ "width": 1024,
21
+ "attn_mode": "flash",
22
+ "use_final_layer_norm": false,
23
+ "pos_emb_size": 64,
24
+ "conv_header": false,
25
+ "use_time_token_in_attn": true
26
+ }
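
A quick sanity check of the dimensions this config implies, sketched the same way MotifDiT and JointAttn consume the fields (the arithmetic below is illustrative, not part of the commit):

from configs.configuration_mmdit import MMDiTConfig

cfg = MMDiTConfig.from_json_file("configs/mmdit_xlarge_hq.json")

head_dim = cfg.hidden_dim // cfg.num_attention_heads  # 1920 // 30 = 64; must divide evenly for JointAttn
latent_hw = cfg.height // cfg.vae_compression         # 1024 // 8 = 128 with the SD3 VAE
num_patches = (latent_hw // cfg.patch_size) ** 2      # (128 // 2) ** 2 = 4096 image tokens per sample
print(head_dim, latent_hw, num_patches)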
models/mixin/encoder_mixin.py ADDED
@@ -0,0 +1,222 @@
1
+ import torch
2
+ from diffusers.models import AutoencoderKL
3
+
4
+ from transformers import CLIPTextModel, CLIPTokenizerFast, T5EncoderModel, T5Tokenizer
5
+
6
+
7
+ class EncoderMixin:
8
+ """Mixin class for handling various encoders in the MotifDiT model.
9
+
10
+ This mixin provides functionality for:
11
+ 1. Loading and initializing encoders (VAE, T5, CLIP-L, CLIP-G)
12
+ 2. Text tokenization and encoding
13
+ 3. Managing encoder parameters and state
14
+ """
15
+
16
+ TOKEN_MAX_LENGTH: int = 256
17
+
18
+ def prepare_embeddings(
19
+ self,
20
+ images: torch.Tensor,
21
+ raw_text: list[str],
22
+ vae: AutoencoderKL,
23
+ t5: T5EncoderModel,
24
+ clip_l: CLIPTextModel,
25
+ clip_g: CLIPTextModel,
26
+ t5_tokenizer: T5Tokenizer,
27
+ clip_l_tokenizer: CLIPTokenizerFast,
28
+ clip_g_tokenizer: CLIPTokenizerFast,
29
+ is_training,
30
+ ) -> tuple[torch.Tensor, list[torch.Tensor], torch.Tensor]:
31
+ """Prepare image latents and text embeddings for model input.
32
+
33
+ Args:
34
+ images (torch.Tensor): Input images tensor with shape [B, C=3, H, W].
35
+ raw_text (List[str]): List of raw text strings with length B.
36
+ """
37
+ with torch.no_grad():
38
+ latents: torch.Tensor = (
39
+ vae.encode(images).latent_dist.sample() - vae.config.shift_factor
40
+ ) * vae.config.scaling_factor # Latents shape: [B, 16, H//8, W//8]
41
+
42
+ # Tokenize the input text and move tokens and masks to the same device as latents
43
+ tokenizers = [t5_tokenizer, clip_l_tokenizer, clip_g_tokenizer]
44
+ tokens, masks = self.tokenization(raw_text, tokenizers)
45
+ tokens = [token.to(latents.device) for token in tokens]
46
+ masks = [mask.to(latents.device) for mask in masks]
47
+
48
+ # Encode the text and drop unnecessary embeddings
49
+ text_embeddings, pooled_text_embeddings = self.text_encoding(
50
+ tokens,
51
+ masks,
52
+ t5,
53
+ clip_l,
54
+ clip_g,
55
+ t5_tokenizer.pad_token_id,
56
+ clip_l_tokenizer.eos_token_id,
57
+ clip_g_tokenizer.eos_token_id,
58
+ is_training,
59
+ )
60
+ text_embeddings = self.drop_text_emb(text_embeddings)
61
+
62
+ # Convert text embeddings to float
63
+ text_embeddings = [text_embedding.float() for text_embedding in text_embeddings]
64
+
65
+ # Convert pooled text embeddings to float
66
+ pooled_text_embeddings = pooled_text_embeddings.float()
67
+
68
+ return latents, text_embeddings, pooled_text_embeddings
69
+
70
+ def get_freezed_encoders_and_tokenizers(
71
+ self, vae_type: str
72
+ ) -> tuple[
73
+ AutoencoderKL, T5EncoderModel, CLIPTextModel, CLIPTextModel, T5Tokenizer, CLIPTokenizerFast, CLIPTokenizerFast
74
+ ]:
75
+ """Initialize the VAE and text encoders."""
76
+ if vae_type != "SD3":
77
+ raise ValueError(
78
+ f"VAE type must be `SD3` but self.config.vae_type is {vae_type}."
79
+ f" note that the VAE type SDXL is deprecated."
80
+ )
81
+
82
+ vae: AutoencoderKL = AutoencoderKL.from_pretrained(
83
+ "stabilityai/stable-diffusion-3-medium-diffusers", subfolder="vae"
84
+ )
85
+
86
+ # Text encoders
87
+ # 1. T5-XXL from Google
88
+ t5 = T5EncoderModel.from_pretrained("google/flan-t5-xxl").to(dtype=torch.bfloat16)
89
+ t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xxl")
90
+
91
+ # 2. CLIP-L from OpenAI
92
+ clip_l = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(dtype=torch.bfloat16)
93
+ clip_l_tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
94
+
95
+ # 3. CLIP-G from LAION
96
+ clip_g = CLIPTextModel.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k").to(dtype=torch.bfloat16)
97
+ clip_g_tokenizer = CLIPTokenizerFast.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
98
+
99
+ # Freeze all encoders
100
+
101
+ for encoder_module in [vae, clip_l, clip_g, t5]:
102
+ for param in encoder_module.parameters():
103
+ param.requires_grad = False
104
+
105
+ return vae, t5, clip_l, clip_g, t5_tokenizer, clip_l_tokenizer, clip_g_tokenizer
106
+
107
+ def tokenization(
108
+ self, raw_text: list[str], tokenizers: list[T5Tokenizer | CLIPTokenizerFast]
109
+ ) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
110
+ """Tokenize the input text using multiple tokenizers.
111
+
112
+ Args:
113
+ raw_text (list[str]): List of input text strings.
114
+
115
+ Returns:
116
+ Tuple[List[torch.Tensor], List[torch.Tensor]]: Lists of tokenized text tensors and attention masks.
117
+ """
118
+ tokens, masks = [], []
119
+ for tokenizer in tokenizers:
120
+ tok = tokenizer(
121
+ raw_text,
122
+ padding="max_length",
123
+ max_length=min(EncoderMixin.TOKEN_MAX_LENGTH, tokenizer.model_max_length),
124
+ return_tensors="pt",
125
+ truncation=True,
126
+ )
127
+ tokens.append(tok.input_ids)
128
+ masks.append(tok.attention_mask)
129
+ return tokens, masks
130
+
131
+ @torch.no_grad()
132
+ def text_encoding(
133
+ self,
134
+ tokens: list[torch.Tensor],
135
+ masks: list[torch.Tensor],
136
+ t5: T5EncoderModel,
137
+ clip_l: CLIPTextModel,
138
+ clip_g: CLIPTextModel,
139
+ t5_pad_token_id: int = 0,
140
+ clip_l_tokenizer_eos_token_id: int = 49407,
141
+ clip_g_tokenizer_eos_token_id: int = 49407,
142
+ is_training: bool = False,
143
+ ) -> tuple[list[torch.Tensor], torch.Tensor]:
144
+ """Encode the tokenized text using multiple text encoders.
145
+
146
+ Args:
147
+ tokens (List[torch.Tensor]): List of tokenized text tensors.
148
+ masks (List[torch.Tensor]): List of attention masks.
149
+
150
+ Returns:
151
+ Tuple[List[torch.Tensor], torch.Tensor]: Text embeddings and pooled text embeddings.
152
+ """
153
+ t5_tokens, clip_l_tokens, clip_g_tokens = tokens
154
+ t5_masks, _, _ = masks
155
+
156
+ # T5 encoding
157
+ t5_emb = t5(t5_tokens, attention_mask=t5_masks)[0]
158
+ t5_emb = t5_emb * (t5_tokens != t5_pad_token_id).unsqueeze(-1)
159
+
160
+ # CLIP encodings
161
+ clip_l_emb = clip_l(input_ids=clip_l_tokens, output_hidden_states=True)
162
+ clip_g_emb = clip_g(input_ids=clip_g_tokens, output_hidden_states=True)
163
+
164
+ # Get pooled outputs
165
+ clip_l_emb_pooled = clip_l_emb.pooler_output # B x 768
166
+ clip_g_emb_pooled = clip_g_emb.pooler_output # B x 1280
167
+
168
+ if is_training:
169
+ clip_l_emb_pooled = self.drop_text_emb(clip_l_emb_pooled)
170
+ clip_g_emb_pooled = self.drop_text_emb(clip_g_emb_pooled)
171
+
172
+ clip_l_emb = clip_l_emb.last_hidden_state # B x L x 768
173
+ clip_g_emb = clip_g_emb.last_hidden_state # B x L x 1280
174
+
175
+ def masking_wo_first_eos(token, eos):
176
+ """Create attention mask without first EOS token."""
177
+ idx = (token != eos).sum(dim=1)
178
+ mask = token != eos
179
+ arange = torch.arange(mask.size(0), device=token.device)
180
+ valid = idx < mask.size(1) # guard when no EOS is present; also works for batch sizes > 1
181
+ mask[arange[valid], idx[valid]] = True
182
+ return mask.unsqueeze(-1) # B x L x 1
183
+
184
+ # Apply masking
185
+ clip_l_emb = clip_l_emb * masking_wo_first_eos(clip_l_tokens, clip_l_tokenizer_eos_token_id)
186
+ clip_g_emb = clip_g_emb * masking_wo_first_eos(clip_g_tokens, clip_g_tokenizer_eos_token_id)
187
+
188
+ encodings = [t5_emb, clip_l_emb, clip_g_emb]
189
+ pooled_encodings = torch.cat([clip_l_emb_pooled, clip_g_emb_pooled], dim=-1) # B x 2048
190
+
191
+ return encodings, pooled_encodings
192
+
193
+ @torch.no_grad()
194
+ def drop_text_emb(
195
+ self, text_embeddings: list[torch.Tensor] | torch.Tensor, drop_prob: float = 0.464
196
+ ) -> list[torch.Tensor] | torch.Tensor:
197
+ """Randomly drop text embeddings with a specified probability.
198
+
199
+ Args:
200
+ text_embeddings (Union[List[torch.Tensor], torch.Tensor]): Text embeddings to be dropped.
201
+ drop_prob (float, optional): Probability of dropping text embeddings. Defaults to 0.464.
202
+
203
+ Returns:
204
+ Union[List[torch.Tensor], torch.Tensor]: Text embeddings with dropped elements.
205
+ """
206
+ if isinstance(text_embeddings, list):
207
+ # For BxLxC features
208
+ for i, text_embedding in enumerate(text_embeddings):
209
+ probs = torch.ones((text_embedding.shape[0])).cuda() * (1 - drop_prob)
210
+ masks = torch.bernoulli(probs).cuda()
211
+ while len(masks.shape) < len(text_embedding.shape):
212
+ masks = masks.unsqueeze(-1)
213
+ text_embeddings[i] = text_embedding * masks # write back so the drop actually takes effect
214
+ else:
215
+ # For a pooled BxC feature
216
+ probs = torch.ones((text_embeddings.shape[0])).cuda() * (1 - drop_prob)
217
+ masks = torch.bernoulli(probs).cuda()
218
+ while len(masks.shape) < len(text_embeddings.shape):
219
+ masks = masks.unsqueeze(-1)
220
+ text_embeddings = text_embeddings * masks
221
+
222
+ return text_embeddings
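
A small, self-contained sketch of the text side of this mixin, showing the token shapes that feed text_encoding (checkpoint downloads are assumed to be available; the prompts are illustrative):

from transformers import CLIPTokenizerFast, T5Tokenizer

t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xxl")
clip_l_tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")

prompts = ["a corgi surfing a wave", "an isometric city at night"]
for tokenizer in (t5_tokenizer, clip_l_tokenizer):
    out = tokenizer(
        prompts,
        padding="max_length",
        max_length=min(256, tokenizer.model_max_length),  # TOKEN_MAX_LENGTH caps T5 at 256; CLIP stays at 77
        truncation=True,
        return_tensors="pt",
    )
    print(out.input_ids.shape, out.attention_mask.shape)  # [2, 256] for T5, [2, 77] for CLIP-L

The resulting T5 embeddings are [B, 256, 4096], CLIP-L gives [B, 77, 768], CLIP-G gives [B, 77, 1280], and the two pooled CLIP outputs are concatenated into the [B, 2048] vector used for modulation.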
models/mixin/flow_mixin.py ADDED
@@ -0,0 +1,175 @@
1
+ """
2
+ Dongpin Oh: [email protected]
3
+ """
4
+
5
+ import time
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from diffusers.utils.torch_utils import randn_tensor
11
+ from loguru import logger
12
+
13
+
14
+ class FlowMixin:
15
+ """Mixin class for flow-based models."""
16
+
17
+ _is_noise_initialized = False
18
+ MIN_STD: float = 0.0 # minimum size of std for the flow matching
19
+ CLAMP_CONTINUOUS_TIME: float = 0.0
20
+
21
+ def _timestep_shifting(self, timesteps: torch.Tensor, alpha: float = 3.0) -> torch.Tensor:
22
+ """
23
+ Adjust the timesteps for higher resolution images by adding more noise.
24
+ Higher resolutions have more pixels, so more noise is needed to destroy their signal.
25
+
26
+ NOTE that the timestep must be reversed unlike original SD3 style timestep shifting,
27
+ since the flow-timestep is reversed (t=0: original image; t=1: pure noise)
28
+ Args:
29
+ timesteps (torch.Tensor): The original timesteps.
30
+ alpha (float, optional): Scaling factor for timestep shifting. Defaults to 3.0.
31
+
32
+ Returns:
33
+ torch.Tensor: The reversed and shifted timesteps.
34
+ """
35
+ shifted_t = alpha * timesteps / (1 + (alpha - 1) * timesteps)
36
+ reversed_t = 1 - shifted_t
37
+ return reversed_t
38
+
39
+ def get_noisy_input(
40
+ self,
41
+ input: torch.Tensor,
42
+ normal_mean: float = 0.0,
43
+ normal_std: float = 1.0,
44
+ is_finetuning: bool = False,
45
+ t: torch.Tensor | None = None,
46
+ n_timesteps: int = 1000,
47
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
48
+ """
49
+ Generate noisy input, based on the optimal-transport flow.
50
+
51
+ Args:
52
+ input (torch.Tensor): Input tensor.
53
+ normal_mean (float, optional): Mean of the normal distribution for noise. Defaults to 0.0.
54
+ normal_std (float, optional): Standard deviation of the normal distribution for noise. Defaults to 1.0.
55
+ is_finetuning (bool, optional): Whether the model is in finetuning mode. Defaults to False.
56
+ t (torch.Tensor | None, optional): Predefined timesteps. If None, timesteps are sampled. Defaults to None.
57
+ n_timesteps (int, optional): Number of discrete timesteps. Defaults to 1000.
58
+
59
+ Returns:
60
+ tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Tuple containing noise, noisy input, and timestep.
61
+ - The timestep t is a continuous timestep ranged [0, 1] which cannot directly be used
62
+ for the timestep embedding (needs to be discretized by self.discritize_timestep()).
63
+ """
64
+ b = input.shape[0]
65
+
66
+ if not FlowMixin._is_noise_initialized:
67
+ logger.warning("The torch random seed is changed when generating the initial random noise.")
68
+ current_time = int(time.time() * 1000)
69
+ torch.manual_seed(current_time)
70
+ FlowMixin._is_noise_initialized = True
71
+
72
+ noise = randn_tensor(input.shape).cuda()
73
+
74
+ # Sample timestep from a logit-normal distribution (sigmoid of a Gaussian) with mean 0 and std 1
75
+ if t is None:
76
+ # NOTE: timestep is sampled from a logit-normal distribution to make the model
77
+ # focus on the intermediate timesteps, which are the most informative part of flow-ODE
78
+ t = torch.randn(b).cuda() * normal_std + normal_mean
79
+ t = torch.sigmoid(t)
80
+ else:
81
+ t = t
82
+
83
+ # Clamp t to be within the interval [0, 1] for numerical stability
84
+ t = t.clamp(0 + self.CLAMP_CONTINUOUS_TIME, 1 - self.CLAMP_CONTINUOUS_TIME)
85
+
86
+ if is_finetuning:
87
+ t = self._timestep_shifting(t)
88
+
89
+ # Reshape t to match the dimensions required
90
+ for _ in range(len(noise.shape) - 1):
91
+ t = t.unsqueeze(1)
92
+
93
+ # Generate the noisy input
94
+ noisy_input = (1 - t) * input + (self.MIN_STD + (1 - self.MIN_STD) * t) * noise
95
+
96
+ t_squeezed = t.squeeze()
97
+ if t_squeezed.dim() == 0:
98
+ t_squeezed = t_squeezed.unsqueeze(0)
99
+ return noise, noisy_input, t_squeezed
100
+
101
+ @torch.no_grad()
102
+ def _logit_norm(self, t: torch.Tensor, m: float = 0, s: float = 1) -> torch.Tensor:
103
+ """
104
+ Compute the loss-weight for the flow-matching loss.
105
+ It focuses (gives high weight) on intermediate timesteps, since
106
+ such timesteps are the hardest to match, according to https://arxiv.org/pdf/2403.03206.pdf
107
+ Args:
108
+ t (torch.Tensor): Timestep tensor. (0 to 1)
109
+ m (float, optional): Mean of the logit distribution. Defaults to 0.
110
+ s (float, optional): Standard deviation of the logit distribution. Defaults to 1.
111
+
112
+ Returns:
113
+ torch.Tensor: Weight tensor for the flow-matching loss.
114
+ """
115
+ coef = (1 / (s * ((2 * np.pi) ** 0.5))) * (1 / (t * (1 - t)))
116
+
117
+ def logit(x):
118
+ return torch.log(x) - torch.log(1 - x)
119
+
120
+ exp = torch.exp(-((logit(t) - m) ** 2) / (2 * s**2))
121
+ return coef * exp
122
+
123
+ def rectified_flow_loss(
124
+ self,
125
+ input: torch.Tensor,
126
+ noise: torch.Tensor,
127
+ t: torch.Tensor,
128
+ preds: torch.Tensor,
129
+ use_weighting: bool = False,
130
+ reduce: str = "mean",
131
+ ) -> torch.Tensor:
132
+ """
133
+ Compute the rectified flow loss, https://arxiv.org/pdf/2403.03206.pdf
134
+
135
+ Args:
136
+ input (torch.Tensor): Input tensor.
137
+ noise (torch.Tensor): Noise tensor.
138
+ t (torch.Tensor): Timestep tensor.
139
+ preds (torch.Tensor): Predicted tensor.
140
+ use_weighting (bool, optional): Whether to use weighting for the loss. Defaults to False.
141
+ reduce (str, optional): Reduction method for the loss. Options are 'mean' or 'none'. Defaults to 'mean'.
142
+
143
+ Returns:
144
+ torch.Tensor: Rectified flow loss.
145
+ """
146
+
147
+ # Matching dimension for broadcasting
148
+ t = t.reshape(t.shape[0], *[1 for _ in range(len(input.shape) - len(t.shape))])
149
+
150
+ target_flow = (1 - self.MIN_STD) * noise - input
151
+ loss = F.mse_loss(preds.float(), target_flow.float(), reduction="none")
152
+ if use_weighting:
153
+ weight = self._logit_norm(t).detach()
154
+ loss = loss * weight
155
+ if reduce == "mean":
156
+ loss = loss.mean()
157
+ elif reduce == "none":
158
+ loss = loss
159
+ else:
160
+ raise NotImplementedError
161
+
162
+ return loss
163
+
164
+ def discritize_timestep(self, t: torch.Tensor, n_timesteps: int = 1000) -> torch.Tensor:
165
+ """
166
+ Discretize the continuous timestep.
167
+
168
+ Args:
169
+ t (torch.Tensor): Continuous timestep.
170
+ n_timesteps (int, optional): Number of discrete timesteps. Defaults to 1000.
171
+
172
+ Returns:
173
+ torch.Tensor: Discretized timestep tensor.
174
+ """
175
+ return (t * n_timesteps).round().long()
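
For reference, the interpolation and regression target used above reduce to a few lines of tensor algebra when MIN_STD = 0; the sketch below is illustrative (random tensors stand in for real latents and model predictions):

import torch
import torch.nn.functional as F

x = torch.randn(4, 16, 32, 32)       # "clean" VAE latents
noise = torch.randn_like(x)
t = torch.sigmoid(torch.randn(4))    # logit-normal timestep sampling, as in get_noisy_input
t_b = t.view(-1, 1, 1, 1)

noisy = (1 - t_b) * x + t_b * noise  # straight-line interpolant with MIN_STD = 0
target_flow = noise - x              # what rectified_flow_loss regresses the DiT prediction onto
pred = torch.zeros_like(x)           # stand-in for a model output
print(F.mse_loss(pred, target_flow).item())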
models/modeling_dit.py ADDED
@@ -0,0 +1,653 @@
1
+ import math
2
+ from typing import List
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
9
+ from loguru import logger
10
+
11
+ try:
12
+ motif_ops = torch.ops.motif
13
+ MotifRMSNorm = motif_ops.T5LayerNorm
14
+ ScaledDotProductAttention = None
15
+ MotifFlashAttention = motif_ops.flash_attention
16
+ except ImportError: # if motif_ops is not available
17
+ MotifRMSNorm = None
18
+ ScaledDotProductAttention = None
19
+ MotifFlashAttention = None
20
+
21
+ NUM_MODULATIONS = 6
22
+ SD3_LATENT_CHANNEL = 16
23
+ LOW_RES_POSEMB_BASE_SIZE = 16
24
+ HIGH_RES_POSEMB_BASE_SIZE = 64
25
+
26
+ class IdentityConv2d(nn.Module):
27
+ def __init__(self, channels, kernel_size=3, stride=1, padding=1, bias=True):
28
+ super().__init__()
29
+
30
+ self.conv = nn.Conv2d(
31
+ in_channels=channels,
32
+ out_channels=channels,
33
+ kernel_size=kernel_size,
34
+ stride=stride,
35
+ padding=padding,
36
+ bias=bias,
37
+ )
38
+
39
+ self._initialize_identity()
40
+
41
+ def _initialize_identity(self):
42
+ k = self.conv.kernel_size[0]
43
+
44
+ nn.init.zeros_(self.conv.weight)
45
+
46
+ center = k // 2
47
+ for i in range(self.conv.in_channels):
48
+ self.conv.weight.data[i, i, center, center] = 1.0
49
+
50
+ if self.conv.bias is not None:
51
+ nn.init.zeros_(self.conv.bias)
52
+
53
+ def forward(self, x):
54
+ return self.conv(x)
55
+
56
+
57
+ class RMSNorm(nn.Module):
58
+ def __init__(self, hidden_size, eps=1e-6):
59
+ """
60
+ LlamaRMSNorm is equivalent to T5LayerNorm
61
+ """
62
+ super().__init__()
63
+ self.weight = nn.Parameter(torch.ones(hidden_size))
64
+ self.variance_epsilon = eps
65
+ self.mask = None
66
+
67
+ def forward(self, hidden_states):
68
+ input_dtype = hidden_states.dtype
69
+ hidden_states = hidden_states.to(torch.float)
70
+ if self.mask is not None:
71
+ hidden_states = self.mask.to(hidden_states.device).to(hidden_states.dtype) * hidden_states
72
+ variance = hidden_states.pow(2).sum(-1, keepdim=True)
73
+ if self.mask is not None:
74
+ variance /= torch.count_nonzero(self.mask)
75
+ else:
76
+ variance /= hidden_states.shape[-1]
77
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
78
+ return self.weight * hidden_states.to(input_dtype)
79
+
80
+
81
+ class MLP(nn.Module):
82
+ def __init__(self, input_size, hidden_size=None):
83
+ super().__init__()
84
+ if hidden_size is None:
85
+ self.input_size, self.hidden_size = input_size, input_size * 4
86
+ else:
87
+ self.input_size, self.hidden_size = input_size, hidden_size
88
+
89
+ self.gate_proj = nn.Linear(self.input_size, self.hidden_size)
90
+ self.down_proj = nn.Linear(self.hidden_size, self.input_size)
91
+
92
+ self.act_fn = nn.SiLU()
93
+
94
+ def forward(self, x):
95
+ down_proj = self.act_fn(self.gate_proj(x))
96
+ down_proj = self.down_proj(down_proj)
97
+
98
+ return down_proj
99
+
100
+
101
+ class TextTimeEmbToGlobalParams(nn.Module):
102
+ def __init__(self, emb_dim, hidden_dim):
103
+ super().__init__()
104
+ self.projection = nn.Linear(emb_dim, hidden_dim * NUM_MODULATIONS)
105
+
106
+ def forward(self, emb):
107
+ emb = F.silu(emb) # emb: B x D
108
+ params = self.projection(emb) # emb: B x C
109
+ params = params.reshape(params.shape[0], NUM_MODULATIONS, params.shape[-1] // NUM_MODULATIONS) # emb: B x 6 x C
110
+ return params.chunk(6, dim=1) # [B x 1 x C] x 6
111
+
112
+
113
+ class TextTimeEmbedding(nn.Module):
114
+ """
115
+ Input:
116
+ pooled_text_emb (B x C_l)
117
+ time_steps (B)
118
+
119
+ Output:
120
+ combined embedding (B x embed_dim)
121
+ """
122
+
123
+ def __init__(self, time_channel, text_channel, embed_dim, flip_sin_to_cos=True, downscale_freq_shift=0):
124
+ super().__init__()
125
+ self.time_proj = Timesteps(
126
+ time_channel, flip_sin_to_cos=flip_sin_to_cos, downscale_freq_shift=downscale_freq_shift
127
+ )
128
+ self.time_emb = TimestepEmbedding(time_channel, time_channel * 4, out_dim=embed_dim) # Encode time emb with MLP
129
+ self.pooled_text_emb = TimestepEmbedding(
130
+ text_channel, text_channel * 4, out_dim=embed_dim
131
+ ) # Encode pooled text with MLP
132
+
133
+ def forward(self, pooled_text_emb, time_steps):
134
+ time_steps = self.time_proj(time_steps)
135
+ time_emb = self.time_emb(time_steps.to(dtype=torch.bfloat16))
136
+ pooled_text_emb = self.pooled_text_emb(pooled_text_emb)
137
+
138
+ return time_emb + pooled_text_emb
139
+
140
+
141
+ class LatentPatchModule(nn.Module):
142
+ def __init__(self, patch_size, embedding_dim, latent_channels, vae_type):
143
+ super().__init__()
144
+ self.patch_size = patch_size
145
+ self.embedding_dim = embedding_dim
146
+ self.projection_SD3 = nn.Conv2d(SD3_LATENT_CHANNEL, embedding_dim, kernel_size=patch_size, stride=patch_size)
147
+ self.latent_channels = latent_channels
148
+
149
+ def forward(self, x):
150
+ assert (
151
+ x.shape[1] == SD3_LATENT_CHANNEL
152
+ ), f"VAE-Latent channel is not matched with '{SD3_LATENT_CHANNEL}'. current shape: {x.shape}"
153
+ patches = self.projection_SD3(
154
+ x.to(dtype=torch.bfloat16)
155
+ ) # Shape: (B, embedding_dim, num_patches_h, num_patches_w)
156
+ patches = patches.to(dtype=torch.bfloat16)
157
+ patches = patches.contiguous()
158
+ patches = patches.flatten(2) # Shape: (B, embedding_dim, num_patches)
159
+
160
+ patches = patches.transpose(1, 2) # Shape: (B, num_patches, embedding_dim)
161
+ patches = patches.contiguous()
162
+ return patches
163
+
164
+ def unpatchify(self, x):
165
+ """
166
+ x: (N, T, patch_size**2 * C)
167
+ imgs: (N, H, W, C)
168
+ """
169
+ n = x.shape[0]
170
+ c = self.latent_channels
171
+ p = self.patch_size
172
+
173
+ # check the valid patching
174
+ h = w = int(x.shape[1] ** 0.5)
175
+ assert h * w == x.shape[1]
176
+
177
+ x = x.contiguous()
178
+ # (N x T x [C * patch_size**2]) -> (N x H x W x P_1 x P_2 x C)
179
+ x = x.reshape(shape=(n, h, w, p, p, c))
180
+ # x = torch.einsum('nhwpqc->nchpwq', x) # NOTE: einsum may be the problem here, so permute is used instead
181
+
182
+ # (N x H x W x P_1 x P_2 x C) -> (N x C x H x P_1 x W x P_2)
183
+ # (0 . 1 . 2 . 3 . 4 . 5) -> (0 . 5 . 1 . 3 2 . 4 )
184
+ x = x.permute(0, 5, 1, 3, 2, 4)
185
+ return x.reshape(shape=(n, c, h * p, h * p)).contiguous()
186
+
187
+
188
+ class TextConditionModule(nn.Module):
189
+ def __init__(self, text_dim, latent_dim):
190
+ super().__init__()
191
+ self.projection = nn.Linear(text_dim, latent_dim)
192
+
193
+ def forward(self, t5_xxl, clip_a, clip_b):
194
+ clip_emb = torch.cat([clip_a, clip_b], dim=-1)
195
+ clip_emb = torch.nn.functional.pad(clip_emb, (0, t5_xxl.shape[-1] - clip_emb.shape[-1]))
196
+ text_emb = torch.cat([clip_emb, t5_xxl], dim=-2)
197
+ text_emb = self.projection(text_emb.to(torch.bfloat16))
198
+ return text_emb
199
+
200
+
201
+ class MotifDiTBlock(nn.Module):
202
+ def __init__(self, emb_dim, t_emb_dim, attn_emb_dim, mlp_dim, attn_config, text_dim=4096):
203
+ super().__init__()
204
+ self.affine_params_c = TextTimeEmbToGlobalParams(t_emb_dim, emb_dim)
205
+ self.affine_params_x = TextTimeEmbToGlobalParams(t_emb_dim, emb_dim)
206
+
207
+ self.norm_1_c = nn.LayerNorm(emb_dim, elementwise_affine=False)
208
+ self.norm_1_x = nn.LayerNorm(emb_dim, elementwise_affine=False)
209
+ self.linear_1_c = nn.Linear(emb_dim, attn_emb_dim)
210
+ self.linear_1_x = nn.Linear(emb_dim, attn_emb_dim)
211
+
212
+ self.attn = JointAttn(attn_config)
213
+ self.norm_2_c = nn.LayerNorm(emb_dim, elementwise_affine=False)
214
+ self.norm_2_x = nn.LayerNorm(emb_dim, elementwise_affine=False)
215
+ self.mlp_3_c = MLP(emb_dim, mlp_dim)
216
+ self.mlp_3_x = MLP(emb_dim, mlp_dim)
217
+
218
+ def forward(self, x_emb, c_emb, t_emb, perturbed=False):
219
+ """
220
+ x_emb (N, TOKEN_LENGTH x 2, C)
221
+ c_emb (N, T + REGISTER_TOKENS, C)
222
+ t_emb (N, modulation_dim)
223
+ """
224
+
225
+ device = x_emb.device
226
+
227
+ # get global affine transformation parameters
228
+ alpha_x, beta_x, gamma_x, delta_x, epsilon_x, zeta_x = self.affine_params_x(t_emb) # scale and shift for image
229
+ alpha_c, beta_c, gamma_c, delta_c, epsilon_c, zeta_c = self.affine_params_c(t_emb) # scale and shift for text
230
+
231
+ # projection and affine transform before attention
232
+ x_emb_pre_attn = self.linear_1_x((1 + alpha_x) * self.norm_1_x(x_emb) + beta_x)
233
+ c_emb_pre_attn = self.linear_1_c((1 + alpha_c) * self.norm_1_c(c_emb) + beta_c)
234
+
235
+ # attn_output, attn_weight (None), past_key_value (None)
236
+ x_emb_post_attn, c_emb_post_attn = self.attn(
237
+ x_emb_pre_attn, c_emb_pre_attn, perturbed
238
+ ) # mixed feature for both text and image (N, [T_x + T_c], C)
239
+
240
+ # scale with gamma and residual with the original inputs
241
+ x_emb_post_attn = x_emb_post_attn.to(gamma_x.device)
242
+ x_emb_post_attn = (1 + gamma_x) * x_emb_post_attn + x_emb # NOTE: nan loss for self.linear_2_x.bias
243
+ c_emb_post_attn = c_emb_post_attn.to(gamma_c.device)
244
+ c_emb_post_attn = (1 + gamma_c) * c_emb_post_attn + c_emb
245
+
246
+ # norm the features -> affine transform with modulation -> MLP
247
+ normalized_x_emb = self.norm_2_x(x_emb_post_attn).to(delta_x.device)
248
+ normalized_c_emb = self.norm_2_c(c_emb_post_attn).to(delta_c.device)
249
+ x_emb_final = self.mlp_3_x(delta_x * normalized_x_emb + epsilon_x)
250
+ c_emb_final = self.mlp_3_c(delta_c * normalized_c_emb + epsilon_c)
251
+
252
+ # final scaling with zeta and residual with the original inputs
253
+ x_emb_final = zeta_x.to(device) * x_emb_final.to(device) + x_emb.to(device)
254
+ c_emb_final = zeta_c.to(device) * c_emb_final.to(device) + c_emb.to(device)
255
+
256
+ return x_emb_final, c_emb_final
257
+
258
+
259
+ class MotifDiT(nn.Module):
260
+ ENCODED_TEXT_DIM = 4096
261
+
262
+ def __init__(self, config):
263
+ super(MotifDiT, self).__init__()
264
+ self.patch_size = config.patch_size
265
+ self.h, self.w = config.height // config.vae_compression, config.width // config.vae_compression
266
+
267
+ self.latent_chennels = 16
268
+
269
+ # Embedding for (1) text; (2) input image; (3) time
270
+ self.text_cond = TextConditionModule(self.ENCODED_TEXT_DIM, config.hidden_dim)
271
+ self.patching = LatentPatchModule(config.patch_size, config.hidden_dim, self.latent_chennels, config.vae_type)
272
+ self.time_emb = TextTimeEmbedding(config.time_embed_dim, config.pooled_text_dim, config.modulation_dim)
273
+
274
+ # main multi-modal DiT blocks
275
+ self.mmdit_blocks = nn.ModuleList(
276
+ [
277
+ MotifDiTBlock(
278
+ config.hidden_dim, config.modulation_dim, config.hidden_dim, config.mlp_hidden_dim, config
279
+ )
280
+ for layer_idx in range(config.num_layers)
281
+ ]
282
+ )
283
+
284
+ self.final_modulation = nn.Linear(config.modulation_dim, config.hidden_dim * 2)
285
+ self.final_linear_SD3 = nn.Linear(config.hidden_dim, SD3_LATENT_CHANNEL * config.patch_size**2)
286
+ self.skip_register_token_num = config.skip_register_token_num
287
+
288
+ if getattr(config, "pos_emb_size", None):
289
+ pos_emb_size = config.pos_emb_size
290
+ else:
291
+ pos_emb_size = HIGH_RES_POSEMB_BASE_SIZE if config.height > 512 else LOW_RES_POSEMB_BASE_SIZE
292
+ logger.info(f"Positional embedding of Motif-DiT is set to {pos_emb_size}")
293
+
294
+ self.pos_embed = torch.from_numpy(
295
+ get_2d_sincos_pos_embed(
296
+ config.hidden_dim, (self.h // self.patch_size, self.w // self.patch_size), base_size=pos_emb_size
297
+ )
298
+ ).to(device="cuda", dtype=torch.bfloat16)
299
+
300
+ # set register tokens (https://arxiv.org/abs/2309.16588)
301
+ if config.register_token_num > 0:
302
+ self.register_token_num = config.register_token_num
303
+ self.register_tokens = nn.Parameter(torch.randn(1, self.register_token_num, config.hidden_dim))
304
+ self.register_parameter("register_tokens", self.register_tokens)
305
+
306
+ # if needed, add additional register tokens for higher resolution training
307
+ self.additional_register_token_num = config.additional_register_token_num
308
+ if config.additional_register_token_num > 0:
309
+ self.register_tokens_highres = nn.Parameter(
310
+ torch.randn(1, self.additional_register_token_num, config.hidden_dim)
311
+ )
312
+ self.register_parameter("register_tokens_highres", self.register_tokens_highres)
313
+
314
+ if config.use_final_layer_norm:
315
+ self.final_norm = nn.LayerNorm(config.hidden_dim)
316
+
317
+ if config.conv_header:
318
+ logger.info("use convolution header after de-patching")
319
+ self.depatching_conv_header = IdentityConv2d(SD3_LATENT_CHANNEL)
320
+
321
+ if config.use_time_token_in_attn:
322
+ self.t_token_proj = nn.Linear(config.modulation_dim, config.hidden_dim)
323
+
324
+ def forward(self, latent, t, text_embs: List[torch.Tensor], pooled_text_embs, guiding_feature=None):
325
+ """
326
+ latent (torch.Tensor)
327
+ t (torch.Tensor)
328
+ text_embs (List[torch.Tensor])
329
+ pooled_text_embs (torch.Tensor)
330
+ """
331
+ # 1. get inputs for the MMDiT blocks
332
+ emb_c = self.text_cond(*text_embs) # (N, L, D), text conditions
333
+ emb_t = self.time_emb(pooled_text_embs, t).to(emb_c.device) # (N, D), time and pooled text conditions
334
+
335
+ emb_x = (self.patching(latent) + self.pos_embed).to(
336
+ emb_c.device
337
+ ) # (N, T, D), where T = H*W / (patch_size ** 2), input latent patches
338
+
339
+ # additional "register" tokens, to convey the global information and prevent high-norm abnormal patch
340
+ # see https://openreview.net/forum?id=2dnO3LLiJ1
341
+ if hasattr(self, "register_tokens"):
342
+ if hasattr(self, "register_tokens_highres"):
343
+ emb_x = torch.cat(
344
+ (
345
+ self.register_tokens_highres.expand(emb_x.shape[0], -1, -1),
346
+ self.register_tokens.expand(emb_x.shape[0], -1, -1),
347
+ emb_x,
348
+ ),
349
+ dim=1,
350
+ )
351
+ else:
352
+ emb_x = torch.cat((self.register_tokens.expand(emb_x.shape[0], -1, -1), emb_x), dim=1)
353
+
354
+ # time embedding into text embedding
355
+ if hasattr(self, "use_time_token_in_attn"):
356
+ t_token = self.t_token_proj(emb_t).unsqueeze(1)
357
+ emb_c = torch.cat([emb_c, t_token], dim=1) # (N, [T_c + 1], C)
358
+
359
+ # 2. MMDiT Blocks
360
+ for block_idx, block in enumerate(self.mmdit_blocks):
361
+ emb_x, emb_c = block(emb_x, emb_c, emb_t)
362
+
363
+ # accumulating the feature_similarity loss
364
+ # TODO: add modeling_dit related test
365
+ if hasattr(self, "num_feature_align_layers") and block_idx == self.num_feature_align_layers:
366
+ self.feature_alignment_loss = self.feature_align_mlp(emb_x, guiding_feature) # exclude register tokens
367
+
368
+ # Remove the register tokens at the certain layer (the last layer as default).
369
+ if block_idx == len(self.mmdit_blocks) - (1 + self.skip_register_token_num):
370
+ if hasattr(self, "register_tokens_highres"):
371
+ emb_x = emb_x[
372
+ :, self.register_token_num + self.additional_register_token_num :
373
+ ] # remove the register tokens for the output layer
374
+ elif hasattr(self, "register_tokens"):
375
+ emb_x = emb_x[:, self.register_token_num :] # remove the register tokens for the output layer
376
+
377
+ # 3. final modulation (shift-and-scale)
378
+ scale, shift = self.final_modulation(emb_t).chunk(2, -1) # (N, D) x 2
379
+ scale, shift = scale.unsqueeze(1), shift.unsqueeze(1) # (N, 1, D) x 2
380
+
381
+ if hasattr(self, "final_norm"):
382
+ emb_x = self.final_norm(emb_x)
383
+
384
+ final_emb = (scale + 1) * emb_x + shift
385
+
386
+ # 4. final linear layer to reduce channel and un-patching
387
+ emb_x = self.final_linear_SD3(final_emb) # (N, T, D) to (N, T, out_channels * patch_size**2)
388
+ emb_x = self.patching.unpatchify(emb_x) # (N, out_channels, H, W)
389
+
390
+ if hasattr(self, "depatching_conv_header"):
391
+ emb_x = self.depatching_conv_header(emb_x)
392
+ return emb_x
393
+
394
+
395
+ class JointAttn(nn.Module):
396
+ """
397
+ SD3 style joint-attention layer
398
+ """
399
+
400
+ def __init__(self, config):
401
+ super().__init__()
402
+ self.config = config
403
+ self.hidden_size = config.hidden_dim
404
+ self.num_heads = config.num_attention_heads
405
+ self.head_dim = self.hidden_size // self.num_heads
406
+
407
+ if (self.head_dim * self.num_heads) != self.hidden_size:
408
+ raise ValueError(
409
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
410
+ f" and `num_heads`: {self.num_heads})."
411
+ )
412
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
413
+ self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
414
+ self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
415
+
416
+ self.add_q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
417
+ self.add_k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
418
+ self.add_v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
419
+
420
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
421
+ self.add_o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
422
+
423
+ self.q_norm_x = MotifRMSNorm(self.head_dim) if MotifRMSNorm else RMSNorm(self.head_dim)
424
+ self.k_norm_x = MotifRMSNorm(self.head_dim) if MotifRMSNorm else RMSNorm(self.head_dim)
425
+
426
+ self.q_norm_c = MotifRMSNorm(self.head_dim) if MotifRMSNorm else RMSNorm(self.head_dim)
427
+ self.k_norm_c = MotifRMSNorm(self.head_dim) if MotifRMSNorm else RMSNorm(self.head_dim)
428
+ self.q_scale = nn.Parameter(torch.ones(self.num_heads))
429
+
430
+ # Attention mode : {'sdpa', 'flash', None}
431
+ self.attn_mode = config.attn_mode
432
+
433
+ def forward(
434
+ self,
435
+ hidden_states: torch.FloatTensor,
436
+ encoder_hidden_states: torch.FloatTensor,
437
+ *args,
438
+ **kwargs,
439
+ ) -> torch.FloatTensor:
440
+ residual = hidden_states
441
+
442
+ input_ndim = hidden_states.ndim
443
+ if input_ndim == 4:
444
+ batch_size, channel, height, width = hidden_states.shape
445
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
446
+ context_input_ndim = encoder_hidden_states.ndim
447
+ if context_input_ndim == 4:
448
+ batch_size, channel, height, width = encoder_hidden_states.shape
449
+ encoder_hidden_states = encoder_hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
450
+
451
+ batch_size = encoder_hidden_states.shape[0]
452
+
453
+ # `sample` projections.
454
+ query = self.q_proj(hidden_states)
455
+ key = self.k_proj(hidden_states)
456
+ value = self.v_proj(hidden_states)
457
+
458
+ # `context` projections.
459
+ query_c = self.add_q_proj(encoder_hidden_states)
460
+ key_c = self.add_k_proj(encoder_hidden_states)
461
+ value_c = self.add_v_proj(encoder_hidden_states)
462
+
463
+ # head first
464
+ inner_dim = key.shape[-1]
465
+ head_dim = inner_dim // self.num_heads
466
+
467
+ def norm_qk(x, f_norm):
468
+ x = x.view(batch_size, -1, self.num_heads, head_dim)
469
+ b, l, h, d_h = x.shape
470
+ x = x.reshape(b * l, h, d_h)
471
+ x = f_norm(x)
472
+ return x.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2) # [b, h, l, d_h]
473
+
474
+ query = norm_qk(query, self.q_norm_x) # [b, h, l, d_h]
475
+ key = norm_qk(key, self.k_norm_x) # [b, h, l, d_h]
476
+ value = value.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2) # [b, h, l, d_h]
477
+
478
+ query_c = norm_qk(query_c, self.q_norm_c) * self.q_scale.reshape(1, self.num_heads, 1, 1) # [b, h, l_c, d]
479
+ key_c = norm_qk(key_c, self.k_norm_c) # [b, h, l_c, d]
480
+ value_c = value_c.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2) # [b, h, l_c, d]
481
+
482
+ # attention
483
+ query = torch.cat([query, query_c], dim=2).contiguous() # [b, h, l + l_c, d]
484
+ key = torch.cat([key, key_c], dim=2).contiguous() # [b, h, l + l_c, d]
485
+ value = torch.cat([value, value_c], dim=2).contiguous() # [b, h, l + l_c, d]
486
+
487
+ # deprecated.
488
+ hidden_states = self.joint_attention(batch_size, query, key, value, head_dim)
489
+ hidden_states = hidden_states.to(query.dtype)
490
+
491
+ # Split the attention outputs.
492
+ hidden_states, encoder_hidden_states = (
493
+ hidden_states[:, : residual.shape[1]],
494
+ hidden_states[:, residual.shape[1] :],
495
+ )
496
+
497
+ # linear proj
498
+ hidden_states = self.o_proj(hidden_states)
499
+ encoder_hidden_states = self.add_o_proj(encoder_hidden_states)
500
+
501
+ if input_ndim == 4:
502
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
503
+ if context_input_ndim == 4:
504
+ encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
505
+
506
+ return hidden_states, encoder_hidden_states
507
+
508
+ def joint_attention(self, batch_size, query, key, value, head_dim):
509
+ if self.attn_mode == "sdpa" and ScaledDotProductAttention is not None:
510
+ # NOTE: SDPA does not support high-resolution (long-context).
511
+ q_len = query.size(-2)
512
+ masked_bias = torch.zeros((batch_size, self.num_heads, query.size(-2), key.size(-2)), device="cuda")
513
+
514
+ query = query.transpose(1, 2).reshape(batch_size, q_len, self.hidden_size).contiguous()
515
+ key = key.transpose(1, 2).reshape(batch_size, q_len, self.hidden_size).contiguous()
516
+ value = value.transpose(1, 2).reshape(batch_size, q_len, self.hidden_size).contiguous()
517
+
518
+ scale_factor = 1.0
519
+ scale_factor /= float(self.head_dim) ** 0.5
520
+
521
+ hidden_states = ScaledDotProductAttention(
522
+ query,
523
+ key,
524
+ value,
525
+ masked_bias,
526
+ dropout_rate=0.0,
527
+ training=self.training,
528
+ attn_weight_scale_factor=scale_factor,
529
+ num_kv_groups=1,
530
+ )
531
+ elif self.attn_mode == "flash" and MotifFlashAttention is not None:
532
+ query = query.permute(0, 2, 1, 3).contiguous() # [b, l + l_c, h, d]
533
+ key = key.permute(0, 2, 1, 3).contiguous() # [b, l + l_c, h, d]
534
+ value = value.permute(0, 2, 1, 3).contiguous() # [b, l + l_c, h, d]
535
+ scale_factor = 1.0 / math.sqrt(self.head_dim)
536
+
537
+ # NOTE (1): masking of motif flash-attention uses (`1`: un-mask, `0`: mask) and has [Batch, Seq] shape
538
+ # NOTE (2): Q,K,V must be [Batch, Seq, Heads, Dim] and contiguous.
539
+ mask = torch.ones((batch_size, query.size(-3))).cuda()
540
+ hidden_states = MotifFlashAttention(
541
+ query,
542
+ key,
543
+ value,
544
+ padding_mask=mask,
545
+ softmax_scale=scale_factor,
546
+ causal=False,
547
+ )
548
+ hidden_states = hidden_states.reshape(batch_size, -1, self.num_heads * head_dim).contiguous()
549
+ else:
550
+ hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0)
551
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * head_dim)
552
+
553
+ return hidden_states
554
+
555
+ @staticmethod
556
+ def alt_scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, scale=None) -> torch.Tensor:
557
+ """
558
+ Pure-pytorch version of the xformers.scaled_dot_product_attention
559
+ (or F.scaled_dot_product_attention from torch>2.0.0)
560
+
561
+ Args:
562
+ query (Tensor): query tensor
563
+ key (Tensor): key tensor
564
+ value (Tensor): value tensor
565
+ attn_mask (Tensor, optional): attention mask. Defaults to None.
566
+ dropout_p (float, optional): attention dropout probability. Defaults to 0.0.
567
+ scale (Tensor or float, optional): scaling for QK. Defaults to None.
568
+
569
+ Returns:
570
+ torch.Tensor: attention score (after softmax)
571
+ """
572
+ L, S = query.size(-2), key.size(-2)
573
+ scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
574
+ attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device)
575
+
576
+ if attn_mask is not None:
577
+ if attn_mask.dtype == torch.bool:
578
+ attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
579
+ else:
580
+ attn_bias += attn_mask
581
+
582
+ attn_weight = query @ key.transpose(-2, -1) * scale_factor # B, L, S
583
+ attn_weight += attn_bias
584
+ attn_weight = torch.softmax(attn_weight, dim=-1) # B, L, S
585
+ attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
586
+ return attn_weight @ value # B, L, S * S, D -> B, L, D
587
+
588
+
589
+ # ===============================================
590
+ # Sine/Cosine Positional Embedding Functions
591
+ # ===============================================
592
+ # https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
593
+
594
+
595
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, scale=1.0, base_size=None):
596
+ """
597
+ grid_size: int of the grid height and width
598
+ return:
599
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
600
+ """
601
+ if not isinstance(grid_size, tuple):
602
+ grid_size = (grid_size, grid_size)
603
+
604
+ grid_h = np.arange(grid_size[0], dtype=np.float32) / scale
605
+ grid_w = np.arange(grid_size[1], dtype=np.float32) / scale
606
+ if base_size is not None:
607
+ grid_h *= base_size / grid_size[0]
608
+ grid_w *= base_size / grid_size[1]
609
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
610
+ grid = np.stack(grid, axis=0)
611
+
612
+ grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
613
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
614
+ if cls_token and extra_tokens > 0:
615
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
616
+ return pos_embed
617
+
618
+
619
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
620
+ assert embed_dim % 2 == 0
621
+
622
+ # use half of dimensions to encode grid_h
623
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
624
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
625
+
626
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
627
+ return emb
628
+
629
+
630
+ def get_1d_sincos_pos_embed(embed_dim, length, scale=1.0):
631
+ pos = np.arange(0, length)[..., None] / scale
632
+ return get_1d_sincos_pos_embed_from_grid(embed_dim, pos)
633
+
634
+
635
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
636
+ """
637
+ embed_dim: output dimension for each position
638
+ pos: a list of positions to be encoded: size (M,)
639
+ out: (M, D)
640
+ """
641
+ assert embed_dim % 2 == 0
642
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
643
+ omega /= embed_dim / 2.0
644
+ omega = 1.0 / 10000**omega # (D/2,)
645
+
646
+ pos = pos.reshape(-1) # (M,)
647
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
648
+
649
+ emb_sin = np.sin(out) # (M, D/2)
650
+ emb_cos = np.cos(out) # (M, D/2)
651
+
652
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
653
+ return emb
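
A short shape check for the positional-embedding helpers above, using the mmdit_xlarge_hq.json geometry (grid size and width are illustrative; the import path assumes this module lives at models/modeling_dit.py as added here):

from models.modeling_dit import get_2d_sincos_pos_embed

hidden_dim = 1920
grid = (64, 64)   # (1024 // 8) // patch_size with patch_size = 2, i.e. the latent patch grid
pos = get_2d_sincos_pos_embed(hidden_dim, grid, base_size=64)
print(pos.shape)  # (4096, 1920): one sin/cos embedding per latent patch, matching MotifDiT.pos_embed
assert pos.shape == (grid[0] * grid[1], hidden_dim)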
models/modeling_motif_vision.py ADDED
@@ -0,0 +1,612 @@
1
+ from collections import defaultdict
2
+ from typing import List, Optional, Tuple, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import torchvision.transforms as T
8
+ import tqdm
9
+ from diffusers.models import AutoencoderKL
10
+ from diffusers.utils.torch_utils import randn_tensor
11
+ from loguru import logger
12
+ from PIL import Image, ImageFilter
13
+ from transformers import CLIPTextModel, CLIPTokenizerFast, T5EncoderModel, T5Tokenizer
14
+
15
+ from models.mixin.flow_mixin import FlowMixin
16
+ from models.modeling_dit import MotifDiT
17
+
18
+ TOKEN_MAX_LENGTH: int = 256
19
+ DROP_PROB: float = 0.1
20
+ LATENT_CHANNELS: int = 4
21
+ VAE_DOWNSCALE_FACTOR: int = 8
22
+ SD3_LATENT_CHANNEL: int = 16
23
+
24
+
25
+ def generate_intervals(steps, ratio, start=1.0):
26
+ intervals = torch.linspace(start, 0, steps=steps)
27
+ intervals = intervals.pow(ratio)
28
+ return intervals
29
+
30
+
31
+ class MotifVision(nn.Module, FlowMixin):
32
+ """
33
+ MotifVision Text-to-Image model.
34
+
35
+ This model combines a Diffusion transformer with a rectified flow loss and multiple text encoders.
36
+ It uses a VAE (Variational Autoencoder) for image encoding and decoding.
37
+
38
+ Args:
39
+ config (MMDiTConfig): Configuration object for the MMDiT model.
40
+
41
+ Attributes:
42
+ dit (MotifDiT): MotifDiT model instance.
43
+ noise_scheduler (DDPMScheduler): Noise scheduler for the diffusion process.
44
+ normalize_img (Callable): Function to normalize images from [-1, 1] range.
45
+ unnormalize_img (Callable): Function to unnormalize images to [0, 1] range.
46
+ cond_drop_prob (float): Probability of dropping text embeddings during training.
47
+ snr_gamma (str): Strategy for weighting the loss based on Signal-to-Noise Ratio (SNR).
48
+ loss_weight_strategy (str): Strategy for weighting the loss.
49
+ vae (AutoencoderKL): Variational Autoencoder for image encoding and decoding.
50
+ t5 (T5EncoderModel): T5 encoder model for text encoding.
51
+ t5_tokenizer (T5Tokenizer): T5 tokenizer for text tokenization.
52
+ clip_l (CLIPModel): CLIP (Contrastive Language-Image Pre-training) model (large) for text encoding.
53
+ clip_l_tokenizer (CLIPTokenizerFast): CLIP tokenizer (large) for text tokenization.
54
+ clip_g (CLIPModel): CLIP model (giant) for text encoding.
55
+ clip_g_tokenizer (CLIPTokenizerFast): CLIP tokenizer (giant) for text tokenization.
56
+ tokenizers (List[Union[T5Tokenizer, CLIPTokenizerFast]]): List of tokenizers.
57
+ text_encoders (List[Union[T5EncoderModel, CLIPModel]]): List of text encoder models.
58
+ """
59
+
60
+ def __init__(self, config):
61
+ super().__init__()
62
+ self.config = config
63
+ self.dit = MotifDiT(config)
64
+ self.cond_drop_prob = 0.1
65
+ self.use_weighting = False
66
+ self._get_encoders()
67
+ self._freeze_encoders()
68
+
69
+ def forward(self, images: torch.Tensor, raw_text: List[str]) -> List[torch.Tensor]:
70
+ """
71
+ Forward pass of the MotifVision model.
72
+
73
+ Args:
74
+ images (torch.Tensor): Input images tensor, [0-1] ranged.
75
+ raw_text (List[str]): List of input text strings.
76
+
77
+ Returns:
78
+ List[torch.Tensor]: A single-element list containing the rectified flow matching loss.
79
+ """
80
+ # 1. Encode images and texts
81
+ with torch.no_grad():
82
+ latents = self.vae.encode(images).latent_dist.sample() * self.vae.config.scaling_factor
83
+ tokens, masks = self.tokenization(raw_text)
84
+ tokens = [token.to(latents.device) for token in tokens]
85
+ masks = [mask.to(latents.device) for mask in masks]
86
+ text_embeddings, pooled_text_embeddings = self.text_encoding(tokens, masks)
87
+ text_embeddings = self._drop_text_emb(text_embeddings)
88
+ text_embeddings = [text_embedding.float() for text_embedding in text_embeddings]
89
+ pooled_text_embeddings = pooled_text_embeddings.float()
90
+
91
+ # 2. Get noisy input via the rectified flow
92
+ is_finetuning = self.config.height > 256
93
+ noise, noise_latents, t = self.get_noisy_input(latents, is_finetuning=is_finetuning)
94
+
95
+ timesteps = self.discritize_timestep(t, self.n_timesteps)
96
+
97
+ # 3. Forward pass through the dit
98
+ preds = self.dit(noise_latents, timesteps, text_embeddings, pooled_text_embeddings)
99
+
100
+ # 4. Rectified flow matching loss
101
+ loss = self.rectified_flow_loss(latents, noise, t, preds, use_weighting=self.use_weighting)
102
+
103
+ return [loss]
104
+
105
+ def _get_encoders(self) -> None:
106
+ """Initialize the VAE and text encoders."""
107
+ if self.config.vae_type == "SD3":
108
+ self.vae = AutoencoderKL.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", subfolder="vae")
109
+ elif self.config.vae_type == "SDXL":
110
+ self.vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae")
111
+ else:
112
+ raise ValueError(f"VAE type must be `SD3` or `SDXL` but self.config.vae_type is {self.config.vae_type}")
113
+
114
+ # Text encoders
115
+ # 1. T5-XXL from Google
116
+ self.t5 = T5EncoderModel.from_pretrained("google/flan-t5-xxl").to(dtype=torch.bfloat16)
117
+ self.t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xxl")
118
+
119
+ # 2. CLIP-L from OpenAI
120
+ self.clip_l = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(dtype=torch.bfloat16)
121
+ self.clip_l_tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
122
+
123
+ # 3. CLIP-G from LAION
124
+ self.clip_g = CLIPTextModel.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k").to(dtype=torch.bfloat16)
125
+ self.clip_g_tokenizer = CLIPTokenizerFast.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
126
+
127
+ self.tokenizers = [self.t5_tokenizer, self.clip_l_tokenizer, self.clip_g_tokenizer]
128
+ self.text_encoders = [self.t5, self.clip_l, self.clip_g]
129
+
130
+ def state_dict(self, destination=None, prefix="", keep_vars=False):
131
+ state_dict = super(MotifVision, self).state_dict(destination, prefix, keep_vars)
132
+ exclude_keys = ["t5.", "clip_l.", "clip_g.", "vae."]
133
+ for key in list(state_dict.keys()):
134
+ if any(key.startswith(exclude_key) for exclude_key in exclude_keys):
135
+ state_dict.pop(key)
136
+ return state_dict
137
+
138
+ def load_state_dict(self, state_dict, strict=False):
139
+ """
140
+ Load state dict and merge LoRA parameters if present.
141
+
142
+ Args:
143
+ state_dict (dict): State dictionary containing model parameters
144
+ strict (bool): Whether to strictly enforce that the keys in state_dict match the keys in this module
145
+
146
+ Returns:
147
+ tuple: (missing_keys, unexpected_keys) lists of parameters that were missing or unexpected
148
+ """
149
+ # Check if state_dict contains LoRA parameters
150
+ has_lora = any("lora_" in key for key in state_dict.keys())
151
+
152
+ if has_lora:
153
+ # If model doesn't have LoRA enabled but state_dict has LoRA params, enable it
154
+ if not hasattr(self.dit, "peft_config"):
155
+ logger.info("Enabling LoRA for parameter merging...")
156
+ # Use default values if not already configured
157
+ lora_rank = getattr(self.config, "lora_rank", 64)
158
+ lora_alpha = getattr(self.config, "lora_alpha", 8)
159
+ self.enable_lora(lora_rank, lora_alpha)
160
+
161
+ if has_lora:
162
+ try:
163
+ # Load LoRA parameters
164
+ # state_dict = {
165
+ # k.replace("base_layer.", ""): v
166
+ # for k, v in state_dict.items()
167
+ # if "lora_" not in k and "lora" not in k
168
+ # }
169
+ missing_keys, unexpected_keys = super().load_state_dict(state_dict, strict=False)
170
+ # Merge LoRA weights with base model
171
+ logger.info("Merging LoRA parameters with base model...")
172
+ for name, module in self.dit.named_modules():
173
+ if hasattr(module, "merge_and_unload"):
174
+ module.merge_and_unload()
175
+
176
+ logger.info("Successfully merged LoRA parameters")
177
+
178
+ except Exception as e:
179
+ logger.error(f"Error merging LoRA parameters: {str(e)}")
180
+ raise
181
+ else:
182
+ missing_keys, unexpected_keys = super().load_state_dict(state_dict, strict=False)
183
+
184
+ # Log summary of missing/unexpected parameters
185
+ missing_top_levels = set()
186
+ for key in missing_keys:
187
+ top_level_name = key.split(".")[0]
188
+ missing_top_levels.add(top_level_name)
189
+ if missing_top_levels:
190
+ logger.debug("Missing keys during loading at top level:")
191
+ for name in missing_top_levels:
192
+ logger.debug(name)
193
+
194
+ if unexpected_keys:
195
+ logger.debug("Unexpected keys found:")
196
+ for key in unexpected_keys:
197
+ logger.debug(key)
198
+
199
+ return missing_keys, unexpected_keys
200
+
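+ # Minimal usage sketch (hypothetical path and variable names): loading a
+ # checkpoint that contains `lora_` keys will transparently enable LoRA on the
+ # DiT, load the adapters, and merge them into the base weights:
+ #
+ #     state = torch.load("checkpoints/motif_vision_dpo.pt", map_location="cpu")
+ #     missing, unexpected = model.load_state_dict(state, strict=False)
+ #
+ # Afterwards the merged model can be saved via `model.state_dict()`, which
+ # excludes the frozen VAE / text-encoder weights.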
201
+ def _freeze_encoders(self) -> None:
202
+ """
203
+ Freeze the VAE and all text encoders (their parameters receive no gradients).
204
+ """
205
+ for encoder_module in [self.vae, self.clip_l, self.clip_g, self.t5]:
206
+ for param in encoder_module.parameters():
207
+ param.requires_grad = False
208
+
209
+ def tokenization(
210
+ self, raw_texts: List[str], repeat_if_short: bool = False
211
+ ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
212
+ """
213
+ Tokenizes a BATCH of input texts using multiple tokenizers efficiently.
214
+ Optionally repeats each text to fill the max length if it's shorter,
215
+ BEFORE passing the pre-processed batch to the tokenizer.
216
+
217
+ Args:
218
+ raw_texts (List[str]): A list of input text strings (the batch).
219
+ repeat_if_short (bool): If True and a text is short, repeat that text
220
+ to fill the context length. Defaults to False.
221
+
222
+ Returns:
223
+ Tuple[List[torch.Tensor], List[torch.Tensor]]:
224
+ - A list containing one batch tensor of input IDs per tokenizer.
225
+ Each tensor shape: [batch_size, max_length]
226
+ - A list containing one batch tensor of attention masks per tokenizer.
227
+ Each tensor shape: [batch_size, max_length]
228
+ """
229
+ final_batch_tokens = []
230
+ final_batch_masks = []
231
+
232
+ # Process the batch with each tokenizer
233
+ for tokenizer in self.tokenizers:
234
+ effective_max_length = min(TOKEN_MAX_LENGTH, tokenizer.model_max_length)
235
+
236
+ # 1. Pre-process the batch: Create a new list of potentially repeated strings.
237
+ processed_texts_for_tokenizer = []
238
+ for text_item in raw_texts:
239
+ # Start with the original text for this item
240
+ processed_text = text_item
241
+
242
+ if repeat_if_short:
243
+ # Apply repetition logic individually based on text_item's length
244
+ num_initial_tokens = len(text_item.split())
245
+ available_length = effective_max_length - 2 # Heuristic: leave room for special tokens (e.g. BOS/EOS)
246
+
247
+ if num_initial_tokens > 0 and num_initial_tokens < available_length:
248
+ num_additional_repeats = available_length // (num_initial_tokens + 1)
249
+ if num_additional_repeats > 0:
250
+ total_repeats = 1 + num_additional_repeats
251
+ processed_text = " ".join([text_item] * total_repeats)
252
+
253
+ # Add the processed text (original or repeated) to the list for this tokenizer
254
+ processed_texts_for_tokenizer.append(processed_text)
255
+
256
+ # 2. Tokenize the entire batch of processed texts at once.
257
+ # Pass the list `processed_texts_for_tokenizer` directly to the tokenizer.
258
+ # The tokenizer's __call__ method should handle the batch efficiently.
259
+ batch_tok_output = tokenizer( # Call the tokenizer ONCE with the full list
260
+ processed_texts_for_tokenizer,
261
+ padding="max_length",
262
+ max_length=effective_max_length,
263
+ return_tensors="pt",
264
+ truncation=True,
265
+ )
266
+
267
+ # 3. Store the resulting batch tensors directly.
268
+ # The tokenizer should return tensors with shape [batch_size, max_length].
269
+ final_batch_tokens.append(batch_tok_output.input_ids)
270
+ final_batch_masks.append(batch_tok_output.attention_mask)
271
+
272
+ return final_batch_tokens, final_batch_masks
273
+
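+ # Worked example of the repeat-if-short heuristic above: with a CLIP tokenizer
+ # (model_max_length = 77) and TOKEN_MAX_LENGTH >= 77, effective_max_length = 77
+ # and available_length = 75. For the 4-word prompt "a cat wearing sunglasses":
+ #     num_initial_tokens     = 4
+ #     num_additional_repeats = 75 // (4 + 1) = 15
+ #     total_repeats          = 16
+ # so the tokenizer receives the prompt repeated 16 times and truncates back to
+ # 77 tokens. Note the count is whitespace-based, not tokenizer-based.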
274
+ @torch.no_grad()
275
+ def text_encoding(
276
+ self, tokens: List[torch.Tensor], masks, noisy_pad=False, zero_masking=True
277
+ ) -> Tuple[List[torch.Tensor], torch.Tensor]:
278
+ """
279
+ Encode the tokenized text using multiple text encoders.
280
+
281
+ Args:
282
+ tokens (List[torch.Tensor]): List of tokenized input-ID tensors, one per tokenizer (T5, CLIP-L, CLIP-G).
+ masks (List[torch.Tensor]): List of attention-mask tensors matching `tokens`.
+ noisy_pad (bool): If True, add small Gaussian noise to padded / post-EOS positions.
+ zero_masking (bool): If True, zero out embeddings at padded / post-EOS positions.
283
+
284
+ Returns:
285
+ Tuple[List[torch.Tensor], torch.Tensor]: Tuple containing a list of text embeddings and pooled text embeddings.
286
+ """
287
+ t5_tokens, clip_l_tokens, clip_g_tokens = tokens
288
+ t5_masks, clip_l_masks, clip_g_masks = masks
289
+ t5_emb = self.t5(t5_tokens, attention_mask=t5_masks)[0]
290
+ if zero_masking:
291
+ t5_emb = t5_emb * (t5_tokens != self.t5_tokenizer.pad_token_id).unsqueeze(-1)
292
+ if noisy_pad:
293
+ t5_pad_noise = (
294
+ (t5_tokens == self.t5_tokenizer.pad_token_id).unsqueeze(-1) * torch.randn_like(t5_emb).cuda() * 0.008
295
+ )
296
+ t5_emb = t5_emb + t5_pad_noise
297
+
298
+ clip_l_emb = self.clip_l(input_ids=clip_l_tokens, output_hidden_states=True)
299
+ clip_g_emb = self.clip_g(input_ids=clip_g_tokens, output_hidden_states=True)
300
+ clip_l_emb_pooled = clip_l_emb.pooler_output # B x 768
301
+ clip_g_emb_pooled = clip_g_emb.pooler_output # B x 1280
302
+
303
+ clip_l_emb = clip_l_emb.last_hidden_state # B x L x 768,
304
+ clip_g_emb = clip_g_emb.last_hidden_state # B x L x 1280,
305
+
306
+ def masking_wo_first_eos(token, eos):
307
+ idx = (token != eos).sum(dim=1)
308
+ mask = token != eos
309
+ arange = torch.arange(mask.size(0)).cuda()
310
+ mask[arange, idx] = True
311
+ mask = mask.unsqueeze(-1) # B x L x 1
312
+ return mask
313
+
314
+ if zero_masking:
315
+ clip_l_emb = clip_l_emb * masking_wo_first_eos(
316
+ clip_l_tokens, self.clip_l_tokenizer.eos_token_id
317
+ ) # B x L x 768,
318
+ clip_g_emb = clip_g_emb * masking_wo_first_eos(
319
+ clip_g_tokens, self.clip_g_tokenizer.eos_token_id
320
+ ) # B x L x 1280,
321
+
322
+ if noisy_pad:
323
+ clip_l_pad_noise = (
324
+ ~masking_wo_first_eos(clip_l_tokens, self.clip_l_tokenizer.eos_token_id)
325
+ * torch.randn_like(clip_l_emb).cuda()
326
+ * 0.08
327
+ )
328
+ clip_g_pad_noise = (
329
+ ~masking_wo_first_eos(clip_g_tokens, self.clip_g_tokenizer.eos_token_id)
330
+ * torch.randn_like(clip_g_emb).cuda()
331
+ * 0.08
332
+ )
333
+ clip_l_emb = clip_l_emb + clip_l_pad_noise
334
+ clip_g_emb = clip_g_emb + clip_g_pad_noise
335
+
336
+ encodings = [t5_emb, clip_l_emb, clip_g_emb]
337
+ pooled_encodings = torch.cat([clip_l_emb_pooled, clip_g_emb_pooled], dim=-1) # cat by channel, B x 2048
338
+
339
+ return encodings, pooled_encodings
340
+
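+ # Worked example of `masking_wo_first_eos` above (toy token IDs, CLIP
+ # eos_token_id = 49407): for a padded row
+ #     token = [[101, 202, 49407, 49407, 49407]]
+ # the non-EOS count is idx = 2, so the mask keeps positions 0 and 1 plus the
+ # first EOS at index 2, and zeroes the trailing padding EOS tokens:
+ #     mask  = [[True, True, True, False, False]]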
341
+ @torch.no_grad()
342
+ def prompt_embedding(self, prompts: List[str], device, noisy_pad=False, zero_masking=True):
343
+ tokens, masks = self.tokenization(prompts)
344
+ tokens = [token.to(device) for token in tokens]
345
+ masks = [mask.to(device) for mask in masks]
346
+ text_embeddings, pooled_text_embeddings = self.text_encoding(
347
+ tokens, masks, noisy_pad=noisy_pad, zero_masking=zero_masking
348
+ )
349
+ text_embeddings = [text_embedding.bfloat16() for text_embedding in text_embeddings]
350
+ pooled_text_embeddings = pooled_text_embeddings.bfloat16()
351
+ return text_embeddings, pooled_text_embeddings
352
+
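+ # Example usage (illustrative; `model` is a constructed MotifVision instance):
+ #     embs, pooled = model.prompt_embedding(["a red bicycle"], device="cuda")
+ #     # embs: bfloat16 sequence embeddings (T5, CLIP-L, CLIP-G); pooled: bfloat16 [B, 2048]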
353
+ @torch.no_grad()
354
+ def sample(
355
+ self,
356
+ raw_text: List[str],
357
+ steps: int = 50,
358
+ guidance_scale: float = 7.5,
359
+ resolution: List[int] = (256, 256),
360
+ pre_latent=None,
361
+ pre_timestep=None,
362
+ step_scaling=1.0,
363
+ noisy_pad=False,
364
+ zero_masking=False,
365
+ negative_prompt: Optional[List[str]] = None,
366
+ device: str = "cuda",
367
+ rescale_cfg=-1.0,
368
+ clip_t=[0.0, 1.0],
369
+ use_linear_quadratic_schedule=False,
370
+ linear_quadratic_emulating_steps=250,
371
+ prompt_rewriter=None,
372
+ moderator=None,
373
+ get_intermediate_steps: bool = False,
374
+ ) -> Union[List[Image.Image], Tuple[List[Image.Image], List[List[Image.Image]]]]:
375
+ """
376
+ Sample images using flow matching. Optionally returns intermediate step images
377
+ calculated via the observed-average-velocity method.
378
+
379
+ Args:
380
+ raw_text (List[str]): raw text prompts
381
+ steps (int, optional): number of function evaluations for the flow-matching ODE. Defaults to 50.
382
+ guidance_scale (float, optional): classifier free guidance scale. Defaults to 7.5.
383
+ resolution (List[int], optional): input and output resolution of raw images. Defaults to (256, 256).
384
+ device (str, optional): Defaults to 'cuda'.
385
+ pre_latent (Tensor, optional): the optional input to generate image with pre-defined latents.
386
+ for instance, it can be used for denoising or image editing.
387
+ pre_timestep (float [0,1], optional): the pre-defined timestep. With `pre_latent`, image generation
388
+ can start from that intermediate timestep.
389
+ step_scaling (float, optional): scaling factor applied to each ODE step. Defaults to 1.0.
390
+ use_linear_quadratic_schedule (bool, optional): if True, use the linear-quadratic t schedule; otherwise use a linear t schedule. Defaults to False.
391
+ linear_quadratic_emulating_steps (int, optional): N value in the linear-quadratic t schedule from the Meta MovieGen paper. Defaults to 250.
392
+ Reference: (https://ai.meta.com/static-resource/movie-gen-research-paper) Figure 10
393
+ get_intermediate_steps (bool, optional): Whether to calculate and return intermediate step images.
394
+ Calculation is based on initial_noise - avg(velocity). Defaults to False.
395
+
396
+ Returns:
397
+ Union[List[PIL.Image], Tuple[List[PIL.Image], List[List[PIL.Image]]]]:
398
+ If get_intermediate_steps is False: Returns a list of final PIL images.
399
+ If get_intermediate_steps is True: Returns a tuple containing:
400
+ - List[PIL.Image]: Final output PIL images.
401
+ - List[List[PIL.Image]]: List of intermediate PIL images. Each inner list contains
402
+ the batch of images for one intermediate step.
403
+ """
404
+ if prompt_rewriter:
405
+ prompts = [prompt_rewriter.rewrite(prompt) for prompt in raw_text]
406
+ else:
407
+ prompts = raw_text
408
+
409
+ # Simplified check for rewriter status
410
+ if prompts == raw_text and prompt_rewriter is not None:
411
+ logger.debug("Prompt rewriter did not change the prompts.")
412
+ elif prompt_rewriter is None:
413
+ logger.debug("Prompt rewriter not provided.")
414
+
415
+ if moderator is None:
416
+ is_safe_prompt = [True for _ in prompts]
417
+ else:
418
+ is_safe_prompt = [moderator.is_safe_content(prompt, threshold=0.7) for prompt in prompts]
419
+ if not all(is_safe_prompt):
420
+ logger.warning("Noxious prompt detected. Output image(s) will be blurred.")
421
+
422
+ b = len(prompts)
423
+ h, w = resolution
424
+
425
+ # --- [Initial Latent Noise (e = x_1)] ---
426
+ latent_channels = 16
427
+ if pre_latent is None:
428
+ initial_noise = randn_tensor( # Store initial noise separately
429
+ (b, latent_channels, h // VAE_DOWNSCALE_FACTOR, w // VAE_DOWNSCALE_FACTOR),
430
+ device=device,
431
+ dtype=torch.float32, # Use float32 for calculations
432
+ )
433
+ else:
434
+ initial_noise = pre_latent.to(device=device, dtype=torch.float32)
435
+ if pre_timestep is not None and pre_timestep < 1.0: # Check if it's truly intermediate
436
+ logger.warning(
437
+ "Using pre_latent as initial_noise for average calculation, but pre_timestep suggests it's not pure noise. Results might be unexpected."
438
+ )
439
+
440
+ latents = initial_noise.clone() # Working latents for the ODE solver
441
+
442
+ # --- [Text Embeddings & CFG Setup] ---
443
+ text_embeddings, pooled_text_embeddings = self.prompt_embedding(
444
+ prompts, latents.device, noisy_pad=noisy_pad, zero_masking=zero_masking
445
+ )
446
+ text_embeddings = [emb.to(device=latents.device, dtype=torch.bfloat16) for emb in text_embeddings]
447
+ pooled_text_embeddings = pooled_text_embeddings.to(device=latents.device, dtype=torch.bfloat16)
448
+
449
+ do_classifier_free_guidance = guidance_scale > 1.0
450
+ if do_classifier_free_guidance:
451
+ negative_text_embeddings = [
452
+ torch.zeros_like(text_embedding, device=text_embedding.device) for text_embedding in text_embeddings
453
+ ]
454
+ negative_pooled_text_embeddings = torch.zeros_like(
455
+ pooled_text_embeddings, device=pooled_text_embeddings.device
456
+ )
457
+ text_embeddings = [
458
+ torch.cat([text_embedding, negative_text_embedding], dim=0)
459
+ for text_embedding, negative_text_embedding in zip(text_embeddings, negative_text_embeddings)
460
+ ]
461
+ pooled_text_embeddings = torch.cat([pooled_text_embeddings, negative_pooled_text_embeddings], dim=0)
462
+
463
+ # if negative_prompt is None:
464
+ # negative_prompt = [""] * b
465
+ # logger.debug("No negative prompt provided, using empty strings for CFG.")
466
+ # negative_text_embeddings, negative_pooled_text_embeddings = self.prompt_embedding(negative_prompt, latents.device)
467
+ # negative_text_embeddings = [emb.to(device=latents.device, dtype=torch.bfloat16) for emb in negative_text_embeddings]
468
+ # negative_pooled_text_embeddings = negative_pooled_text_embeddings.to(device=latents.device, dtype=torch.bfloat16)
469
+
470
+ # text_embeddings = [torch.cat([pos_emb, neg_emb], dim=0) for pos_emb, neg_emb in zip(text_embeddings, negative_text_embeddings)]
471
+ # pooled_text_embeddings = torch.cat([pooled_text_embeddings, negative_pooled_text_embeddings], dim=0)
472
+
473
+ # --- [Timestep Schedule (Sigmas)] ---
474
+ # linear t schedule
475
+ sigmas = torch.linspace(1, 0, steps + 1) if pre_timestep is None else torch.linspace(pre_timestep, 0, steps + 1)
476
+
477
+ if use_linear_quadratic_schedule:
478
+ # linear-quadratic t schedule
479
+ assert steps % 2 == 0
480
+ N = linear_quadratic_emulating_steps
481
+ sigmas = torch.concat(
482
+ [
483
+ torch.linspace(1, 0, N + 1)[: steps // 2],
484
+ torch.linspace(0, 1, steps // 2 + 1) ** 2 * (steps // 2 * 1 / N - 1) - (steps // 2 * 1 / N - 1),
485
+ ]
486
+ )
487
+
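+ # Worked example of the linear-quadratic schedule above (steps = 50,
+ # N = linear_quadratic_emulating_steps = 250): the first steps // 2 = 25 sigmas
+ # follow the fine-grained 250-step linear schedule (1.000, 0.996, ..., 0.904),
+ # and the remaining 26 values form the quadratic tail 0.9 * (1 - s^2) for
+ # s in linspace(0, 1, 26), decaying from 0.900 to 0. This emulates the small
+ # early steps of a 250-step solver without paying for 250 model evaluations.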
488
+ # --- [Initialization for Intermediate Step Calculation] ---
489
+ # intermediate_latents will store the latent states for intermediate steps
490
+ intermediate_latents = [] if get_intermediate_steps else None
491
+ predicted_velocities = [] # Store dx from each step
492
+ sigma_history = []
493
+ # --- [Sampling Loop] ---
494
+ for infer_step, t in tqdm.tqdm(enumerate(sigmas[:-1]), total=len(sigmas[:-1]), desc="Sampling"):
495
+ # Prepare input for DiT model
496
+ if do_classifier_free_guidance:
497
+ input_latents = torch.cat([latents] * 2, dim=0)
498
+ else:
499
+ input_latents = latents
500
+
501
+ # Prepare timestep input
502
+ timestep = (t * 1000).round().long().to(latents.device)
503
+ timestep = timestep.expand(input_latents.shape[0]).to(torch.bfloat16) # Ensure timestep is bfloat16
504
+
505
+ # Predict velocity dx = v(x_t, t) ≈ e - x_0
506
+ dx = self.dit(input_latents.to(torch.bfloat16), timestep, text_embeddings, pooled_text_embeddings)
507
+ dt = sigmas[infer_step + 1] - sigmas[infer_step] # dt is negative
508
+ sigma_history.append(dt)
509
+
510
+ # Apply Classifier-Free Guidance
511
+ if do_classifier_free_guidance:
512
+ cond_dx, uncond_dx = dx.chunk(2)
513
+ current_guidance_scale = guidance_scale if clip_t[0] <= t and t <= clip_t[1] else 1.0
514
+ dx = uncond_dx + current_guidance_scale * (cond_dx - uncond_dx)
515
+
516
+ if rescale_cfg > 0.0:
517
+ std_pos = torch.std(cond_dx, dim=[1, 2, 3], keepdim=True, unbiased=False) + 1e-5
518
+ std_cfg = torch.std(dx, dim=[1, 2, 3], keepdim=True, unbiased=False) + 1e-5
519
+ factor = std_pos / std_cfg
520
+ factor = rescale_cfg * factor + (1.0 - rescale_cfg)
521
+ dx = dx * factor
522
+
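+ # The rescaling above matches the "guidance rescale" technique (Lin et al.,
+ # "Common Diffusion Noise Schedules and Sample Steps are Flawed"): the CFG
+ # output is scaled so its per-sample std matches the conditional branch, then
+ # blended with the unrescaled output:
+ #     dx_rescaled = dx * (std(cond_dx) / std(dx))
+ #     dx          = rescale_cfg * dx_rescaled + (1 - rescale_cfg) * dx
+ # which reduces the over-saturation that large guidance scales can cause.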
523
+ # --- Store the predicted velocity for averaging ---
524
+ predicted_velocities.append(dx.clone())
525
+
526
+ # --- Update Latents using standard Euler step ---
527
+ latents = latents + dt * dx
528
+
529
+ # --- Calculate and Store Intermediate Latent State (if requested) ---
530
+ if get_intermediate_steps:
531
+ dxs = torch.stack(predicted_velocities)
532
+
533
+ sigma_sum = sum(sigma_history)
534
+ normalized_sigma_history = [s / (sigma_sum) for s in sigma_history]
535
+ dts = torch.tensor(normalized_sigma_history, device=dxs.device, dtype=dxs.dtype).view(-1, 1, 1, 1, 1)
536
+
537
+ avg_dx = torch.sum(dxs * dts, dim=0)
538
+ observed_state = initial_noise - avg_dx # Calculate the desired intermediate state
539
+ intermediate_latents.append(observed_state.clone()) # Store its latent representation
540
+
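+ # What the preview above computes: `avg_dx` is the step-size-weighted average
+ # of all velocities predicted so far, and
+ #     observed_state = initial_noise - avg_dx
+ # is a single jump from the pure-noise latent along that averaged direction,
+ # i.e. (assuming the velocity convention v ≈ noise - data) a running estimate
+ # of the clean latent, which tends to give cleaner previews than decoding the
+ # current x_t directly.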
541
+ # --- [Decode Final Latents to PIL Images] ---
542
+ self.vae = self.vae.to(device=latents.device, dtype=torch.float32) # Ensure VAE is ready
543
+ final_latents_scaled = latents.to(torch.float32) / self.vae.config.scaling_factor
544
+ final_image_tensors = self.vae.decode(final_latents_scaled, return_dict=False)[0] + self.vae.config.shift_factor
545
+ final_image_tensors = ((final_image_tensors + 1.0) / 2.0).clamp(0.0, 1.0)
546
+
547
+ final_pil_images = []
548
+ for i, image_tensor in enumerate(final_image_tensors):
549
+ img = T.ToPILImage()(image_tensor.cpu())
550
+ if not is_safe_prompt[i]:
551
+ img = img.filter(ImageFilter.GaussianBlur(radius=30))
552
+ final_pil_images.append(img)
553
+
554
+ # --- [Decode Intermediate Latents to PIL Images (if requested)] ---
555
+ if get_intermediate_steps:
556
+ intermediate_pil_images = []
557
+ # Ensure VAE is still ready (it should be from final decoding)
558
+ for step_latents in tqdm.tqdm(intermediate_latents, desc="Decoding intermediates"):
559
+ step_latents_scaled = (
560
+ step_latents.to(dtype=torch.float32, device="cuda") / self.vae.config.scaling_factor
561
+ )
562
+ step_image_tensors = (
563
+ self.vae.decode(step_latents_scaled, return_dict=False)[0] + self.vae.config.shift_factor
564
+ )
565
+ step_image_tensors = ((step_image_tensors + 1.0) / 2.0).clamp(0.0, 1.0)
566
+
567
+ current_step_pil = []
568
+ for i, image_tensor in enumerate(step_image_tensors):
569
+ img = T.ToPILImage()(image_tensor.cpu())
570
+ # Apply moderation blur consistency
571
+ if not is_safe_prompt[i]:
572
+ img = img.filter(ImageFilter.GaussianBlur(radius=30))
573
+ current_step_pil.append(img)
574
+ intermediate_pil_images.append(current_step_pil) # Append list of images for this step
575
+
576
+ return final_pil_images, intermediate_pil_images # Return both final and intermediate images
577
+ else:
578
+ return final_pil_images # Return only final images
579
+
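+ # Minimal sampling sketch (hypothetical setup; config construction and
+ # checkpoint loading are outside this file):
+ #
+ #     model = MotifVision(config).cuda().eval()
+ #     images = model.sample(
+ #         ["a watercolor painting of a lighthouse at dusk"],
+ #         steps=50,
+ #         guidance_scale=7.5,
+ #         resolution=(1024, 1024),
+ #     )
+ #     images[0].save("lighthouse.png")
+ #
+ #     # With get_intermediate_steps=True the call also returns per-step previews:
+ #     finals, intermediates = model.sample(["a red bicycle"], get_intermediate_steps=True)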
580
+ @torch.no_grad()
581
+ def eval_with_loss(self, images, raw_text):
582
+ latents = self.vae.encode(images).latent_dist.sample() * self.vae.config.scaling_factor
583
+
584
+ tokens, masks = self.tokenization(raw_text)
585
+ tokens = [token.to(latents.device) for token in tokens]
586
+ masks = [mask.to(latents.device) for mask in masks]
587
+ text_embeddings, pooled_text_embeddings = self.text_encoding(tokens, masks)
588
+ text_embeddings = [text_embedding.float() for text_embedding in text_embeddings]
589
+ pooled_text_embeddings = pooled_text_embeddings.float()
590
+
591
+ # 2. Get noisy input via the rectified flow
592
+ is_finetuning = self.config.height > 256
593
+ noise, noise_latents, t = self.get_noisy_input(latents, is_finetuning=is_finetuning)
594
+ timesteps = self.discritize_timestep(t, self.n_timesteps)
595
+
596
+ # 3. Forward pass through the dit
597
+ preds = self.dit(noise_latents, timesteps, text_embeddings, pooled_text_embeddings)
598
+
599
+ # 4. Rectified flow matching loss
600
+ loss = self.rectified_flow_loss(latents, noise, t, preds, reduce="none", use_weighting=False).mean(
601
+ dim=[1, 2, 3]
602
+ )
603
+
604
+ intervals = np.linspace(0, 1, 9)
605
+ t_interval = [(intervals[i], intervals[i + 1]) for i in range(len(intervals) - 1)]
606
+
607
+ loss_bins = defaultdict(list)
608
+ for i, interval in enumerate(t_interval, 0):
609
+ idx = (interval[0] < t) & (t < interval[1])
610
+ loss_bins[i].append(loss[idx])
611
+
612
+ return loss_bins
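+ # The returned `loss_bins` maps bin index 0..7 to the per-sample losses whose
+ # sampled t fell into the corresponding interval of np.linspace(0, 1, 9), i.e.
+ # (0.000, 0.125), (0.125, 0.250), ..., (0.875, 1.000) with strict inequalities,
+ # making it easy to see how the flow-matching loss varies across noise levels.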