|
from collections import defaultdict |
|
from typing import List, Optional, Tuple, Union |
|
|
|
import numpy as np |
|
import torch |
|
import torch.nn as nn |
|
import torchvision.transforms as T |
|
import tqdm |
|
from diffusers.models import AutoencoderKL |
|
from diffusers.utils.torch_utils import randn_tensor |
|
from loguru import logger |
|
from PIL import Image, ImageFilter |
|
from transformers import CLIPTextModel, CLIPTokenizerFast, T5EncoderModel, T5Tokenizer |
|
|
|
from models.mixin.flow_mixin import FlowMixin |
|
from models.modeling_dit import MotifDiT |
|
|
|
TOKEN_MAX_LENGTH: int = 256 |
|
DROP_PROB: float = 0.1 |
|
LATENT_CHANNELS: int = 4 |
|
VAE_DOWNSCALE_FACTOR: int = 8 |
|
SD3_LATENT_CHANNEL: int = 16 |
|
|
|
|
|
def generate_intervals(steps: int, ratio: float, start: float = 1.0) -> torch.Tensor:
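# Generates a monotonically decreasing schedule of `steps` values from `start` to 0,
# raised elementwise to `ratio`: with start=1, a ratio > 1 packs values more densely
# near 0, while a ratio < 1 packs them near 1.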
|
intervals = torch.linspace(start, 0, steps=steps) |
|
intervals = intervals.pow(ratio) |
|
return intervals |
|
|
|
|
|
class MotifImage(nn.Module, FlowMixin): |
|
""" |
|
MotifImage Text-to-Image model. |
|
|
|
This model combines a Diffusion transformer with a rectified flow loss and multiple text encoders. |
|
It uses a VAE (Variational Autoencoder) for image encoding and decoding. |
|
|
|
Args: |
|
config (MMDiTConfig): Configuration object for the MMDiT model. |
|
|
|
Attributes: |
|
dit (MotifDiT): MotifDiT model instance. |
|
noise_scheduler (DDPMScheduler): Noise scheduler for the diffusion process. |
|
normalize_img (Callable): Function to normalize images into the [-1, 1] range.
|
unnormalize_img (Callable): Function to unnormalize images to [0, 1] range. |
|
cond_drop_prob (float): Probability of dropping text embeddings during training. |
|
snr_gamma (str): Strategy for weighting the loss based on Signal-to-Noise Ratio (SNR). |
|
loss_weight_strategy (str): Strategy for weighting the loss. |
|
vae (AutoencoderKL): Variational Autoencoder for image encoding and decoding. |
|
t5 (T5EncoderModel): T5 encoder model for text encoding. |
|
t5_tokenizer (T5Tokenizer): T5 tokenizer for text tokenization. |
|
clip_l (CLIPTextModel): CLIP (Contrastive Language-Image Pre-training) text encoder (ViT-L/14).
|
clip_l_tokenizer (CLIPTokenizerFast): CLIP tokenizer (large) for text tokenization. |
|
clip_g (CLIPTextModel): CLIP text encoder (ViT-bigG/14) for text encoding.
|
clip_g_tokenizer (CLIPTokenizerFast): CLIP tokenizer (bigG) for text tokenization.
|
tokenizers (List[Union[T5Tokenizer, CLIPTokenizerFast]]): List of tokenizers. |
|
text_encoders (List[Union[T5EncoderModel, CLIPTextModel]]): List of text encoder models.
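
    Example (illustrative sketch; assumes a populated `config` for MotifDiT and a CUDA
    device with the pretrained VAE/text-encoder weights available):

        >>> model = MotifImage(config).cuda().eval()
        >>> images = model.sample(["a watercolor painting of a lighthouse"],
        ...                       steps=50, guidance_scale=7.5, resolution=(256, 256))
        >>> images[0].save("lighthouse.png")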
|
""" |
|
|
|
def __init__(self, config): |
|
super().__init__() |
|
self.config = config |
|
self.dit = MotifDiT(config) |
|
self.cond_drop_prob = DROP_PROB
|
self.use_weighting = False |
|
self._get_encoders() |
|
self._freeze_encoders() |
|
|
|
def forward(self, images: torch.Tensor, raw_text: List[str]) -> List[torch.Tensor]:
|
""" |
|
Forward pass of the MotifDiT model. |
|
|
|
Args: |
|
images (torch.Tensor): Input images tensor, [0-1] ranged. |
|
raw_text (List[str]): Batch of input text prompts.
|
|
|
Returns: |
|
List[torch.Tensor]: Single-element list containing the rectified flow matching loss.
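
        Example (illustrative sketch; assumes `model` is a MotifImage on GPU, a DataLoader
        yielding [0, 1]-ranged image tensors with caption strings, and a standard optimizer):

            >>> images, captions = next(iter(dataloader))
            >>> loss, = model(images.cuda(), list(captions))
            >>> loss.backward()
            >>> optimizer.step()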
|
""" |
|
|
|
with torch.no_grad(): |
|
latents = self.vae.encode(images).latent_dist.sample() * self.vae.config.scaling_factor |
|
tokens, masks = self.tokenization(raw_text) |
|
tokens = [token.to(latents.device) for token in tokens] |
|
masks = [mask.to(latents.device) for mask in masks] |
|
text_embeddings, pooled_text_embeddings = self.text_encoding(tokens, masks) |
|
text_embeddings = self._drop_text_emb(text_embeddings) |
|
text_embeddings = [text_embedding.float() for text_embedding in text_embeddings] |
|
pooled_text_embeddings = pooled_text_embeddings.float() |
|
|
|
|
|
is_finetuning = self.config.height > 256 |
|
noise, noise_latents, t = self.get_noisy_input(latents, is_finetuning=is_finetuning) |
|
|
|
timesteps = self.discritize_timestep(t, self.n_timesteps) |
|
|
|
|
|
preds = self.dit(noise_latents, timesteps, text_embeddings, pooled_text_embeddings) |
|
|
|
|
|
loss = self.rectified_flow_loss(latents, noise, t, preds, use_weighting=self.use_weighting) |
|
|
|
return [loss] |
|
|
|
def _get_encoders(self) -> None: |
|
"""Initialize the VAE and text encoders.""" |
|
if self.config.vae_type == "SD3": |
|
self.vae = AutoencoderKL.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", subfolder="vae") |
|
elif self.config.vae_type == "SDXL": |
|
self.vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae") |
|
else: |
|
raise ValueError(f"VAE type must be `SD3` or `SDXL` but self.config.vae_type is {self.config.vae_type}") |
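# Three frozen text encoders provide SD3-style conditioning: T5-XXL supplies long
# sequence embeddings, while CLIP-L and OpenCLIP-bigG supply sequence embeddings plus
# the pooled embeddings that are concatenated downstream.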
|
|
|
|
|
|
|
self.t5 = T5EncoderModel.from_pretrained("google/flan-t5-xxl").to(dtype=torch.bfloat16) |
|
self.t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xxl") |
|
|
|
|
|
self.clip_l = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(dtype=torch.bfloat16) |
|
self.clip_l_tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14") |
|
|
|
|
|
self.clip_g = CLIPTextModel.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k").to(dtype=torch.bfloat16) |
|
self.clip_g_tokenizer = CLIPTokenizerFast.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k") |
|
|
|
self.tokenizers = [self.t5_tokenizer, self.clip_l_tokenizer, self.clip_g_tokenizer] |
|
self.text_encoders = [self.t5, self.clip_l, self.clip_g] |
|
|
|
def state_dict(self, destination=None, prefix="", keep_vars=False): |
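# Checkpoints only need the trainable DiT weights, so the frozen VAE and text-encoder
# parameters are stripped from the state dict before it is returned.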
|
state_dict = super(MotifImage, self).state_dict(destination, prefix, keep_vars) |
|
exclude_keys = ["t5.", "clip_l.", "clip_g.", "vae."] |
|
for key in list(state_dict.keys()): |
|
if any(key.startswith(exclude_key) for exclude_key in exclude_keys): |
|
state_dict.pop(key) |
|
return state_dict |
|
|
|
def load_state_dict(self, state_dict, strict=False): |
|
""" |
|
Load state dict and merge LoRA parameters if present. |
|
|
|
Args: |
|
state_dict (dict): State dictionary containing model parameters |
|
strict (bool): Whether to strictly enforce that the keys in state_dict match the keys in this module |
|
|
|
Returns: |
|
tuple: (missing_keys, unexpected_keys) lists of parameters that were missing or unexpected |
|
""" |
|
|
|
has_lora = any("lora_" in key for key in state_dict.keys()) |
|
|
|
if has_lora: |
|
|
|
if not hasattr(self.dit, "peft_config"): |
|
logger.info("Enabling LoRA for parameter merging...") |
|
|
|
lora_rank = getattr(self.config, "lora_rank", 64) |
|
lora_alpha = getattr(self.config, "lora_alpha", 8) |
|
self.enable_lora(lora_rank, lora_alpha) |
|
|
|
if has_lora: |
|
try: |
|
|
|
|
|
|
|
|
|
|
|
|
|
missing_keys, unexpected_keys = super().load_state_dict(state_dict, strict=False) |
|
|
|
logger.info("Merging LoRA parameters with base model...") |
|
for name, module in self.dit.named_modules(): |
|
if hasattr(module, "merge_and_unload"): |
|
module.merge_and_unload() |
|
|
|
logger.info("Successfully merged LoRA parameters") |
|
|
|
except Exception as e: |
|
logger.error(f"Error merging LoRA parameters: {str(e)}") |
|
raise |
|
else: |
|
missing_keys, unexpected_keys = super().load_state_dict(state_dict, strict=False) |
|
|
|
|
|
missing_top_levels = set() |
|
for key in missing_keys: |
|
top_level_name = key.split(".")[0] |
|
missing_top_levels.add(top_level_name) |
|
if missing_top_levels: |
|
logger.debug("Missing keys during loading at top level:") |
|
for name in missing_top_levels: |
|
logger.debug(name) |
|
|
|
if unexpected_keys: |
|
logger.debug("Unexpected keys found:") |
|
for key in unexpected_keys: |
|
logger.debug(key) |
|
|
|
return missing_keys, unexpected_keys |
|
|
|
def _freeze_encoders(self) -> None: |
|
""" |
|
Freeze the VAE and all text encoders.
|
""" |
|
for encoder_module in [self.vae, self.clip_l, self.clip_g, self.t5]: |
|
for param in encoder_module.parameters(): |
|
param.requires_grad = False |
|
|
|
def tokenization( |
|
self, raw_texts: List[str], repeat_if_short: bool = False |
|
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: |
|
""" |
|
Tokenizes a BATCH of input texts using multiple tokenizers efficiently. |
|
Optionally repeats each text to fill the max length if it's shorter, |
|
BEFORE passing the pre-processed batch to the tokenizer. |
|
|
|
Args: |
|
raw_texts (List[str]): A list of input text strings (the batch). |
|
repeat_if_short (bool): If True and a text is short, repeat that text |
|
to fill the context length. Defaults to False.
|
|
|
Returns: |
|
Tuple[List[torch.Tensor], List[torch.Tensor]]: |
|
- A list containing one batch tensor of input IDs per tokenizer. |
|
Each tensor shape: [batch_size, max_length] |
|
- A list containing one batch tensor of attention masks per tokenizer. |
|
Each tensor shape: [batch_size, max_length] |
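
        Example (illustrative; exact sequence lengths depend on each tokenizer's model_max_length):

            >>> tokens, masks = model.tokenization(["a cat", "a corgi surfing a wave"])
            >>> len(tokens), len(masks)  # one batch tensor per tokenizer (T5, CLIP-L, CLIP-G)
            (3, 3)
            >>> tokens[0].shape[0]  # batch dimension
            2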
|
""" |
|
final_batch_tokens = [] |
|
final_batch_masks = [] |
|
|
|
|
|
for tokenizer in self.tokenizers: |
|
effective_max_length = min(TOKEN_MAX_LENGTH, tokenizer.model_max_length) |
|
|
|
|
|
processed_texts_for_tokenizer = [] |
|
for text_item in raw_texts: |
|
|
|
processed_text = text_item |
|
|
|
if repeat_if_short: |
|
|
|
num_initial_tokens = len(text_item.split()) |
|
available_length = effective_max_length - 2 |
|
|
|
if num_initial_tokens > 0 and num_initial_tokens < available_length: |
|
num_additional_repeats = available_length // (num_initial_tokens + 1) |
|
if num_additional_repeats > 0: |
|
total_repeats = 1 + num_additional_repeats |
|
processed_text = " ".join([text_item] * total_repeats) |
|
|
|
|
|
processed_texts_for_tokenizer.append(processed_text) |
|
|
|
|
|
|
|
|
|
batch_tok_output = tokenizer( |
|
processed_texts_for_tokenizer, |
|
padding="max_length", |
|
max_length=effective_max_length, |
|
return_tensors="pt", |
|
truncation=True, |
|
) |
|
|
|
|
|
|
|
final_batch_tokens.append(batch_tok_output.input_ids) |
|
final_batch_masks.append(batch_tok_output.attention_mask) |
|
|
|
return final_batch_tokens, final_batch_masks |
|
|
|
@torch.no_grad() |
|
def text_encoding( |
|
self, tokens: List[torch.Tensor], masks, noisy_pad=False, zero_masking=True |
|
) -> Tuple[List[torch.Tensor], torch.Tensor]: |
|
""" |
|
Encode the tokenized text using multiple text encoders. |
|
|
|
Args: |
|
tokens (List[torch.Tensor]): List of tokenized text tensors, one per tokenizer (T5, CLIP-L, CLIP-G).
masks (List[torch.Tensor]): Attention masks corresponding to each entry in `tokens`.
noisy_pad (bool): If True, add small Gaussian noise to the embeddings at padding positions.
zero_masking (bool): If True, zero out the embeddings at padding positions.
|
|
|
Returns: |
|
Tuple[List[torch.Tensor], torch.Tensor]: Tuple containing a list of text embeddings and pooled text embeddings. |
|
""" |
|
t5_tokens, clip_l_tokens, clip_g_tokens = tokens |
|
t5_masks, clip_l_masks, clip_g_masks = masks |
|
t5_emb = self.t5(t5_tokens, attention_mask=t5_masks)[0] |
|
if zero_masking: |
|
t5_emb = t5_emb * (t5_tokens != self.t5_tokenizer.pad_token_id).unsqueeze(-1) |
|
if noisy_pad: |
|
t5_pad_noise = ( |
|
(t5_tokens == self.t5_tokenizer.pad_token_id).unsqueeze(-1) * torch.randn_like(t5_emb) * 0.008
|
) |
|
t5_emb = t5_emb + t5_pad_noise |
|
|
|
clip_l_emb = self.clip_l(input_ids=clip_l_tokens, output_hidden_states=True) |
|
clip_g_emb = self.clip_g(input_ids=clip_g_tokens, output_hidden_states=True) |
|
clip_l_emb_pooled = clip_l_emb.pooler_output |
|
clip_g_emb_pooled = clip_g_emb.pooler_output |
|
|
|
clip_l_emb = clip_l_emb.last_hidden_state |
|
clip_g_emb = clip_g_emb.last_hidden_state |
|
|
|
def masking_wo_first_eos(token, eos): |
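# Keep every non-EOS token plus the first EOS (whose hidden state carries CLIP's
# sequence summary); everything after the first EOS is masked to zero.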
|
idx = (token != eos).sum(dim=1) |
|
mask = token != eos |
|
arange = torch.arange(mask.size(0), device=token.device)
|
mask[arange, idx] = True |
|
mask = mask.unsqueeze(-1) |
|
return mask |
|
|
|
if zero_masking: |
|
clip_l_emb = clip_l_emb * masking_wo_first_eos( |
|
clip_l_tokens, self.clip_l_tokenizer.eos_token_id |
|
) |
|
clip_g_emb = clip_g_emb * masking_wo_first_eos( |
|
clip_g_tokens, self.clip_g_tokenizer.eos_token_id |
|
) |
|
|
|
if noisy_pad: |
|
clip_l_pad_noise = ( |
|
~masking_wo_first_eos(clip_l_tokens, self.clip_l_tokenizer.eos_token_id) |
|
* torch.randn_like(clip_l_emb)
|
* 0.08 |
|
) |
|
clip_g_pad_noise = ( |
|
~masking_wo_first_eos(clip_g_tokens, self.clip_g_tokenizer.eos_token_id) |
|
* torch.randn_like(clip_g_emb)
|
* 0.08 |
|
) |
|
clip_l_emb = clip_l_emb + clip_l_pad_noise |
|
clip_g_emb = clip_g_emb + clip_g_pad_noise |
|
|
|
encodings = [t5_emb, clip_l_emb, clip_g_emb] |
|
pooled_encodings = torch.cat([clip_l_emb_pooled, clip_g_emb_pooled], dim=-1) |
|
|
|
return encodings, pooled_encodings |
|
|
|
@torch.no_grad() |
|
def prompt_embedding(self, prompts: List[str], device, noisy_pad=False, zero_masking=True):
|
tokens, masks = self.tokenization(prompts) |
|
tokens = [token.to(device) for token in tokens] |
|
masks = [mask.to(device) for mask in masks] |
|
text_embeddings, pooled_text_embeddings = self.text_encoding( |
|
tokens, masks, noisy_pad=noisy_pad, zero_masking=zero_masking |
|
) |
|
text_embeddings = [text_embedding.bfloat16() for text_embedding in text_embeddings] |
|
pooled_text_embeddings = pooled_text_embeddings.bfloat16() |
|
return text_embeddings, pooled_text_embeddings |
|
|
|
@torch.no_grad() |
|
def sample( |
|
self, |
|
raw_text: List[str], |
|
steps: int = 50, |
|
guidance_scale: float = 7.5, |
|
resolution: Tuple[int, int] = (256, 256),
|
pre_latent=None, |
|
pre_timestep=None, |
|
step_scaling=1.0, |
|
noisy_pad=False, |
|
zero_masking=False, |
|
negative_prompt: Optional[List[str]] = None, |
|
device: str = "cuda", |
|
rescale_cfg=-1.0, |
|
clip_t=[0.0, 1.0], |
|
use_linear_quadratic_schedule=False, |
|
linear_quadratic_emulating_steps=250, |
|
prompt_rewriter=None, |
|
moderator=None, |
|
get_intermediate_steps: bool = False, |
|
) -> Union[List[Image.Image], Tuple[List[Image.Image], List[List[Image.Image]]]]: |
|
""" |
|
Sample images using flow matching. Optionally returns intermediate step images |
|
calculated via observed average velocity method. |
|
|
|
Args: |
|
raw_text (List[str]): raw text prompts |
|
steps (int, optional): number of function estimations for flow matching ODE. Defaults to 50. |
|
guidance_scale (float, optional): classifier free guidance scale. Defaults to 7.5. |
|
resolution (Tuple[int, int], optional): output resolution (height, width) of generated images. Defaults to (256, 256).
|
device (str, optional): Defaults to 'cuda'. |
|
pre_latent (Tensor, optional): the optional input to generate image with pre-defined latents. |
|
for instance, it would be utilized for denoising or image-editing. |
|
pre_timestep (float [0,1], optional): the pre-defined timestep. with `pre_latent`, image generation |
|
can be done by starting with intermediate timestep. |
|
step_scaling (float, defaults to 1.0): scaling factor for each ODE step (currently unused in this implementation).
|
use_linear_quadratic_schedule (bool, defaults to False): if True, use the linear-quadratic t schedule; otherwise use a linear t schedule.
|
linear_quadratic_emulating_steps (int, defaults to 250): N value of the linear-quadratic t schedule from the Meta MovieGen paper.
|
Reference: (https://ai.meta.com/static-resource/movie-gen-research-paper) Figure 10 |
|
get_intermediate_steps (bool, optional): Whether to calculate and return intermediate step images. |
|
Calculation is based on initial_noise - avg(velocity). Defaults to False.
|
|
|
Returns: |
|
Union[List[PIL.Image], Tuple[List[PIL.Image], List[List[PIL.Image]]]]: |
|
If get_intermediate_steps is False: Returns a list of final PIL images. |
|
If get_intermediate_steps is True: Returns a tuple containing: |
|
- List[PIL.Image]: Final output PIL images. |
|
- List[List[PIL.Image]]: List of intermediate PIL images. Each inner list contains |
|
the batch of images for one intermediate step. |
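
        Example (illustrative sketch; assumes a CUDA device and loaded weights):

            >>> finals, intermediates = model.sample(
            ...     ["an astronaut riding a horse"],
            ...     steps=20,
            ...     guidance_scale=7.5,
            ...     resolution=(512, 512),
            ...     get_intermediate_steps=True,
            ... )
            >>> finals[0].save("astronaut.png")
            >>> len(intermediates)  # one list of PIL images per sampling step
            20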
|
""" |
|
if prompt_rewriter: |
|
prompts = [prompt_rewriter.rewrite(prompt) for prompt in raw_text] |
|
else: |
|
prompts = raw_text |
|
|
|
|
|
if prompts == raw_text and prompt_rewriter is not None: |
|
logger.debug("Prompt rewriter did not change the prompts.") |
|
elif prompt_rewriter is None: |
|
logger.debug("Prompt rewriter not provided.") |
|
|
|
if moderator is None: |
|
is_safe_prompt = [True for _ in prompts] |
|
else: |
|
is_safe_prompt = [moderator.is_safe_content(prompt, threshold=0.7) for prompt in prompts]
|
if not all(is_safe_prompt): |
|
logger.warning("Noxious prompt detected. Output image(s) will be blurred.") |
|
|
|
b = len(prompts) |
|
h, w = resolution |
|
|
|
|
|
latent_channels = SD3_LATENT_CHANNEL if self.config.vae_type == "SD3" else LATENT_CHANNELS  # 16 for the SD3 VAE, 4 for SDXL
|
if pre_latent is None: |
|
initial_noise = randn_tensor( |
|
(b, latent_channels, h // VAE_DOWNSCALE_FACTOR, w // VAE_DOWNSCALE_FACTOR), |
|
device=device, |
|
dtype=torch.float32, |
|
) |
|
else: |
|
initial_noise = pre_latent.to(device=device, dtype=torch.float32) |
|
if pre_timestep is not None and pre_timestep < 1.0: |
|
logger.warning( |
|
"Using pre_latent as initial_noise for average calculation, but pre_timestep suggests it's not pure noise. Results might be unexpected." |
|
) |
|
|
|
latents = initial_noise.clone() |
|
|
|
|
|
text_embeddings, pooled_text_embeddings = self.prompt_embedding( |
|
prompts, latents.device, noisy_pad=noisy_pad, zero_masking=zero_masking |
|
) |
|
text_embeddings = [emb.to(device=latents.device, dtype=torch.bfloat16) for emb in text_embeddings] |
|
pooled_text_embeddings = pooled_text_embeddings.to(device=latents.device, dtype=torch.bfloat16) |
|
|
|
do_classifier_free_guidance = guidance_scale > 1.0 |
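# The unconditional branch for classifier-free guidance uses all-zero text embeddings
# rather than encodings of an empty (or negative) prompt; the `negative_prompt`
# argument is currently not routed through the text encoders.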
|
if do_classifier_free_guidance: |
|
negative_text_embeddings = [ |
|
torch.zeros_like(text_embedding, device=text_embedding.device) for text_embedding in text_embeddings |
|
] |
|
negative_pooled_text_embeddings = torch.zeros_like( |
|
pooled_text_embeddings, device=pooled_text_embeddings.device |
|
) |
|
text_embeddings = [ |
|
torch.cat([text_embedding, negative_text_embedding], dim=0) |
|
for text_embedding, negative_text_embedding in zip(text_embeddings, negative_text_embeddings) |
|
] |
|
pooled_text_embeddings = torch.cat([pooled_text_embeddings, negative_pooled_text_embeddings], dim=0) |
|
|
# Flow-matching time runs from sigma = 1 (pure noise) down to sigma = 0 (data).
sigmas = torch.linspace(1, 0, steps + 1) if pre_timestep is None else torch.linspace(pre_timestep, 0, steps + 1)
|
|
|
if use_linear_quadratic_schedule: |
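# Linear-quadratic schedule (MovieGen, Figure 10): the first half of the steps follows
# the first steps // 2 points of an N-step linear schedule, and the remaining range is
# covered quadratically so the full [1, 0] span is traversed in only `steps` evaluations.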
|
|
|
assert steps % 2 == 0 |
|
N = linear_quadratic_emulating_steps |
|
sigmas = torch.concat( |
|
[ |
|
torch.linspace(1, 0, N + 1)[: steps // 2], |
|
torch.linspace(0, 1, steps // 2 + 1) ** 2 * (steps // 2 * 1 / N - 1) - (steps // 2 * 1 / N - 1), |
|
] |
|
) |
|
|
|
|
|
|
|
intermediate_latents = [] if get_intermediate_steps else None |
|
predicted_velocities = [] |
|
sigma_history = [] |
|
|
|
for infer_step, t in tqdm.tqdm(enumerate(sigmas[:-1]), total=len(sigmas[:-1]), desc="Sampling"): |
|
|
|
if do_classifier_free_guidance: |
|
input_latents = torch.cat([latents] * 2, dim=0) |
|
else: |
|
input_latents = latents |
|
|
|
|
|
timestep = (t * 1000).round().long().to(latents.device) |
|
timestep = timestep.expand(input_latents.shape[0]).to(torch.bfloat16) |
|
|
|
|
|
dx = self.dit(input_latents.to(torch.bfloat16), timestep, text_embeddings, pooled_text_embeddings) |
|
dt = sigmas[infer_step + 1] - sigmas[infer_step] |
|
sigma_history.append(dt) |
|
|
|
|
|
if do_classifier_free_guidance: |
|
cond_dx, uncond_dx = dx.chunk(2) |
|
current_guidance_scale = guidance_scale if clip_t[0] <= t <= clip_t[1] else 1.0
|
dx = uncond_dx + current_guidance_scale * (cond_dx - uncond_dx) |
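# Optional guidance rescaling: match the per-sample std of the guided velocity to that
# of the conditional prediction, then blend with the unrescaled guided velocity by
# `rescale_cfg` to damp over-saturation at high guidance scales.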
|
|
|
if rescale_cfg > 0.0: |
|
std_pos = torch.std(cond_dx, dim=[1, 2, 3], keepdim=True, unbiased=False) + 1e-5 |
|
std_cfg = torch.std(dx, dim=[1, 2, 3], keepdim=True, unbiased=False) + 1e-5 |
|
factor = std_pos / std_cfg |
|
factor = rescale_cfg * factor + (1.0 - rescale_cfg) |
|
dx = dx * factor |
|
|
|
|
|
predicted_velocities.append(dx.clone()) |
|
|
|
|
|
latents = latents + dt * dx |
|
|
|
|
|
if get_intermediate_steps: |
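# Observed-average-velocity preview: weight each predicted velocity by its normalized
# step size and subtract the weighted average from the initial noise, giving a
# single-jump estimate of the current sample for visualization.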
|
dxs = torch.stack(predicted_velocities) |
|
|
|
sigma_sum = sum(sigma_history) |
|
normalized_sigma_history = [s / (sigma_sum) for s in sigma_history] |
|
dts = torch.tensor(normalized_sigma_history, device=dxs.device, dtype=dxs.dtype).view(-1, 1, 1, 1, 1) |
|
|
|
avg_dx = torch.sum(dxs * dts, dim=0) |
|
observed_state = initial_noise - avg_dx |
|
intermediate_latents.append(observed_state.clone()) |
|
|
|
|
|
self.vae = self.vae.to(device=latents.device, dtype=torch.float32) |
|
final_latents_scaled = latents.to(torch.float32) / self.vae.config.scaling_factor |
|
final_image_tensors = self.vae.decode(final_latents_scaled, return_dict=False)[0] + self.vae.config.shift_factor |
|
final_image_tensors = ((final_image_tensors + 1.0) / 2.0).clamp(0.0, 1.0) |
|
|
|
final_pil_images = [] |
|
for i, image_tensor in enumerate(final_image_tensors): |
|
img = T.ToPILImage()(image_tensor.cpu()) |
|
if not is_safe_prompt[i]: |
|
img = img.filter(ImageFilter.GaussianBlur(radius=30)) |
|
final_pil_images.append(img) |
|
|
|
|
|
if get_intermediate_steps: |
|
intermediate_pil_images = [] |
|
|
|
for step_latents in tqdm.tqdm(intermediate_latents, desc="Decoding intermediates"): |
|
step_latents_scaled = ( |
|
step_latents.to(dtype=torch.float32, device=latents.device) / self.vae.config.scaling_factor
|
) |
|
step_image_tensors = ( |
|
self.vae.decode(step_latents_scaled, return_dict=False)[0] + self.vae.config.shift_factor |
|
) |
|
step_image_tensors = ((step_image_tensors + 1.0) / 2.0).clamp(0.0, 1.0) |
|
|
|
current_step_pil = [] |
|
for i, image_tensor in enumerate(step_image_tensors): |
|
img = T.ToPILImage()(image_tensor.cpu()) |
|
|
|
if not is_safe_prompt[i]: |
|
img = img.filter(ImageFilter.GaussianBlur(radius=30)) |
|
current_step_pil.append(img) |
|
intermediate_pil_images.append(current_step_pil) |
|
|
|
return final_pil_images, intermediate_pil_images |
|
else: |
|
return final_pil_images |
|
|
|
@torch.no_grad() |
|
def eval_with_loss(self, images, raw_text): |
|
latents = self.vae.encode(images).latent_dist.sample() * self.vae.config.scaling_factor |
|
|
|
tokens, masks = self.tokenization(raw_text) |
|
tokens = [token.to(latents.device) for token in tokens] |
|
masks = [mask.to(latents.device) for mask in masks] |
|
text_embeddings, pooled_text_embeddings = self.text_encoding(tokens, masks) |
|
text_embeddings = [text_embedding.float() for text_embedding in text_embeddings]
|
pooled_text_embeddings = pooled_text_embeddings.float() |
|
|
|
|
|
is_finetuning = self.config.height > 256 |
|
noise, noise_latents, t = self.get_noisy_input(latents, is_finetuning=is_finetuning) |
|
timesteps = self.discritize_timestep(t, self.n_timesteps) |
|
|
|
|
|
preds = self.dit(noise_latents, timesteps, text_embeddings, pooled_text_embeddings) |
|
|
|
|
|
loss = self.rectified_flow_loss(latents, noise, t, preds, reduce="none", use_weighting=False).mean(
|
dim=[1, 2, 3] |
|
) |
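# Bin the per-sample losses into 8 uniform timestep intervals so evaluation can report
# the loss as a function of noise level.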
|
|
|
intervals = np.linspace(0, 1, 9) |
|
t_interval = [(intervals[i], intervals[i + 1]) for i in range(len(intervals) - 1)] |
|
|
|
loss_bins = defaultdict(list) |
|
for i, interval in enumerate(t_interval, 0): |
|
idx = (interval[0] < t) & (t < interval[1]) |
|
loss_bins[i].append(loss[idx]) |
|
|
|
return loss_bins |
|
|