"""
Dongpin Oh: [email protected]
"""
import time
import numpy as np
import torch
import torch.nn.functional as F
from diffusers.utils.torch_utils import randn_tensor
from loguru import logger
class FlowMixin:
"""Mixin class for flow-based models."""
_is_noise_initialized = False
    MIN_STD: float = 0.0  # minimum standard deviation of the noise for flow matching
    CLAMP_CONTINUOUS_TIME: float = 0.0  # margin for clamping continuous t away from 0 and 1
def _timestep_shifting(self, timesteps: torch.Tensor, alpha: float = 3.0) -> torch.Tensor:
"""
        Adjust the timesteps for higher-resolution images by adding more noise:
        higher-resolution images have more pixels, so more noise is needed to destroy their signal.
        NOTE that the timestep must be reversed, unlike the original SD3-style timestep shifting,
        since the flow timestep here is reversed (t=0: original image; t=1: pure noise).
Args:
timesteps (torch.Tensor): The original timesteps.
alpha (float, optional): Scaling factor for timestep shifting. Defaults to 3.0.
Returns:
torch.Tensor: The reversed and shifted timesteps.
"""
shifted_t = alpha * timesteps / (1 + (alpha - 1) * timesteps)
reversed_t = 1 - shifted_t
return reversed_t
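    # Worked example (illustrative): with alpha = 3.0 and timesteps = 0.5,
    # shifted_t = 3 * 0.5 / (1 + (3 - 1) * 0.5) = 0.75, so the returned
    # reversed timestep is 1 - 0.75 = 0.25.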
def get_noisy_input(
self,
input: torch.Tensor,
normal_mean: float = 0.0,
normal_std: float = 1.0,
is_finetuning: bool = False,
t: torch.Tensor | None = None,
n_timesteps: int = 1000,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
        Generate a noisy input based on the optimal-transport flow.
Args:
input (torch.Tensor): Input tensor.
normal_mean (float, optional): Mean of the normal distribution for noise. Defaults to 0.0.
normal_std (float, optional): Standard deviation of the normal distribution for noise. Defaults to 1.0.
is_finetuning (bool, optional): Whether the model is in finetuning mode. Defaults to False.
t (torch.Tensor | None, optional): Predefined timesteps. If None, timesteps are sampled. Defaults to None.
            n_timesteps (int, optional): Number of discrete timesteps. Defaults to 1000.
Returns:
tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Tuple containing noise, noisy input, and timestep.
                - The returned timestep t is continuous in [0, 1] and cannot be used directly
                  for the timestep embedding; it must first be discretized via self.discritize_timestep().
"""
b = input.shape[0]
if not FlowMixin._is_noise_initialized:
logger.warning("The torch random seed is changed when generating the initial random noise.")
current_time = int(time.time() * 1000)
torch.manual_seed(current_time)
FlowMixin._is_noise_initialized = True
        noise = randn_tensor(input.shape, device=input.device, dtype=input.dtype)
        # Sample the timestep from a logit-normal distribution when none is given
        if t is None:
            # NOTE: sampling t as sigmoid(N(normal_mean, normal_std)) concentrates
            # probability on intermediate timesteps, which are the most informative
            # part of the flow ODE
            t = torch.sigmoid(torch.randn(b, device=input.device) * normal_std + normal_mean)
# Clamp t to be within the interval [0, 1] for numerical stability
t = t.clamp(0 + self.CLAMP_CONTINUOUS_TIME, 1 - self.CLAMP_CONTINUOUS_TIME)
if is_finetuning:
t = self._timestep_shifting(t)
        # Unsqueeze t so it broadcasts over the non-batch dimensions of the input
for _ in range(len(noise.shape) - 1):
t = t.unsqueeze(1)
# Generate the noisy input
noisy_input = (1 - t) * input + (self.MIN_STD + (1 - self.MIN_STD) * t) * noise
t_squeezed = t.squeeze()
if t_squeezed.dim() == 0:
t_squeezed = t_squeezed.unsqueeze(0)
return noise, noisy_input, t_squeezed
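    # Hedged usage sketch (illustrative only; `denoiser` is a hypothetical
    # network, not part of this file):
    #   noise, noisy_x, t = self.get_noisy_input(latents)
    #   t_ids = self.discritize_timestep(t)  # embed-ready integer timesteps
    #   preds = denoiser(noisy_x, t_ids)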
@torch.no_grad()
def _logit_norm(self, t: torch.Tensor, m: float = 0, s: float = 1) -> torch.Tensor:
"""
        Compute the loss weight for the flow-matching loss.
        High weights are assigned to intermediate timesteps, since those are the
        hardest to match, following https://arxiv.org/pdf/2403.03206.pdf
Args:
t (torch.Tensor): Timestep tensor. (0 to 1)
m (float, optional): Mean of the logit distribution. Defaults to 0.
s (float, optional): Standard deviation of the logit distribution. Defaults to 1.
Returns:
torch.Tensor: Weight tensor for the flow-matching loss.
"""
coef = (1 / (s * ((2 * np.pi) ** 0.5))) * (1 / (t * (1 - t)))
def logit(x):
return torch.log(x) - torch.log(1 - x)
exp = torch.exp(-((logit(t) - m) ** 2) / (2 * s**2))
return coef * exp
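    # Worked example (illustrative): with m = 0 and s = 1 the weight peaks at
    # intermediate timesteps. At t = 0.5, logit(t) = 0, so the weight is
    # 1 / sqrt(2 * pi) * 1 / (0.5 * 0.5) ~= 1.60; at t = 0.9 it drops to
    # ~0.40, i.e. mid-trajectory samples contribute roughly 4x more to the loss.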
def rectified_flow_loss(
self,
input: torch.Tensor,
noise: torch.Tensor,
t: torch.Tensor,
preds: torch.Tensor,
use_weighting: bool = False,
reduce: str = "mean",
) -> torch.Tensor:
"""
Compute the rectified flow loss, https://arxiv.org/pdf/2403.03206.pdf
Args:
input (torch.Tensor): Input tensor.
noise (torch.Tensor): Noise tensor.
t (torch.Tensor): Timestep tensor.
preds (torch.Tensor): Predicted tensor.
use_weighting (bool, optional): Whether to use weighting for the loss. Defaults to False.
reduce (str, optional): Reduction method for the loss. Options are 'mean' or 'none'. Defaults to 'mean'.
Returns:
torch.Tensor: Rectified flow loss.
"""
        # Match dimensions for broadcasting
t = t.reshape(t.shape[0], *[1 for _ in range(len(input.shape) - len(t.shape))])
target_flow = (1 - self.MIN_STD) * noise - input
loss = F.mse_loss(preds.float(), target_flow.float(), reduction="none")
if use_weighting:
weight = self._logit_norm(t).detach()
loss = loss * weight
        if reduce == "mean":
            loss = loss.mean()
        elif reduce != "none":
            raise NotImplementedError(f"Unsupported reduction: {reduce}")
return loss
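    # Note: with MIN_STD = 0 the regression target above reduces to
    # (noise - input), which is the constant velocity
    # d/dt [(1 - t) * input + t * noise] of the straight-line
    # (optimal-transport) interpolation built in get_noisy_input().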
def discritize_timestep(self, t: torch.Tensor, n_timesteps: int = 1000) -> torch.Tensor:
"""
        Discretize a continuous timestep in [0, 1] into an integer index for the timestep embedding.
Args:
t (torch.Tensor): Continuous timestep.
n_timesteps (int, optional): Number of discrete timesteps. Defaults to 1000.
Returns:
torch.Tensor: Discretized timestep tensor.
"""
return (t * n_timesteps).round().long()
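

if __name__ == "__main__":
    # Minimal smoke test: a hedged sketch of how the mixin is meant to be used,
    # not part of the original MotifVision training code. `_DemoModel` and the
    # random `preds` stand in for a real denoising network.
    class _DemoModel(FlowMixin):
        pass

    flow = _DemoModel()
    latents = torch.randn(4, 3, 8, 8)  # hypothetical latent batch
    noise, noisy_latents, t = flow.get_noisy_input(latents)
    t_ids = flow.discritize_timestep(t)  # integer timesteps for the embedding
    preds = torch.randn_like(noisy_latents)  # stand-in for a model prediction
    loss = flow.rectified_flow_loss(latents, noise, t, preds, use_weighting=True)
    print(f"t_ids={t_ids.tolist()}, loss={loss.item():.4f}")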