clean up

Browse files

Files changed (4) hide show

README.md +8 -1
main.py +13 -3
mvdream/models.py +13 -25
mvdream/util.py +0 -196

README.md CHANGED Viewed

@@ -12,7 +12,14 @@ wget https://raw.githubusercontent.com/bytedance/MVDream/main/mvdream/configs/sd
 python convert_mvdream_to_diffusers.py --checkpoint_path ./sd-v2.1-base-4view.pt --dump_path ./weights --original_config_file ./sd-v2-base.yaml --half --to_safetensors --test
 ```
-### run pipeline
 ```python
 import torch
 import kiui

 python convert_mvdream_to_diffusers.py --checkpoint_path ./sd-v2.1-base-4view.pt --dump_path ./weights --original_config_file ./sd-v2-base.yaml --half --to_safetensors --test
 ```
+### usage
+example:
+```bash
+python main.py "a cute owl"
+```
+detailed usage:
 ```python
 import torch
 import kiui

main.py CHANGED Viewed

@@ -1,11 +1,21 @@
 import torch
 import kiui
 from mvdream.pipeline_mvdream import MVDreamStableDiffusionPipeline
 pipe = MVDreamStableDiffusionPipeline.from_pretrained('./weights', torch_dtype=torch.float16)
 pipe = pipe.to("cuda")
-prompt = "a photo of an astronaut riding a horse on mars"
-image = pipe(prompt)
-kiui.vis.plot_image(image)

 import torch
 import kiui
+import numpy as np
+import argparse
 from mvdream.pipeline_mvdream import MVDreamStableDiffusionPipeline
 pipe = MVDreamStableDiffusionPipeline.from_pretrained('./weights', torch_dtype=torch.float16)
 pipe = pipe.to("cuda")
+parser = argparse.ArgumentParser(description='MVDream')
+parser.add_argument('prompt', type=str, default="a cute owl 3d model")
+args = parser.parse_args()
+while True:
+    image = pipe(args.prompt)
+    grid = np.concatenate([
+        np.concatenate([image[0], image[2]], axis=0),
+        np.concatenate([image[1], image[3]], axis=0),
+    ], axis=1)
+    kiui.vis.plot_image(grid)

mvdream/models.py CHANGED Viewed

@@ -10,10 +10,8 @@ from abc import abstractmethod
 from .util import (
     checkpoint,
     conv_nd,
-    linear,
     avg_pool_nd,
     zero_module,
-    normalization,
     timestep_embedding,
 )
 from .attention import SpatialTransformer, SpatialTransformer3D
@@ -56,7 +54,7 @@ class MultiViewUNetWrapperModel(ModelMixin, ConfigMixin):
             adm_in_channels=None,
             camera_dim=None,):
         super().__init__()
-        self.unet: MultiViewUNetModel = MultiViewUNetModel(
             image_size=image_size,
             in_channels=in_channels,
             model_channels=model_channels,
@@ -218,7 +216,7 @@ class ResBlock(TimestepBlock):
         self.use_scale_shift_norm = use_scale_shift_norm
         self.in_layers = nn.Sequential(
-            normalization(channels),
             nn.SiLU(),
             conv_nd(dims, channels, self.out_channels, 3, padding=1),
         )
@@ -236,13 +234,13 @@ class ResBlock(TimestepBlock):
         self.emb_layers = nn.Sequential(
             nn.SiLU(),
-            linear(
                 emb_channels,
                 2 * self.out_channels if use_scale_shift_norm else self.out_channels,
             ),
         )
         self.out_layers = nn.Sequential(
-            normalization(self.out_channels),
             nn.SiLU(),
             nn.Dropout(p=dropout),
             zero_module(conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)),
@@ -310,7 +308,7 @@ class AttentionBlock(nn.Module):
             assert (channels % num_head_channels == 0), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
             self.num_heads = channels // num_head_channels
         self.use_checkpoint = use_checkpoint
-        self.norm = normalization(channels)
         self.qkv = conv_nd(1, channels, channels * 3, 1)
         if use_new_attention_order:
             # split qkv before split heads
@@ -418,16 +416,6 @@ class QKVAttention(nn.Module):
         return count_flops_attn(model, _x, y)
-class Timestep(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.dim = dim
-    def forward(self, t):
-        return timestep_embedding(t, self.dim)
 class MultiViewUNetModel(nn.Module):
     """
     The full multi-view UNet model with attention, timestep embedding and camera embedding.
@@ -545,17 +533,17 @@ class MultiViewUNetModel(nn.Module):
         time_embed_dim = model_channels * 4
         self.time_embed = nn.Sequential(
-            linear(model_channels, time_embed_dim),
             nn.SiLU(),
-            linear(time_embed_dim, time_embed_dim),
         )
         if camera_dim is not None:
             time_embed_dim = model_channels * 4
             self.camera_embed = nn.Sequential(
-                linear(camera_dim, time_embed_dim),
                 nn.SiLU(),
-                linear(time_embed_dim, time_embed_dim),
             )
         if self.num_classes is not None:
@@ -567,9 +555,9 @@ class MultiViewUNetModel(nn.Module):
             elif self.num_classes == "sequential":
                 assert adm_in_channels is not None
                 self.label_emb = nn.Sequential(nn.Sequential(
-                    linear(adm_in_channels, time_embed_dim),
                     nn.SiLU(),
-                    linear(time_embed_dim, time_embed_dim),
                 ))
             else:
                 raise ValueError()
@@ -722,13 +710,13 @@ class MultiViewUNetModel(nn.Module):
                 self._feature_size += ch
         self.out = nn.Sequential(
-            normalization(ch),
             nn.SiLU(),
             zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
         )
         if self.predict_codebook_ids:
             self.id_predictor = nn.Sequential(
-                normalization(ch),
                 conv_nd(dims, model_channels, n_embed, 1),
                 #nn.LogSoftmax(dim=1)  # change to cross_entropy and produce non-normalized logits
             )

 from .util import (
     checkpoint,
     conv_nd,
     avg_pool_nd,
     zero_module,
     timestep_embedding,
 )
 from .attention import SpatialTransformer, SpatialTransformer3D
             adm_in_channels=None,
             camera_dim=None,):
         super().__init__()
+        self.unet = MultiViewUNetModel(
             image_size=image_size,
             in_channels=in_channels,
             model_channels=model_channels,
         self.use_scale_shift_norm = use_scale_shift_norm
         self.in_layers = nn.Sequential(
+            nn.GroupNorm(32, channels),
             nn.SiLU(),
             conv_nd(dims, channels, self.out_channels, 3, padding=1),
         )
         self.emb_layers = nn.Sequential(
             nn.SiLU(),
+            nn.Linear(
                 emb_channels,
                 2 * self.out_channels if use_scale_shift_norm else self.out_channels,
             ),
         )
         self.out_layers = nn.Sequential(
+            nn.GroupNorm(32, self.out_channels),
             nn.SiLU(),
             nn.Dropout(p=dropout),
             zero_module(conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)),
             assert (channels % num_head_channels == 0), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
             self.num_heads = channels // num_head_channels
         self.use_checkpoint = use_checkpoint
+        self.norm = nn.GroupNorm(32, channels)
         self.qkv = conv_nd(1, channels, channels * 3, 1)
         if use_new_attention_order:
             # split qkv before split heads
         return count_flops_attn(model, _x, y)
 class MultiViewUNetModel(nn.Module):
     """
     The full multi-view UNet model with attention, timestep embedding and camera embedding.
         time_embed_dim = model_channels * 4
         self.time_embed = nn.Sequential(
+            nn.Linear(model_channels, time_embed_dim),
             nn.SiLU(),
+            nn.Linear(time_embed_dim, time_embed_dim),
         )
         if camera_dim is not None:
             time_embed_dim = model_channels * 4
             self.camera_embed = nn.Sequential(
+                nn.Linear(camera_dim, time_embed_dim),
                 nn.SiLU(),
+                nn.Linear(time_embed_dim, time_embed_dim),
             )
         if self.num_classes is not None:
             elif self.num_classes == "sequential":
                 assert adm_in_channels is not None
                 self.label_emb = nn.Sequential(nn.Sequential(
+                    nn.Linear(adm_in_channels, time_embed_dim),
                     nn.SiLU(),
+                    nn.Linear(time_embed_dim, time_embed_dim),
                 ))
             else:
                 raise ValueError()
                 self._feature_size += ch
         self.out = nn.Sequential(
+            nn.GroupNorm(32, ch),
             nn.SiLU(),
             zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
         )
         if self.predict_codebook_ids:
             self.id_predictor = nn.Sequential(
+                nn.GroupNorm(32, ch),
                 conv_nd(dims, model_channels, n_embed, 1),
                 #nn.LogSoftmax(dim=1)  # change to cross_entropy and produce non-normalized logits
             )

mvdream/util.py CHANGED Viewed

@@ -10,136 +10,7 @@
 import math
 import torch
 import torch.nn as nn
-import numpy as np
-import importlib
 from einops import repeat
-from typing import Any
-def instantiate_from_config(config):
-    if not "target" in config:
-        if config == '__is_first_stage__':
-            return None
-        elif config == "__is_unconditional__":
-            return None
-        raise KeyError("Expected key `target` to instantiate.")
-    return get_obj_from_str(config["target"])(**config.get("params", dict()))
-def get_obj_from_str(string, reload=False):
-    module, cls = string.rsplit(".", 1)
-    if reload:
-        module_imp = importlib.import_module(module)
-        importlib.reload(module_imp)
-    return getattr(importlib.import_module(module, package=None), cls)
-def make_beta_schedule(schedule,
-                       n_timestep,
-                       linear_start=1e-4,
-                       linear_end=2e-2,
-                       cosine_s=8e-3):
-    if schedule == "linear":
-        betas = (torch.linspace(linear_start**0.5,
-                                linear_end**0.5,
-                                n_timestep,
-                                dtype=torch.float64)**2)
-    elif schedule == "cosine":
-        timesteps = (
-            torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep +
-            cosine_s)
-        alphas = timesteps / (1 + cosine_s) * np.pi / 2
-        alphas = torch.cos(alphas).pow(2)
-        alphas = alphas / alphas[0]
-        betas = 1 - alphas[1:] / alphas[:-1]
-        betas = np.clip(betas, a_min=0, a_max=0.999)
-    elif schedule == "sqrt_linear":
-        betas = torch.linspace(linear_start,
-                               linear_end,
-                               n_timestep,
-                               dtype=torch.float64)
-    elif schedule == "sqrt":
-        betas = torch.linspace(linear_start,
-                               linear_end,
-                               n_timestep,
-                               dtype=torch.float64)**0.5
-    else:
-        raise ValueError(f"schedule '{schedule}' unknown.")
-    return betas.numpy()  # type: ignore
-def make_ddim_timesteps(ddim_discr_method,
-                        num_ddim_timesteps,
-                        num_ddpm_timesteps,
-                        verbose=True):
-    if ddim_discr_method == 'uniform':
-        c = num_ddpm_timesteps // num_ddim_timesteps
-        ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
-    elif ddim_discr_method == 'quad':
-        ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8),
-                                       num_ddim_timesteps))**2).astype(int)
-    else:
-        raise NotImplementedError(
-            f'There is no ddim discretization method called "{ddim_discr_method}"'
-        )
-    # assert ddim_timesteps.shape[0] == num_ddim_timesteps
-    # add one to get the final alpha values right (the ones from first scale to data during sampling)
-    steps_out = ddim_timesteps + 1
-    if verbose:
-        print(f'Selected timesteps for ddim sampler: {steps_out}')
-    return steps_out
-def make_ddim_sampling_parameters(alphacums,
-                                  ddim_timesteps,
-                                  eta,
-                                  verbose=True):
-    # select alphas for computing the variance schedule
-    alphas = alphacums[ddim_timesteps]
-    alphas_prev = np.asarray([alphacums[0]] +
-                             alphacums[ddim_timesteps[:-1]].tolist())
-    # according the the formula provided in https://arxiv.org/abs/2010.02502
-    sigmas = eta * np.sqrt(
-        (1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
-    if verbose:
-        print(
-            f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}'
-        )
-        print(
-            f'For the chosen value of eta, which is {eta}, '
-            f'this results in the following sigma_t schedule for ddim sampler {sigmas}'
-        )
-    return sigmas, alphas, alphas_prev
-def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
-    """
-    Create a beta schedule that discretizes the given alpha_t_bar function,
-    which defines the cumulative product of (1-beta) over time from t = [0,1].
-    :param num_diffusion_timesteps: the number of betas to produce.
-    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
-                      produces the cumulative product of (1-beta) up to that
-                      part of the diffusion process.
-    :param max_beta: the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-    """
-    betas = []
-    for i in range(num_diffusion_timesteps):
-        t1 = i / num_diffusion_timesteps
-        t2 = (i + 1) / num_diffusion_timesteps
-        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
-    return np.array(betas)
-def extract_into_tensor(a, t, x_shape):
-    b, *_ = t.shape
-    out = a.gather(-1, t)
-    return out.reshape(b, *((1, ) * (len(x_shape) - 1)))
 def checkpoint(func, inputs, params, flag):
     """
@@ -227,45 +98,6 @@ def zero_module(module):
         p.detach().zero_()
     return module
-def scale_module(module, scale):
-    """
-    Scale the parameters of a module and return it.
-    """
-    for p in module.parameters():
-        p.detach().mul_(scale)
-    return module
-def mean_flat(tensor):
-    """
-    Take the mean over all non-batch dimensions.
-    """
-    return tensor.mean(dim=list(range(1, len(tensor.shape))))
-def normalization(channels):
-    """
-    Make a standard normalization layer.
-    :param channels: number of input channels.
-    :return: an nn.Module for normalization.
-    """
-    return GroupNorm32(32, channels)
-# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
-class SiLU(nn.Module):
-    def forward(self, x):
-        return x * torch.sigmoid(x)
-class GroupNorm32(nn.GroupNorm):
-    def forward(self, x):
-        return super().forward(x)
 def conv_nd(dims, *args, **kwargs):
     """
     Create a 1D, 2D, or 3D convolution module.
@@ -279,13 +111,6 @@ def conv_nd(dims, *args, **kwargs):
     raise ValueError(f"unsupported dimensions: {dims}")
-def linear(*args, **kwargs):
-    """
-    Create a linear module.
-    """
-    return nn.Linear(*args, **kwargs)
 def avg_pool_nd(dims, *args, **kwargs):
     """
     Create a 1D, 2D, or 3D average pooling module.
@@ -297,24 +122,3 @@ def avg_pool_nd(dims, *args, **kwargs):
     elif dims == 3:
         return nn.AvgPool3d(*args, **kwargs)
     raise ValueError(f"unsupported dimensions: {dims}")
-class HybridConditioner(nn.Module):
-    def __init__(self, c_concat_config, c_crossattn_config):
-        super().__init__()
-        self.concat_conditioner: Any = instantiate_from_config(c_concat_config)
-        self.crossattn_conditioner: Any = instantiate_from_config(
-            c_crossattn_config)
-    def forward(self, c_concat, c_crossattn):
-        c_concat = self.concat_conditioner(c_concat)
-        c_crossattn = self.crossattn_conditioner(c_crossattn)
-        return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]}
-def noise_like(shape, device, repeat=False):
-    repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(
-        shape[0], *((1, ) * (len(shape) - 1)))
-    noise = lambda: torch.randn(shape, device=device)
-    return repeat_noise() if repeat else noise()

 import math
 import torch
 import torch.nn as nn
 from einops import repeat
 def checkpoint(func, inputs, params, flag):
     """
         p.detach().zero_()
     return module
 def conv_nd(dims, *args, **kwargs):
     """
     Create a 1D, 2D, or 3D convolution module.
     raise ValueError(f"unsupported dimensions: {dims}")
 def avg_pool_nd(dims, *args, **kwargs):
     """
     Create a 1D, 2D, or 3D average pooling module.
     elif dims == 3:
         return nn.AvgPool3d(*args, **kwargs)
     raise ValueError(f"unsupported dimensions: {dims}")