further clean!

- main.py +2 -2
- mvdream/attention.py +13 -85
- mvdream/models.py +12 -176
main.py

@@ -5,8 +5,8 @@ import argparse
 from mvdream.pipeline_mvdream import MVDreamStableDiffusionPipeline
 
 pipe = MVDreamStableDiffusionPipeline.from_pretrained(
-
-    "ashawkey/mvdream-sd2.1-diffusers",
+    "./weights", # local weights
+    # "ashawkey/mvdream-sd2.1-diffusers",
     torch_dtype=torch.float16
 )
 pipe = pipe.to("cuda")
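A minimal usage sketch for the updated entry point, assuming the repository layout from this commit (weights under ./weights). The automatic fallback to the Hub checkpoint is an added convenience for illustration, not part of the commit itself:

import os
import torch
from mvdream.pipeline_mvdream import MVDreamStableDiffusionPipeline

# Prefer the local weights directory introduced above; fall back to the
# published Hub checkpoint if it is missing (hypothetical fallback).
model_id = "./weights" if os.path.isdir("./weights") else "ashawkey/mvdream-sd2.1-diffusers"

pipe = MVDreamStableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")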
mvdream/attention.py

@@ -2,14 +2,14 @@
 
 import math
 import torch
+import torch.nn as nn
 import torch.nn.functional as F
+from torch.amp.autocast_mode import autocast
 
 from inspect import isfunction
-from torch import nn, einsum
-from torch.amp.autocast_mode import autocast
 from einops import rearrange, repeat
 from typing import Optional, Any
-from .util import checkpoint
+from .util import checkpoint, zero_module
 
 try:
     import xformers  # type: ignore

@@ -25,28 +25,12 @@ import os
 _ATTN_PRECISION = os.environ.get("ATTN_PRECISION", "fp32")
 
 
-def uniq(arr):
-    return {el: True for el in arr}.keys()
-
-
 def default(val, d):
     if val is not None:
         return val
     return d() if isfunction(d) else d
 
 
-def max_neg_value(t):
-    return -torch.finfo(t.dtype).max
-
-
-def init_(tensor):
-    dim = tensor.shape[-1]
-    std = 1 / math.sqrt(dim)
-    tensor.uniform_(-std, std)
-    return tensor
-
-
-# feedforward
 class GEGLU(nn.Module):
     def __init__(self, dim_in, dim_out):
         super().__init__()

@@ -76,66 +60,6 @@ class FeedForward(nn.Module):
         return self.net(x)
 
 
-def zero_module(module):
-    """
-    Zero out the parameters of a module and return it.
-    """
-    for p in module.parameters():
-        p.detach().zero_()
-    return module
-
-
-def Normalize(in_channels):
-    return torch.nn.GroupNorm(
-        num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
-    )
-
-
-class SpatialSelfAttention(nn.Module):
-    def __init__(self, in_channels):
-        super().__init__()
-        self.in_channels = in_channels
-
-        self.norm = Normalize(in_channels)
-        self.q = torch.nn.Conv2d(
-            in_channels, in_channels, kernel_size=1, stride=1, padding=0
-        )
-        self.k = torch.nn.Conv2d(
-            in_channels, in_channels, kernel_size=1, stride=1, padding=0
-        )
-        self.v = torch.nn.Conv2d(
-            in_channels, in_channels, kernel_size=1, stride=1, padding=0
-        )
-        self.proj_out = torch.nn.Conv2d(
-            in_channels, in_channels, kernel_size=1, stride=1, padding=0
-        )
-
-    def forward(self, x):
-        h_ = x
-        h_ = self.norm(h_)
-        q = self.q(h_)
-        k = self.k(h_)
-        v = self.v(h_)
-
-        # compute attention
-        b, c, h, w = q.shape
-        q = rearrange(q, "b c h w -> b (h w) c")
-        k = rearrange(k, "b c h w -> b c (h w)")
-        w_ = torch.einsum("bij,bjk->bik", q, k)
-
-        w_ = w_ * (int(c) ** (-0.5))
-        w_ = torch.nn.functional.softmax(w_, dim=2)
-
-        # attend to values
-        v = rearrange(v, "b c h w -> b c (h w)")
-        w_ = rearrange(w_, "b i j -> b j i")
-        h_ = torch.einsum("bij,bjk->bik", v, w_)
-        h_ = rearrange(h_, "b c (h w) -> b c h w", h=h)
-        h_ = self.proj_out(h_)
-
-        return x + h_
-
-
 class CrossAttention(nn.Module):
     def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
         super().__init__()

@@ -167,9 +91,9 @@ class CrossAttention(nn.Module):
         if _ATTN_PRECISION == "fp32":
             with autocast(enabled=False, device_type="cuda"):
                 q, k = q.float(), k.float()
-                sim = einsum("b i d, b j d -> b i j", q, k) * self.scale
+                sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale
         else:
-            sim = einsum("b i d, b j d -> b i j", q, k) * self.scale
+            sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale
 
         del q, k
 

@@ -182,7 +106,7 @@ class CrossAttention(nn.Module):
         # attention, what we cannot get enough of
         sim = sim.softmax(dim=-1)
 
-        out = einsum("b i j, b j d -> b i d", sim, v)
+        out = torch.einsum("b i j, b j d -> b i d", sim, v)
         out = rearrange(out, "(b h) n d -> b n (h d)", h=h)
         return self.to_out(out)
 

@@ -326,7 +250,9 @@ class SpatialTransformer(nn.Module):
             context_dim = [context_dim]
         self.in_channels = in_channels
         inner_dim = n_heads * d_head
-        self.norm = Normalize(in_channels)
+        self.norm = nn.GroupNorm(
+            num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
+        )
         if not use_linear:
             self.proj_in = nn.Conv2d(
                 in_channels, inner_dim, kernel_size=1, stride=1, padding=0

@@ -410,7 +336,7 @@ class SpatialTransformer3D(nn.Module):
         dropout=0.0,
         context_dim=None,
         disable_self_attn=False,
-        use_linear=False,
+        use_linear=True,
         use_checkpoint=True,
     ):
         super().__init__()

@@ -419,7 +345,9 @@ class SpatialTransformer3D(nn.Module):
             context_dim = [context_dim]
         self.in_channels = in_channels
         inner_dim = n_heads * d_head
-        self.norm = Normalize(in_channels)
+        self.norm = nn.GroupNorm(
+            num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
+        )
         if not use_linear:
             self.proj_in = nn.Conv2d(
                 in_channels, inner_dim, kernel_size=1, stride=1, padding=0
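The deleted Normalize helper was only a thin wrapper around torch.nn.GroupNorm, which is why the two transformer classes can now construct the norm inline. A minimal sketch of the equivalence, with an illustrative channel count:

import torch.nn as nn

in_channels = 320  # illustrative value

# Old: self.norm = Normalize(in_channels)
# New: the GroupNorm is built directly with the same arguments.
norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)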
mvdream/models.py

@@ -1,8 +1,7 @@
 # obtained and modified from https://github.com/bytedance/MVDream
 
 import math
-import
-import torch as th
+import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from diffusers.configuration_utils import ConfigMixin

@@ -223,7 +222,7 @@ class ResBlock(TimestepBlock):
             emb_out = emb_out[..., None]
         if self.use_scale_shift_norm:
             out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
-            scale, shift = th.chunk(emb_out, 2, dim=1)
+            scale, shift = torch.chunk(emb_out, 2, dim=1)
             h = out_norm(h) * (1 + scale) + shift
             h = out_rest(h)
         else:

@@ -232,112 +231,6 @@ class ResBlock(TimestepBlock):
         return self.skip_connection(x) + h
 
 
-class AttentionBlock(nn.Module):
-    """
-    An attention block that allows spatial positions to attend to each other.
-    Originally ported from here, but adapted to the N-d case.
-    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
-    """
-
-    def __init__(
-        self,
-        channels,
-        num_heads=1,
-        num_head_channels=-1,
-        use_checkpoint=False,
-        use_new_attention_order=False,
-    ):
-        super().__init__()
-        self.channels = channels
-        if num_head_channels == -1:
-            self.num_heads = num_heads
-        else:
-            assert (
-                channels % num_head_channels == 0
-            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
-            self.num_heads = channels // num_head_channels
-        self.use_checkpoint = use_checkpoint
-        self.norm = nn.GroupNorm(32, channels)
-        self.qkv = conv_nd(1, channels, channels * 3, 1)
-        if use_new_attention_order:
-            # split qkv before split heads
-            self.attention = QKVAttention(self.num_heads)
-        else:
-            # split heads before split qkv
-            self.attention = QKVAttentionLegacy(self.num_heads)
-
-        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
-
-    def forward(self, x):
-        return checkpoint(self._forward, (x,), self.parameters(), True)
-
-    def _forward(self, x):
-        b, c, *spatial = x.shape
-        x = x.reshape(b, c, -1)
-        qkv = self.qkv(self.norm(x))
-        h = self.attention(qkv)
-        h = self.proj_out(h)
-        return (x + h).reshape(b, c, *spatial)
-
-
-class QKVAttentionLegacy(nn.Module):
-    """
-    A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping
-    """
-
-    def __init__(self, n_heads):
-        super().__init__()
-        self.n_heads = n_heads
-
-    def forward(self, qkv):
-        """
-        Apply QKV attention.
-        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
-        :return: an [N x (H * C) x T] tensor after attention.
-        """
-        bs, width, length = qkv.shape
-        assert width % (3 * self.n_heads) == 0
-        ch = width // (3 * self.n_heads)
-        q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
-        scale = 1 / math.sqrt(math.sqrt(ch))
-        weight = th.einsum(
-            "bct,bcs->bts", q * scale, k * scale
-        )  # More stable with f16 than dividing afterwards
-        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
-        a = th.einsum("bts,bcs->bct", weight, v)
-        return a.reshape(bs, -1, length)
-
-
-class QKVAttention(nn.Module):
-    """
-    A module which performs QKV attention and splits in a different order.
-    """
-
-    def __init__(self, n_heads):
-        super().__init__()
-        self.n_heads = n_heads
-
-    def forward(self, qkv):
-        """
-        Apply QKV attention.
-        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
-        :return: an [N x (H * C) x T] tensor after attention.
-        """
-        bs, width, length = qkv.shape
-        assert width % (3 * self.n_heads) == 0
-        ch = width // (3 * self.n_heads)
-        q, k, v = qkv.chunk(3, dim=1)
-        scale = 1 / math.sqrt(math.sqrt(ch))
-        weight = th.einsum(
-            "bct,bcs->bts",
-            (q * scale).view(bs * self.n_heads, ch, length),
-            (k * scale).view(bs * self.n_heads, ch, length),
-        )  # More stable with f16 than dividing afterwards
-        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
-        a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
-        return a.reshape(bs, -1, length)
-
-
 class MultiViewUNetModel(ModelMixin, ConfigMixin):
     """
     The full multi-view UNet model with attention, timestep embedding and camera embedding.

@@ -388,34 +281,18 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
         num_heads_upsample=-1,
         use_scale_shift_norm=False,
         resblock_updown=False,
-        use_new_attention_order=False,
-        use_spatial_transformer=False,  # custom transformer support
         transformer_depth=1,  # custom transformer support
         context_dim=None,  # custom transformer support
         n_embed=None,  # custom support for prediction of discrete ids into codebook of first stage vq model
-        legacy=True,
         disable_self_attentions=None,
         num_attention_blocks=None,
         disable_middle_self_attn=False,
-        use_linear_in_transformer=False,
         adm_in_channels=None,
        camera_dim=None,
     ):
         super().__init__()
-        if use_spatial_transformer:
-            assert (
-                context_dim is not None
-            ), "Fool!! You forgot to include the dimension of your cross-attention conditioning..."
-
-        if context_dim is not None:
-            assert (
-                use_spatial_transformer
-            ), "Fool!! You forgot to use the spatial transformer for your cross-attention conditioning..."
-            from omegaconf.listconfig import ListConfig
-
-            if type(context_dim) == ListConfig:
-                context_dim = list(context_dim)
-
+        assert context_dim is not None
+
         if num_heads_upsample == -1:
             num_heads_upsample = num_heads
 

@@ -535,13 +412,7 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
                     else:
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
-                    if legacy:
-                        # num_heads = 1
-                        dim_head = (
-                            ch // num_heads
-                            if use_spatial_transformer
-                            else num_head_channels
-                        )
+
                    if disable_self_attentions is not None:
                        disabled_sa = disable_self_attentions[level]
                    else:

@@ -549,22 +420,13 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
 
                    if num_attention_blocks is None or nr < num_attention_blocks[level]:
                        layers.append(
-                            AttentionBlock(
-                                ch,
-                                use_checkpoint=use_checkpoint,
-                                num_heads=num_heads,
-                                num_head_channels=dim_head,
-                                use_new_attention_order=use_new_attention_order,
-                            )
-                            if not use_spatial_transformer
-                            else SpatialTransformer3D(
+                            SpatialTransformer3D(
                                ch,
                                num_heads,
                                dim_head,
                                depth=transformer_depth,
                                context_dim=context_dim,
                                disable_self_attn=disabled_sa,
-                                use_linear=use_linear_in_transformer,
                                use_checkpoint=use_checkpoint,
                            )
                        )

@@ -601,9 +463,7 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
         else:
            num_heads = ch // num_head_channels
            dim_head = num_head_channels
-        if legacy:
-            # num_heads = 1
-            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+
         self.middle_block = TimestepEmbedSequential(
            ResBlock(
                ch,

@@ -613,24 +473,15 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
-            AttentionBlock(
-                ch,
-                use_checkpoint=use_checkpoint,
-                num_heads=num_heads,
-                num_head_channels=dim_head,
-                use_new_attention_order=use_new_attention_order,
-            )
-            if not use_spatial_transformer
-            else SpatialTransformer3D(
+            SpatialTransformer3D(
                ch,
                num_heads,
                dim_head,
                depth=transformer_depth,
                context_dim=context_dim,
                disable_self_attn=disable_middle_self_attn,
-                use_linear=use_linear_in_transformer,
                use_checkpoint=use_checkpoint,
-            ),
+            ),
            ResBlock(
                ch,
                time_embed_dim,

@@ -664,13 +515,7 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
                     else:
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
-                    if legacy:
-                        # num_heads = 1
-                        dim_head = (
-                            ch // num_heads
-                            if use_spatial_transformer
-                            else num_head_channels
-                        )
+
                    if disable_self_attentions is not None:
                        disabled_sa = disable_self_attentions[level]
                    else:

@@ -678,22 +523,13 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
 
                    if num_attention_blocks is None or i < num_attention_blocks[level]:
                        layers.append(
-                            AttentionBlock(
-                                ch,
-                                use_checkpoint=use_checkpoint,
-                                num_heads=num_heads_upsample,
-                                num_head_channels=dim_head,
-                                use_new_attention_order=use_new_attention_order,
-                            )
-                            if not use_spatial_transformer
-                            else SpatialTransformer3D(
+                            SpatialTransformer3D(
                                ch,
                                num_heads,
                                dim_head,
                                depth=transformer_depth,
                                context_dim=context_dim,
                                disable_self_attn=disabled_sa,
-                                use_linear=use_linear_in_transformer,
                                use_checkpoint=use_checkpoint,
                            )
                        )

@@ -777,7 +613,7 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
             hs.append(h)
         h = self.middle_block(h, emb, context, num_frames=num_frames)
         for module in self.output_blocks:
-            h = th.cat([h, hs.pop()], dim=1)
+            h = torch.cat([h, hs.pop()], dim=1)
             h = module(h, emb, context, num_frames=num_frames)
         h = h.type(x.dtype)
         if self.predict_codebook_ids:
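With use_spatial_transformer and the AttentionBlock/QKVAttention classes removed, every attention site in MultiViewUNetModel now builds a SpatialTransformer3D directly. A construction sketch using the parameter names visible in the hunks above; the numeric values are illustrative only and not taken from the commit:

from mvdream.attention import SpatialTransformer3D

block = SpatialTransformer3D(
    in_channels=320,          # ch at this resolution
    n_heads=8,                # num_heads
    d_head=40,                # dim_head = ch // num_heads
    depth=1,                  # transformer_depth
    context_dim=1024,         # cross-attention conditioning dim (now asserted non-None)
    disable_self_attn=False,
    use_checkpoint=True,
)
# use_linear now defaults to True, so the Conv2d projection branch shown above is skipped.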