fix mvdream
- .gitignore +1 -0
- README.md +3 -0
- convert_mvdream_to_diffusers.py +6 -0
- mvdream/attention.py +1 -4
- mvdream/models.py +9 -23
- mvdream/util.py +0 -9
    	
.gitignore CHANGED

@@ -4,4 +4,5 @@
 *.pyc
 
 weights
+models
 sd-v2*
    	
README.md CHANGED

@@ -4,6 +4,9 @@ modified from https://github.com/KokeCacao/mvdream-hf.
 
 ### convert weights
 ```bash
+# dependency
+pip install -U omegaconf diffusers safetensors huggingface_hub transformers accelerate
+
 # download original ckpt
 wget https://huggingface.co/MVDream/MVDream/resolve/main/sd-v2.1-base-4view.pt
 wget https://raw.githubusercontent.com/bytedance/MVDream/main/mvdream/configs/sd-v2-base.yaml
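With the checkpoint and config downloaded, the conversion entry point is `convert_from_original_mvdream_ckpt` in `convert_mvdream_to_diffusers.py`. Below is a minimal sketch of calling it directly, assuming both files sit in the working directory; the function takes at least `checkpoint_path` and `original_config_file` (its remaining parameters are truncated in the hunk header below, so they are omitted here):

```python
# Sketch: invoke the converter directly. Further keyword arguments
# (e.g. a device) may be required by the actual signature.
from convert_mvdream_to_diffusers import convert_from_original_mvdream_ckpt

convert_from_original_mvdream_ckpt(
    checkpoint_path="sd-v2.1-base-4view.pt",
    original_config_file="sd-v2-base.yaml",
)
```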
    	
convert_mvdream_to_diffusers.py CHANGED

@@ -405,6 +405,12 @@ def convert_from_original_mvdream_ckpt(checkpoint_path, original_config_file, de
     # )
     # print(f"Unet Config: {original_config.model.params.unet_config.params}")
     unet_config = create_unet_config(original_config)
+
+    # remove unused configs
+    del unet_config['legacy']
+    del unet_config['use_linear_in_transformer']
+    del unet_config['use_spatial_transformer']
+
     unet = MultiViewUNetModel(**unet_config)
     unet.register_to_config(**unet_config)
     # print(f"Unet State Dict: {unet.state_dict().keys()}")
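The three deleted keys come from the original LDM-style UNet config but are not parameters that `MultiViewUNetModel` consumes, so stripping them keeps them out of the registered diffusers config; the `**kwargs` added to `MultiViewUNetModel.__init__` in `mvdream/models.py` below complements this by absorbing any other unrecognized entries instead of raising `TypeError`. A slightly more defensive variant of the cleanup (a sketch, not the committed code) uses `pop` with a default so an already-absent key is not an error:

```python
# Sketch: pop() with a default does not raise KeyError
# if a key is already missing from the config dict.
for key in ("legacy", "use_linear_in_transformer", "use_spatial_transformer"):
    unet_config.pop(key, None)
```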
    	
mvdream/attention.py CHANGED

@@ -1,6 +1,3 @@
-# obtained and modified from https://github.com/bytedance/MVDream
-
-import math
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -14,9 +11,9 @@ from .util import checkpoint, zero_module
 try:
     import xformers  # type: ignore
     import xformers.ops  # type: ignore
-
     XFORMERS_IS_AVAILBLE = True
 except:
+    print(f'[WARN] xformers is unavailable!')
     XFORMERS_IS_AVAILBLE = False
 
 # CrossAttn precision handling
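The new warning makes the previously silent fallback visible. Note that the bare `except:` also swallows failures other than a missing package; a narrower variant (a sketch, not the committed code) would catch `ImportError` specifically:

```python
try:
    import xformers  # type: ignore
    import xformers.ops  # type: ignore
    XFORMERS_IS_AVAILBLE = True
except ImportError:
    # Only a missing or broken install lands here; other errors propagate.
    print('[WARN] xformers is unavailable!')
    XFORMERS_IS_AVAILBLE = False
```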
    	
mvdream/models.py CHANGED

@@ -1,6 +1,3 @@
-# obtained and modified from https://github.com/bytedance/MVDream
-
-import math
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -9,7 +6,6 @@ from diffusers.models.modeling_utils import ModelMixin
 from typing import Any, List, Optional
 from torch import Tensor
 
-from abc import abstractmethod
 from .util import (
     checkpoint,
     conv_nd,
@@ -19,19 +15,8 @@ from .util import (
 )
 from .attention import SpatialTransformer, SpatialTransformer3D
 
-class TimestepBlock(nn.Module):
-    """
-    Any module where forward() takes timestep embeddings as a second argument.
-    """
 
-
-    def forward(self, x, emb):
-        """
-        Apply the module to `x` given `emb` timestep embeddings.
-        """
-
-
-class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
+class CondSequential(nn.Sequential):
     """
     A sequential module that passes timestep embeddings to the children that
     support it as an extra input.
@@ -39,7 +24,7 @@ class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
 
     def forward(self, x, emb, context=None, num_frames=1):
         for layer in self:
-            if isinstance(layer, TimestepBlock):
+            if isinstance(layer, ResBlock):
                 x = layer(x, emb)
             elif isinstance(layer, SpatialTransformer3D):
                 x = layer(x, context, num_frames=num_frames)
@@ -117,7 +102,7 @@ class Downsample(nn.Module):
         return self.op(x)
 
 
-class ResBlock(TimestepBlock):
+class ResBlock(nn.Module):
     """
     A residual block that can optionally change the number of channels.
     :param channels: the number of input channels.
@@ -289,6 +274,7 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
         disable_middle_self_attn=False,
         adm_in_channels=None,
         camera_dim=None,
+        **kwargs,
     ):
         super().__init__()
         assert context_dim is not None
@@ -383,7 +369,7 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
 
         self.input_blocks = nn.ModuleList(
             [
-                TimestepEmbedSequential(
+                CondSequential(
                     conv_nd(dims, in_channels, model_channels, 3, padding=1)
                 )
             ]
@@ -430,13 +416,13 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
                             use_checkpoint=use_checkpoint,
                         )
                    )
-                self.input_blocks.append(TimestepEmbedSequential(*layers))
+                self.input_blocks.append(CondSequential(*layers))
                 self._feature_size += ch
                 input_block_chans.append(ch)
             if level != len(channel_mult) - 1:
                 out_ch = ch
                 self.input_blocks.append(
-                    TimestepEmbedSequential(
+                    CondSequential(
                         ResBlock(
                             ch,
                             time_embed_dim,
@@ -464,7 +450,7 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
             num_heads = ch // num_head_channels
             dim_head = num_head_channels
 
-        self.middle_block = TimestepEmbedSequential(
+        self.middle_block = CondSequential(
             ResBlock(
                 ch,
                 time_embed_dim,
@@ -550,7 +536,7 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
                     else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
                 )
                 ds //= 2
-            self.output_blocks.append(TimestepEmbedSequential(*layers))
+            self.output_blocks.append(CondSequential(*layers))
             self._feature_size += ch
 
         self.out = nn.Sequential(
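`CondSequential` is the renamed `TimestepEmbedSequential` with the abstract `TimestepBlock` indirection removed: dispatch now keys on the concrete `ResBlock` type. A self-contained toy illustrating the dispatch pattern (the stand-in modules and shapes are invented for illustration; the real `ResBlock` and `SpatialTransformer3D` have richer constructors):

```python
import torch
import torch.nn as nn

class ToyResBlock(nn.Module):
    # Stands in for ResBlock: consumes the timestep embedding.
    def forward(self, x, emb):
        return x + emb

class ToyTransformer3D(nn.Module):
    # Stands in for SpatialTransformer3D: consumes context and num_frames.
    def forward(self, x, context, num_frames=1):
        return x if context is None else x + context

class CondSequential(nn.Sequential):
    # Same dispatch shape as the class in the diff above.
    def forward(self, x, emb, context=None, num_frames=1):
        for layer in self:
            if isinstance(layer, ToyResBlock):
                x = layer(x, emb)
            elif isinstance(layer, ToyTransformer3D):
                x = layer(x, context, num_frames=num_frames)
            else:
                x = layer(x)
        return x

block = CondSequential(ToyResBlock(), nn.Identity(), ToyTransformer3D())
out = block(torch.ones(2, 4), torch.zeros(2, 4), context=None, num_frames=4)
print(out.shape)  # torch.Size([2, 4])
```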
    	
mvdream/util.py CHANGED

@@ -1,12 +1,3 @@
-# adopted from
-# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-# and
-# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
-# and
-# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
-#
-# thanks!
-
 import math
 import torch
 import torch.nn as nn

