EthanReid committed
Commit fc5dc52 · 1 Parent(s): 4f59175
Files changed (7)
  1. README.md +12 -1
  2. config.py +1 -0
  3. layers.py +66 -2
  4. model.safetensors +2 -2
  5. moondream.py +15 -9
  6. text.py +43 -87
  7. weights.py +177 -220
README.md CHANGED
@@ -9,6 +9,10 @@ Moondream is a small vision language model designed to run efficiently everywhere.
 
 This repository contains the latest (**2025-04-14**) release of Moondream, as well as [historical releases](https://huggingface.co/vikhyatk/moondream2/blob/main/versions.txt). The model is updated frequently, so we recommend specifying a revision as shown below if you're using it in a production application.
 
+To use **quantized int4**, make sure to install the requirements:
+```
+pip install -r https://depot.moondream.ai/transformers/requirements.txt
+```
 
 ### Usage
 
@@ -16,9 +20,11 @@ This repository contains the latest (**2025-04-14**) release of Moondream, as well
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from PIL import Image
 
+# To run in float16 instead, use revision="2025-04-14".
 model = AutoModelForCausalLM.from_pretrained(
     "vikhyatk/moondream2",
-    revision="2025-04-14",
+    revision="int4_2025-04-14",
+    # revision="2025-04-14",
     trust_remote_code=True,
     # Uncomment to run on GPU.
     # device_map={"": "cuda"}
@@ -50,6 +56,11 @@ print(f"Found {len(points)} person(s)")
 ```
 
 ### Changelog
+**int4-2025-04-15** ([full release notes](https://moondream.ai/blog/moondream-2025-04-14-release))
+1. Moondream uses far less memory (down from 4.12 GB to 2.47 GB)
+2. Small devices get a big speed-up (44.54 to 67.84 tok/sec on an RTX 4050 Mobile)
+3. Improved spatial understanding (RealWorldQA up from 58.3 to 60.13)
+
 
 **2025-04-15** ([full release notes](https://moondream.ai/blog/moondream-2025-04-14-release))
 
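For reference, a minimal end-to-end sketch of loading the new int4 revision and asking a question, assuming the same `query` API shown in the existing usage section (the image path is a placeholder):

```python
from transformers import AutoModelForCausalLM
from PIL import Image

# Assumes the int4 requirements above are installed and a CUDA GPU is available.
model = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2",
    revision="int4_2025-04-14",  # use "2025-04-14" for the float16 release
    trust_remote_code=True,
    device_map={"": "cuda"},     # the quantized bitblas kernels target GPU
)

image = Image.open("example.jpg")  # placeholder path
print(model.query(image, "What is in this image?")["answer"])
```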
 
config.py CHANGED
@@ -12,6 +12,7 @@ class TextConfig:
     n_heads: int = 32
     n_kv_heads: int = 32
     prefix_attn: int = 730
+    group_size: int = 128
 
 
 @dataclass(frozen=True)
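The new `group_size` field controls how many input channels share one quantization scale/zero-point in the int4 linears. As a rough illustration of the bookkeeping this implies (standard group-wise quantization in general, not code from this repo; the `dim` value is an assumption about the text model's hidden size):

```python
dim = 2048        # assumed TextConfig.dim for the text model
group_size = 128  # new TextConfig.group_size

# Each output row of a quantized weight matrix is split into dim // group_size
# groups, and every group stores its own scale and zero-point.
groups_per_row = dim // group_size
print(groups_per_row)  # 16
```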
layers.py CHANGED
@@ -1,7 +1,10 @@
+import bitblas
+import torch
+import torch.nn as nn
+
 from dataclasses import dataclass
 from typing import Literal
-
-import torch
+from bitblas.cache import OperatorCache
 from torch.nn import functional as F
 
 
@@ -15,6 +18,66 @@ class LinearWeights:
     bias: torch.Tensor
 
 
+class Linear(nn.Module):
+    """
+    Linear layer with support for bitblas quantization.
+    If dtype is torch.int8, it uses bitblas for quantization.
+    Otherwise, it uses a standard nn.Linear layer.
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        dtype: torch.dtype = None,
+        operator_cache: OperatorCache = None,
+        cache_dir: str = None,
+        group_size: int = 128,
+    ):
+        super().__init__()
+
+        if dtype == torch.int8:
+            self.linear = bitblas.Linear(
+                in_features=in_features,
+                out_features=out_features,
+                bias=bias,
+                with_zeros=True,
+                zeros_mode="original",
+                with_scaling=True,
+                A_dtype="float16",
+                W_dtype="uint4",
+                accum_dtype="float16",
+                out_dtype="float16",
+                fast_decoding=True,
+                enable_tuning=True,
+                operator_cache=operator_cache,
+                database_path=cache_dir,
+                group_size=group_size,
+            )
+        else:
+            self.linear = nn.Linear(
+                in_features=in_features,
+                out_features=out_features,
+                bias=bias,
+                dtype=torch.float16,
+            )
+
+    def forward(self, x):
+        return self.linear(x)
+
+    @property
+    def weight(self) -> torch.Tensor:
+        try:
+            return self.linear.weight
+        except AttributeError:
+            return self.linear.qweight
+
+    @property
+    def bias(self) -> torch.Tensor:
+        return self.linear.bias
+
+
 def linear(x: torch.Tensor, w: LinearWeights) -> torch.Tensor:
     return F.linear(x, w.weight, w.bias)
 
@@ -37,6 +100,7 @@ class MLPWeights:
 
 
 def mlp(x: torch.Tensor, w: MLPWeights) -> torch.Tensor:
+
     x = w.fc1(x)
     x = gelu_approx(x)
     x = w.fc2(x)
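A minimal sketch of how the new `Linear` wrapper is intended to be used (the wrapper is the class added above; exercising the int8 path requires a CUDA GPU with bitblas installed and may trigger kernel tuning):

```python
import torch
from bitblas.cache import OperatorCache
# Linear here refers to the wrapper defined in layers.py above.

# Float16 path: behaves like a plain nn.Linear.
proj_fp16 = Linear(2048, 2048, dtype=torch.float16)

# Quantized path: torch.int8 acts as a marker that selects the bitblas
# uint4 kernel with per-group scales and zero-points.
proj_int4 = Linear(
    2048,
    2048,
    dtype=torch.int8,
    operator_cache=OperatorCache(),
    cache_dir="./cache",
    group_size=128,
)

x = torch.randn(1, 2048, dtype=torch.float16)
y = proj_fp16(x)  # both variants are called the same way
```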
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:96dce588e4a319fde7af3c70fbf27e726f4850e22522d0fdc4b165d5e6003ad5
-size 3854538376
+oid sha256:5f86cdffeecef5dfab629bf93dbbf3b4ec480d5eaa4ab11b18714b92d76c4303
+size 2080366176
moondream.py CHANGED
@@ -66,12 +66,16 @@ class MoondreamModel(nn.Module):
     def __init__(self, config: MoondreamConfig, dtype=torch.float16, setup_caches=True):
         super().__init__()
         self.config = config
+        self.dtype = dtype
+        self.setup_caches_flag = setup_caches
 
         self.tokenizer = Tokenizer.from_pretrained(
             "vikhyatk/moondream2", revision="2025-01-09"
         )
+
         self.vision = build_vision_model(config.vision, dtype)
-        self.text = build_text_model(config.text, dtype)
+
+        self.text = None
 
         # Region Model
         self.region = nn.ModuleDict(
@@ -125,11 +129,11 @@ class MoondreamModel(nn.Module):
         attn_mask[..., :prefix_attn_len, :prefix_attn_len] = 1
         self.register_buffer("attn_mask", attn_mask, persistent=False)
 
-        # Initialize KV caches.
-        if setup_caches:
-            self._setup_caches()
-
     def _setup_caches(self):
+        """Setup KV caches for the text model"""
+        if self.text is None:
+            return  # Can't set up caches without text model
+
         c = self.config.text
         for b in self.text.blocks:
             b.kv_cache = KVCache(
@@ -163,11 +167,11 @@ class MoondreamModel(nn.Module):
 
     def compile(self):
         # TODO: vision_projection is not being compiled
-        self._vis_enc = torch.compile(self._vis_enc, fullgraph=True)
-        self._prefill = torch.compile(self._prefill, fullgraph=True)
-        self._decode_one_tok = torch.compile(
-            self._decode_one_tok, fullgraph=True, mode="reduce-overhead"
+        self._vis_enc = torch.compile(
+            self._vis_enc, fullgraph=False, mode="reduce-overhead"
         )
+        self._prefill = torch.compile(self._prefill)
+        self._decode_one_tok = torch.compile(self._decode_one_tok)
 
     def _run_vision_encoder(self, image: Image.Image) -> torch.Tensor:
         all_crops, tiling = prepare_crops(image, self.config.vision, device=self.device)
@@ -200,6 +204,7 @@ class MoondreamModel(nn.Module):
 
         # Run through text model in addition to the vision encoder, to minimize
         # re-computation if multiple queries are performed on this image.
+
         with torch.inference_mode():
             img_emb = self._run_vision_encoder(image)
             bos_emb = text_encoder(
@@ -235,6 +240,7 @@ class MoondreamModel(nn.Module):
     def _prefill_prompt(
         self, prompt_tokens: torch.Tensor, pos: int, temperature: float, top_p: float
     ):
+
        with torch.inference_mode():
            prompt_emb = text_encoder(prompt_tokens, self.text)
            torch._dynamo.mark_dynamic(prompt_emb, 1)
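The net effect of the moondream.py changes: `__init__` no longer builds the text model or its KV caches; both are deferred until weights are loaded, because only the checkpoint reveals whether the int4 or float16 text stack is needed. A rough sketch of the resulting flow, assuming this commit's modules are importable as a local package (the import paths are illustrative):

```python
from config import MoondreamConfig
from moondream import MoondreamModel
from weights import load_weights_into_model

model = MoondreamModel(MoondreamConfig(), setup_caches=True)
assert model.text is None  # the text model is now built lazily

# Loading inspects the checkpoint keys, builds the (possibly int4) text model
# via build_text_model, then sets up KV caches since setup_caches_flag is True.
load_weights_into_model("model.safetensors", model)
assert model.text is not None
```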
text.py CHANGED
@@ -2,8 +2,9 @@ import torch
 import torch.nn as nn
 
 from torch.nn import functional as F
+from bitblas.cache import OperatorCache
 
-from .layers import layer_norm, mlp
+from .layers import layer_norm, mlp, Linear
 from .rope import apply_rotary_emb, precompute_freqs_cis
 from .config import TextConfig
 
@@ -26,6 +27,7 @@ def attn(
     head_dim = d_model // n_heads
 
     qkv_out = w.qkv(x)  # shape: (bsz, q_len, (n_heads + 2*n_kv_heads)*head_dim)
+
     q_dim = n_heads * head_dim
     kv_dim = n_kv_heads * head_dim
 
@@ -55,71 +57,6 @@ def attn(
     return out
 
 
-def _attn(
-    x: torch.Tensor,
-    w: torch.Tensor,
-    freqs_cis: torch.Tensor,
-    attn_mask: torch.Tensor,
-    n_heads: int,
-    n_kv_heads: int,
-):
-    bsz, q_len, d_model = x.shape
-    head_dim = d_model // n_heads
-    pos = 0
-
-    qkv_out = w.qkv(x)  # shape: (bsz, q_len, (n_heads + 2*n_kv_heads)*head_dim)
-    q_dim = n_heads * head_dim
-    kv_dim = n_kv_heads * head_dim
-
-    q = qkv_out[..., :q_dim].view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
-    k = (
-        qkv_out[..., q_dim : q_dim + kv_dim]
-        .view(bsz, q_len, n_kv_heads, head_dim)
-        .transpose(1, 2)
-    )
-    v = (
-        qkv_out[..., q_dim + kv_dim :]
-        .view(bsz, q_len, n_kv_heads, head_dim)
-        .transpose(1, 2)
-    )
-
-    position_ids = torch.arange(pos, pos + q_len, dtype=torch.long)
-    q = apply_rotary_emb(q, freqs_cis, position_ids, n_heads)
-    k = apply_rotary_emb(k, freqs_cis, position_ids, n_kv_heads)
-    out = F.scaled_dot_product_attention(
-        q, k, v, attn_mask=attn_mask, enable_gqa=n_heads != n_kv_heads
-    )
-    out = out.transpose(1, 2).reshape(bsz, q_len, d_model)
-    out = w.proj(out)
-    return out
-
-
-def _produce_hidden(inputs_embeds: torch.Tensor, w: nn.Module, config: TextConfig):
-    hidden_BTC = inputs_embeds
-
-    bsz, q_len, d_model = inputs_embeds.shape
-    attn_mask = torch.zeros(q_len, q_len)
-    attn_mask[:730, :730] = 1
-    for i in range(730, q_len):
-        attn_mask[i, : i + 1] = 1
-    attn_mask = attn_mask.to(dtype=torch.bool)
-
-    for i, block in enumerate(w.blocks):
-        l_in = layer_norm(hidden_BTC, block.ln)
-        l_attn = _attn(
-            x=l_in,
-            w=block.attn,
-            freqs_cis=w.freqs_cis,
-            attn_mask=attn_mask,
-            n_heads=config.n_heads,
-            n_kv_heads=config.n_kv_heads,
-        )
-        l_mlp = mlp(l_in, block.mlp)
-        hidden_BTC = hidden_BTC + l_attn + l_mlp
-
-    return hidden_BTC
-
-
 def text_decoder(
     x: torch.Tensor,
     w: nn.Module,
@@ -139,6 +76,7 @@ def text_decoder(
             n_kv_heads=config.n_kv_heads,
             position_ids=position_ids,
         )
+
         l_mlp = mlp(l_in, block.mlp)
         x = x + l_attn + l_mlp
 
@@ -152,38 +90,54 @@ def lm_head(hidden_BTC: torch.Tensor, w: nn.Module):
     return logits
 
 
-def _lm_head(hidden_BTC: torch.Tensor, w: nn.Module):
-    hidden_BTC = layer_norm(hidden_BTC, w.post_ln)
-    logits = w.lm_head(hidden_BTC)
-    return logits
-
-
-def build_text_model(config: TextConfig, dtype: torch.dtype) -> nn.Module:
+def build_text_model(
+    config: TextConfig,
+    linear_dtype: torch.dtype = torch.float16,
+    layernorm_dtype: torch.dtype = torch.float16,
+) -> nn.Module:
+    # note: layernorm_dtype is used for the layernorms, lm_head, and wte, not just the layernorms
+    print(
+        "Initializing quantized backend. This only has to run once, but may take a few minutes."
+    )
     qkv_dim = int(config.dim * (1 + 2 * config.n_kv_heads / config.n_heads))
 
+    operator_cache = None
+    cache_dir = None
+    group_size = None
+    if linear_dtype == torch.int8:
+
+        operator_cache = OperatorCache()
+        cache_dir = "./cache"
+        group_size = config.group_size
+
+    def create_linear(in_features, out_features, dtype=linear_dtype):
+        # factory function for creating Linear layers so we don't have to pass everything repeatedly
+        return Linear(
+            in_features=in_features,
+            out_features=out_features,
+            dtype=dtype,
+            operator_cache=operator_cache,
+            cache_dir=cache_dir,
+            group_size=group_size,
+        )
+
     text = nn.ModuleDict(
         {
             "blocks": nn.ModuleList(
                 [
                     nn.ModuleDict(
                         {
-                            "ln": nn.LayerNorm(config.dim, dtype=dtype),
+                            "ln": nn.LayerNorm(config.dim, dtype=layernorm_dtype),
                             "attn": nn.ModuleDict(
                                 {
-                                    "qkv": nn.Linear(config.dim, qkv_dim, dtype=dtype),
-                                    "proj": nn.Linear(
-                                        config.dim, config.dim, dtype=dtype
-                                    ),
+                                    "qkv": create_linear(config.dim, qkv_dim),
+                                    "proj": create_linear(config.dim, config.dim),
                                 }
                             ),
                             "mlp": nn.ModuleDict(
                                 {
-                                    "fc1": nn.Linear(
-                                        config.dim, config.ff_dim, dtype=dtype
-                                    ),
-                                    "fc2": nn.Linear(
-                                        config.ff_dim, config.dim, dtype=dtype
-                                    ),
+                                    "fc1": create_linear(config.dim, config.ff_dim),
+                                    "fc2": create_linear(config.ff_dim, config.dim),
                                 }
                             ),
                         }
@@ -191,11 +145,13 @@ def build_text_model(config: TextConfig, dtype: torch.dtype) -> nn.Module:
                     for _ in range(config.n_layers)
                 ]
             ),
-            "post_ln": nn.LayerNorm(config.dim, dtype=dtype),
-            "lm_head": nn.Linear(config.dim, config.vocab_size, dtype=dtype),
+            "post_ln": nn.LayerNorm(config.dim, dtype=layernorm_dtype),
+            "lm_head": nn.Linear(config.dim, config.vocab_size, dtype=layernorm_dtype),
         }
     )
-    text.wte = nn.Parameter(torch.empty(config.vocab_size, config.dim, dtype=dtype))
+    text.wte = nn.Parameter(
+        torch.empty(config.vocab_size, config.dim, dtype=layernorm_dtype)
+    )
    text.register_buffer(
        "freqs_cis",
        precompute_freqs_cis(config.dim // (2 * config.n_heads), config.max_context),
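How the two dtype knobs are meant to be used, mirroring the calls made from weights.py below: `linear_dtype=torch.int8` selects the bitblas int4 linears, while the layernorms, `lm_head`, and the embedding table follow `layernorm_dtype`. Note that `build_text_model` is passed the `TextConfig` class itself and reads its class-level defaults. A small sketch, assuming this commit's `text.py` and `config.py` are importable:

```python
import torch
# build_text_model and TextConfig as defined in this commit's text.py / config.py.

# Quantized text stack: every attention/MLP linear uses the bitblas int4 kernel.
text_int4 = build_text_model(
    TextConfig,
    linear_dtype=torch.int8,
    layernorm_dtype=torch.float16,
)

# Full-precision text stack: plain float16 nn.Linear everywhere.
text_fp16 = build_text_model(
    TextConfig,
    linear_dtype=torch.float16,
    layernorm_dtype=torch.float16,
)
```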
weights.py CHANGED
@@ -1,61 +1,25 @@
 import safetensors
 import torch
 import torch.nn as nn
+import re
 
 from contextlib import contextmanager
-from dataclasses import dataclass
 from typing import Callable, List
 
-from .layers import AttentionWeights, LayerNormWeights, LinearWeights, MLPWeights
+from .text import build_text_model
+from .config import TextConfig
 
 
-@dataclass
-class VisionBlock:
-    ln1: LayerNormWeights
-    attn: AttentionWeights
-    ln2: LayerNormWeights
-    mlp: MLPWeights
-
-
-@dataclass
-class VisionModel:
-    patch_emb: LinearWeights
-    pos_emb: torch.Tensor
-    blocks: List[VisionBlock]
-    post_ln: LayerNormWeights
-    proj_mlp: MLPWeights
-
-
-@dataclass
-class TextBlock:
-    ln: LayerNormWeights
-    attn: AttentionWeights
-    mlp: MLPWeights
-
-
-@dataclass
-class TextModel:
-    wte: torch.Tensor
-    blocks: List[TextBlock]
-    post_ln: LayerNormWeights
-    lm_head: LinearWeights
-
-
-@dataclass
-class RegionModel:
-    coord_features: torch.Tensor
-    coord_encoder: LinearWeights
-    coord_decoder: MLPWeights
-    size_features: torch.Tensor
-    size_encoder: LinearWeights
-    size_decoder: MLPWeights
-
-
-@dataclass
-class MoondreamModel:
-    vision: VisionModel
-    text: TextModel
-    region: RegionModel
+# Our custom Linear has a module named linear, so we add "linear" to the key name
+def add_linear_to_key(k: str) -> str:
+    k = k.replace("model.", "")
+    if k.startswith("text.") and ".linear." not in k:
+        k = re.sub(
+            r"(attn\.(?:qkv|proj)|mlp\.fc[12])\.(weight|bias)$",
+            r"\1.linear.\2",
+            k,
+        )
+    return k
 
 
 @contextmanager
@@ -79,199 +43,192 @@ def safetensors_open(safetensors_file: str):
     yield get_tensor
 
 
-def _load_weights(get_tensor: Callable[[str], torch.Tensor], model: nn.Module) -> None:
+def _load_weights(
+    get_tensor: Callable[[str], torch.Tensor],
+    model: nn.Module,
+    is_quantized: bool = False,
+) -> None:
     """Internal function to load weights using a tensor getter function."""
     model = model.to(dtype=torch.float16)
 
-    # Vision Model
-    model.vision["patch_emb"].weight.data.copy_(
-        get_tensor("vision_encoder.encoder.model.visual.patch_embed.linear.weight")
-    )
-    model.vision["patch_emb"].bias.data.copy_(
-        get_tensor("vision_encoder.encoder.model.visual.patch_embed.linear.bias")
-    )
-    model.vision.pos_emb.data.copy_(
-        get_tensor("vision_encoder.encoder.model.visual.pos_embed")
-    )
+    vision = model.vision
+    region = model.region
+
+    weight_map = {
+        "vision_encoder.encoder.model.visual.patch_embed.linear.weight": vision[
+            "patch_emb"
+        ].weight,
+        "vision_encoder.encoder.model.visual.patch_embed.linear.bias": vision[
+            "patch_emb"
+        ].bias,
+        "vision_encoder.encoder.model.visual.pos_embed": vision.pos_emb,
+        "vision_encoder.encoder.model.visual.norm.weight": vision["post_ln"].weight,
+        "vision_encoder.encoder.model.visual.norm.bias": vision["post_ln"].bias,
+        "vision_encoder.projection.mlp.fc1.weight": vision["proj_mlp"]["fc1"].weight,
+        "vision_encoder.projection.mlp.fc1.bias": vision["proj_mlp"]["fc1"].bias,
+        "vision_encoder.projection.mlp.fc2.weight": vision["proj_mlp"]["fc2"].weight,
+        "vision_encoder.projection.mlp.fc2.bias": vision["proj_mlp"]["fc2"].bias,
+        "text_model.transformer.embd.wte.weight": model.text.wte,
+        "text_model.lm_head.ln.weight": model.text["post_ln"].weight,
+        "text_model.lm_head.ln.bias": model.text["post_ln"].bias,
+        "text_model.lm_head.linear.weight": model.text["lm_head"].weight,
+        "text_model.lm_head.linear.bias": model.text["lm_head"].bias,
+        "region_model.coordinate_encoder.weight": region["coord_encoder"].weight,
+        "region_model.coordinate_encoder.bias": region["coord_encoder"].bias,
+        "region_model.coordinate_decoder.fc1.weight": region["coord_decoder"][
+            "fc1"
+        ].weight,
+        "region_model.coordinate_decoder.fc1.bias": region["coord_decoder"]["fc1"].bias,
+        "region_model.coordinate_decoder.fc2.weight": region["coord_decoder"][
+            "fc2"
+        ].weight,
+        "region_model.coordinate_decoder.fc2.bias": region["coord_decoder"]["fc2"].bias,
+        "region_model.size_encoder.weight": region["size_encoder"].weight,
+        "region_model.size_encoder.bias": region["size_encoder"].bias,
+        "region_model.size_decoder.fc1.weight": region["size_decoder"]["fc1"].weight,
+        "region_model.size_decoder.fc1.bias": region["size_decoder"]["fc1"].bias,
+        "region_model.size_decoder.fc2.weight": region["size_decoder"]["fc2"].weight,
+        "region_model.size_decoder.fc2.bias": region["size_decoder"]["fc2"].bias,
+    }
 
     for i in range(len(model.vision["blocks"])):
         prefix = f"vision_encoder.encoder.model.visual.blocks.{i}"
-
-        # Layer norms
-        model.vision["blocks"][i]["ln1"].weight.data.copy_(
-            get_tensor(f"{prefix}.norm1.weight")
-        )
-        model.vision["blocks"][i]["ln1"].bias.data.copy_(
-            get_tensor(f"{prefix}.norm1.bias")
-        )
-        model.vision["blocks"][i]["ln2"].weight.data.copy_(
-            get_tensor(f"{prefix}.norm2.weight")
-        )
-        model.vision["blocks"][i]["ln2"].bias.data.copy_(
-            get_tensor(f"{prefix}.norm2.bias")
-        )
-
-        # Attention
-        model.vision["blocks"][i]["attn"]["qkv"].weight.data.copy_(
-            get_tensor(f"{prefix}.attn.qkv.weight")
-        )
-        model.vision["blocks"][i]["attn"]["qkv"].bias.data.copy_(
-            get_tensor(f"{prefix}.attn.qkv.bias")
-        )
-        model.vision["blocks"][i]["attn"]["proj"].weight.data.copy_(
-            get_tensor(f"{prefix}.attn.proj.weight")
-        )
-        model.vision["blocks"][i]["attn"]["proj"].bias.data.copy_(
-            get_tensor(f"{prefix}.attn.proj.bias")
-        )
-
-        # MLP
-        model.vision["blocks"][i]["mlp"]["fc1"].weight.data.copy_(
-            get_tensor(f"{prefix}.mlp.fc1.weight")
-        )
-        model.vision["blocks"][i]["mlp"]["fc1"].bias.data.copy_(
-            get_tensor(f"{prefix}.mlp.fc1.bias")
-        )
-        model.vision["blocks"][i]["mlp"]["fc2"].weight.data.copy_(
-            get_tensor(f"{prefix}.mlp.fc2.weight")
-        )
-        model.vision["blocks"][i]["mlp"]["fc2"].bias.data.copy_(
-            get_tensor(f"{prefix}.mlp.fc2.bias")
-        )
-
-    model.vision["post_ln"].weight.data.copy_(
-        get_tensor("vision_encoder.encoder.model.visual.norm.weight")
-    )
-    model.vision["post_ln"].bias.data.copy_(
-        get_tensor("vision_encoder.encoder.model.visual.norm.bias")
-    )
-
-    model.vision["proj_mlp"]["fc1"].weight.data.copy_(
-        get_tensor("vision_encoder.projection.mlp.fc1.weight")
-    )
-    model.vision["proj_mlp"]["fc1"].bias.data.copy_(
-        get_tensor("vision_encoder.projection.mlp.fc1.bias")
-    )
-    model.vision["proj_mlp"]["fc2"].weight.data.copy_(
-        get_tensor("vision_encoder.projection.mlp.fc2.weight")
-    )
-    model.vision["proj_mlp"]["fc2"].bias.data.copy_(
-        get_tensor("vision_encoder.projection.mlp.fc2.bias")
-    )
-
-    # Text Model
-    model.text.wte.data.copy_(get_tensor("text_model.transformer.embd.wte.weight"))
-
-    for i in range(len(model.text["blocks"])):
-        prefix = f"text_model.transformer.h.{i}"
-
-        # Layer norm
-        model.text["blocks"][i]["ln"].weight.data.copy_(
-            get_tensor(f"{prefix}.ln.weight")
-        )
-        model.text["blocks"][i]["ln"].bias.data.copy_(get_tensor(f"{prefix}.ln.bias"))
-
-        # Attention
-        model.text["blocks"][i]["attn"]["qkv"].weight.data.copy_(
-            get_tensor(f"{prefix}.mixer.Wqkv.weight")
-        )
-        model.text["blocks"][i]["attn"]["qkv"].bias.data.copy_(
-            get_tensor(f"{prefix}.mixer.Wqkv.bias")
-        )
-        model.text["blocks"][i]["attn"]["proj"].weight.data.copy_(
-            get_tensor(f"{prefix}.mixer.out_proj.weight")
-        )
-        model.text["blocks"][i]["attn"]["proj"].bias.data.copy_(
-            get_tensor(f"{prefix}.mixer.out_proj.bias")
-        )
-
-        # MLP
-        model.text["blocks"][i]["mlp"]["fc1"].weight.data.copy_(
-            get_tensor(f"{prefix}.mlp.fc1.weight")
-        )
-        model.text["blocks"][i]["mlp"]["fc1"].bias.data.copy_(
-            get_tensor(f"{prefix}.mlp.fc1.bias")
-        )
-        model.text["blocks"][i]["mlp"]["fc2"].weight.data.copy_(
-            get_tensor(f"{prefix}.mlp.fc2.weight")
-        )
-        model.text["blocks"][i]["mlp"]["fc2"].bias.data.copy_(
-            get_tensor(f"{prefix}.mlp.fc2.bias")
-        )
-
-    model.text["post_ln"].weight.data.copy_(get_tensor("text_model.lm_head.ln.weight"))
-    model.text["post_ln"].bias.data.copy_(get_tensor("text_model.lm_head.ln.bias"))
-
-    model.text["lm_head"].weight.data.copy_(
-        get_tensor("text_model.lm_head.linear.weight")
-    )
-    model.text["lm_head"].bias.data.copy_(get_tensor("text_model.lm_head.linear.bias"))
-
-    # Region Model
-    model.region.coord_features.data.copy_(
-        get_tensor("region_model.coordinate_features.weight").T
-    )
-    model.region["coord_encoder"].weight.data.copy_(
-        get_tensor("region_model.coordinate_encoder.weight")
-    )
-    model.region["coord_encoder"].bias.data.copy_(
-        get_tensor("region_model.coordinate_encoder.bias")
-    )
-
-    model.region["coord_decoder"]["fc1"].weight.data.copy_(
-        get_tensor("region_model.coordinate_decoder.fc1.weight")
-    )
-    model.region["coord_decoder"]["fc1"].bias.data.copy_(
-        get_tensor("region_model.coordinate_decoder.fc1.bias")
-    )
-    model.region["coord_decoder"]["fc2"].weight.data.copy_(
-        get_tensor("region_model.coordinate_decoder.fc2.weight")
-    )
-    model.region["coord_decoder"]["fc2"].bias.data.copy_(
-        get_tensor("region_model.coordinate_decoder.fc2.bias")
-    )
-
-    model.region.size_features.data.copy_(
-        get_tensor("region_model.size_features.weight").T
-    )
-    model.region["size_encoder"].weight.data.copy_(
-        get_tensor("region_model.size_encoder.weight")
-    )
-    model.region["size_encoder"].bias.data.copy_(
-        get_tensor("region_model.size_encoder.bias")
-    )
-
-    model.region["size_decoder"]["fc1"].weight.data.copy_(
-        get_tensor("region_model.size_decoder.fc1.weight")
-    )
-    model.region["size_decoder"]["fc1"].bias.data.copy_(
-        get_tensor("region_model.size_decoder.fc1.bias")
-    )
-    model.region["size_decoder"]["fc2"].weight.data.copy_(
-        get_tensor("region_model.size_decoder.fc2.weight")
-    )
-    model.region["size_decoder"]["fc2"].bias.data.copy_(
-        get_tensor("region_model.size_decoder.fc2.bias")
-    )
-
-
-def load_weights_from_safetensors(weights_file: str, model: nn.Module) -> None:
-    """Load weights from a safetensors file into a MoondreamModel instance."""
-    with safetensors_open(weights_file) as get_tensor:
-        # Wrap the get_tensor function to handle key normalization
-        name_map = {k.replace("._orig_mod", ""): k for k in get_tensor.keys()}
-        _load_weights(lambda x: get_tensor(name_map[x]).to(dtype=torch.float16), model)
-
-
-def load_weights_from_pt(weights_file: str, model: nn.Module) -> None:
-    """Load weights from a PyTorch file into a MoondreamModel instance."""
-    device = str(torch.empty(0).device)
-    tensors = torch.load(weights_file, map_location=device, weights_only=True)
-    tensors = {
-        k.replace("._orig_mod", ""): v.to(dtype=torch.float16)
-        for k, v in tensors.items()
-    }
-    _load_weights(lambda x: tensors[x], model)
+        blk = model.vision["blocks"][i]
+        weight_map.update(
+            {
+                f"{prefix}.norm1.weight": blk["ln1"].weight,
+                f"{prefix}.norm1.bias": blk["ln1"].bias,
+                f"{prefix}.norm2.weight": blk["ln2"].weight,
+                f"{prefix}.norm2.bias": blk["ln2"].bias,
+                f"{prefix}.attn.qkv.weight": blk["attn"]["qkv"].weight,
+                f"{prefix}.attn.qkv.bias": blk["attn"]["qkv"].bias,
+                f"{prefix}.attn.proj.weight": blk["attn"]["proj"].weight,
+                f"{prefix}.attn.proj.bias": blk["attn"]["proj"].bias,
+                f"{prefix}.mlp.fc1.weight": blk["mlp"]["fc1"].weight,
+                f"{prefix}.mlp.fc1.bias": blk["mlp"]["fc1"].bias,
+                f"{prefix}.mlp.fc2.weight": blk["mlp"]["fc2"].weight,
+                f"{prefix}.mlp.fc2.bias": blk["mlp"]["fc2"].bias,
+            }
+        )
+
+    if not is_quantized:
+        for i in range(len(model.text["blocks"])):
+            prefix = f"text_model.transformer.h.{i}"
+            blk = model.text["blocks"][i]
+            weight_map.update(
+                {
+                    f"{prefix}.ln.weight": blk["ln"].weight,
+                    f"{prefix}.ln.bias": blk["ln"].bias,
+                    f"{prefix}.mixer.Wqkv.weight": blk["attn"]["qkv"].weight,
+                    f"{prefix}.mixer.Wqkv.bias": blk["attn"]["qkv"].bias,
+                    f"{prefix}.mixer.out_proj.weight": blk["attn"]["proj"].weight,
+                    f"{prefix}.mixer.out_proj.bias": blk["attn"]["proj"].bias,
+                    f"{prefix}.mlp.fc1.weight": blk["mlp"]["fc1"].weight,
+                    f"{prefix}.mlp.fc1.bias": blk["mlp"]["fc1"].bias,
+                    f"{prefix}.mlp.fc2.weight": blk["mlp"]["fc2"].weight,
+                    f"{prefix}.mlp.fc2.bias": blk["mlp"]["fc2"].bias,
+                }
+            )
+    else:  # Special quantized path: bitblas stores the quantized weights under .qweight
+        for i in range(len(model.text["blocks"])):
+            prefix = f"text_model.transformer.h.{i}"
+            blk = model.text["blocks"][i]
+            weight_map.update(
+                {
+                    f"{prefix}.ln.qweight": blk["ln"].weight,
+                    f"{prefix}.ln.bias": blk["ln"].bias,
+                    f"{prefix}.mixer.Wqkv.qweight": blk["attn"]["qkv"].weight,
+                    f"{prefix}.mixer.Wqkv.bias": blk["attn"]["qkv"].bias,
+                    f"{prefix}.mixer.out_proj.qweight": blk["attn"]["proj"].weight,
+                    f"{prefix}.mixer.out_proj.bias": blk["attn"]["proj"].bias,
+                    f"{prefix}.mlp.fc1.qweight": blk["mlp"]["fc1"].weight,
+                    f"{prefix}.mlp.fc1.bias": blk["mlp"]["fc1"].bias,
+                    f"{prefix}.mlp.fc2.qweight": blk["mlp"]["fc2"].weight,
+                    f"{prefix}.mlp.fc2.bias": blk["mlp"]["fc2"].bias,
+                }
+            )
+
+    for key, tensor in weight_map.items():
+        tensor.data.copy_(get_tensor(key))
+
+    region.coord_features.data.copy_(
+        get_tensor("region_model.coordinate_features.weight").T
+    )
+    region.size_features.data.copy_(get_tensor("region_model.size_features.weight").T)
+
+
+def load_weights_from_safetensors(weights_file: str, model: nn.Module) -> None:
+    """Load weights from a safetensors file into a MoondreamModel instance."""
+    with safetensors_open(weights_file) as get_tensor:
+        all_keys = get_tensor.keys()
+
+        is_quantized = any(
+            ".qweight" in key or "_quantized" in key or "quant." in key
+            for key in all_keys
+        )
+
+        if "text_model.transformer.h.0.ln.weight" in all_keys:
+            layernorm_dtype = get_tensor("text_model.transformer.h.0.ln.weight").dtype
+        else:
+            layernorm_dtype = torch.float16
+
+        linear_dtype = torch.int8 if is_quantized else torch.float16
+
+        model.text = build_text_model(
+            TextConfig, linear_dtype=linear_dtype, layernorm_dtype=layernorm_dtype
+        )
+        if model.setup_caches_flag:
+            model._setup_caches()
+
+        if (
+            "vision.blocks.0.attn.proj.bias" in all_keys
+            or "model.vision.blocks.0.attn.proj.bias" in all_keys
+        ):
+            with safetensors_open(weights_file) as get_tensor:
+                tensors = {add_linear_to_key(k): get_tensor(k) for k in all_keys}
+            model.load_state_dict(tensors, strict=False)
+        else:
+            # Wrap the get_tensor function to handle key normalization
+            name_map = {k.replace("._orig_mod", ""): k for k in all_keys}
+            _load_weights(
+                lambda x: get_tensor(name_map[x]).to(dtype=torch.float16),
+                model,
+                is_quantized,
+            )
+
+
+def load_weights_from_pt(weights_file: str, model: nn.Module) -> None:
+    """Load weights from a PyTorch file into a MoondreamModel instance."""
+    tensors = torch.load(weights_file, map_location="cpu", weights_only=True)
+    all_keys = tensors.keys()
+    is_quantized = any(
+        ".qweight" in key or "_quantized" in key or "quant." in key for key in all_keys
+    )
+
+    if "text.blocks.0.ln.weight" in all_keys:
+        layernorm_dtype = tensors["text.blocks.0.ln.weight"].dtype
+    else:
+        layernorm_dtype = torch.float16
+
+    linear_dtype = torch.int8 if is_quantized else torch.float16
+    model.text = build_text_model(
+        TextConfig, linear_dtype=linear_dtype, layernorm_dtype=layernorm_dtype
+    )
+    if model.setup_caches_flag:
+        model._setup_caches()
+
+    if (
+        "vision.blocks.0.attn.proj.bias" in all_keys
+        or "model.vision.blocks.0.attn.proj.bias" in all_keys
+    ):
+        tensors = {add_linear_to_key(k): v for k, v in tensors.items()}
+        model.load_state_dict(tensors, strict=False)
+    else:
+        tensors = {
+            k.replace("._orig_mod", ""): v.to(dtype=torch.float16)
+            for k, v in tensors.items()
+        }
+        _load_weights(lambda x: tensors[x], model, is_quantized)
 
 
 def load_weights_into_model(weights_file: str, model: nn.Module) -> None:
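To make the `add_linear_to_key` rewrite concrete, here is what it does to a few representative state-dict keys (a quick standalone check using a copy of the helper exactly as committed):

```python
import re

def add_linear_to_key(k: str) -> str:
    # Copy of the helper from this commit, reproduced for illustration.
    k = k.replace("model.", "")
    if k.startswith("text.") and ".linear." not in k:
        k = re.sub(
            r"(attn\.(?:qkv|proj)|mlp\.fc[12])\.(weight|bias)$",
            r"\1.linear.\2",
            k,
        )
    return k

print(add_linear_to_key("model.text.blocks.0.attn.qkv.weight"))
# text.blocks.0.attn.qkv.linear.weight   -> points inside the new Linear wrapper
print(add_linear_to_key("model.text.blocks.0.ln.weight"))
# text.blocks.0.ln.weight                -> layernorm keys are left untouched
print(add_linear_to_key("model.vision.blocks.0.attn.proj.weight"))
# vision.blocks.0.attn.proj.weight       -> non-text keys are not rewritten
```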