Intel
/

Qwen3-Coder-480B-A35B-Instruct-int4-AutoRound

@@ -8,7 +8,8 @@ datasets:
 ## Model Details
-This model is a mixed int4 model with group_size 128 and symmetric quantization of [Qwen/Qwen3-Coder-480B-A35B-Instruct](https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct) generated by [intel/auto-round](https://github.com/intel/auto-round) via **auto-round-light**
 Please follow the license of the original model.
@@ -65,212 +66,6 @@ for i, prompt in enumerate(prompts):
     print(f"Generated: {decoded_outputs[i]}")
     print("-" * 50)
-"""
-Prompt: Write a quick sort algorithm.
-Generated: Here's a Quick Sort implementation in Python:
-```python
-def quicksort(arr):
-    """
-    Quick Sort algorithm implementation
-    Args:
-        arr: List of comparable elements
-    Returns:
-        Sorted list
-    """
-    # Base case: arrays with 0 or 1 element are already sorted
-    if len(arr) <= 1:
-        return arr
-    # Choose pivot (using middle element)
-    pivot = arr[len(arr) // 2]
-    # Partition array into three parts
-    left = [x for x in arr if x < pivot]      # Elements less than pivot
-    middle = [x for x in arr if x == pivot]   # Elements equal to pivot
-    right = [x for x in arr if x > pivot]     # Elements greater than pivot
-    # Recursively sort left and right partitions, then combine
-    return quicksort(left) + middle + quicksort(right)
-# Alternative in-place version (more memory efficient)
-def quicksort_inplace(arr, low=0, high=None):
-    """
-    In-place Quick Sort implementation
-    Args:
-        arr: List to be sorted in-place
-        low: Starting index
-        high: Ending index
-    """
-    if high is None:
-        high = len(arr) - 1
-    if low < high:
-        # Partition the array and get pivot index
-        pivot_index = partition(arr, low, high)
-        # Recursively sort elements before and after partition
-        quicksort_inplace(arr, low, pivot_index - 1)
-        quicksort_inplace(arr, pivot_index + 1, high)
-def partition(arr, low, high):
-    """
-    Partition function for in-place quicksort
-    """
-    # Choose rightmost element as pivot
-    pivot = arr[high]
-    # Index of smaller element (indicates right position of pivot)
-    i = low - 1
-    for j in range(low, high):
-        # If current element is smaller than or equal to pivot
-        if arr[j] <= pivot:
-            i += 1
-            arr[i], arr[j] = arr[j], arr[i]  # Swap elements
-    # Place pivot in correct position
-    arr[i + 1], arr[high] = arr[high], arr[i + 1]
-    return i + 1
-# Example usage
-if __name__ == "__main__":
-    # Test the simple version
-    test_array
---------------------------------------------------
-Prompt: Write a flappy bird.
-Generated: # Flappy Bird in PyGame
-Here's a complete implementation of Flappy Bird using PyGame:
-```python
-import pygame
-import sys
-import random
-# Initialize pygame
-pygame.init()
-# Game constants
-WIDTH, HEIGHT = 400, 600
-FPS = 60
-GRAVITY = 0.25
-FLAP_STRENGTH = -5
-PIPE_SPEED = 3
-PIPE_GAP = 150
-PIPE_FREQUENCY = 1800  # milliseconds
-GROUND_HEIGHT = 100
-BIRD_SIZE = 30
-# Colors
-SKY_BLUE = (113, 197, 207)
-GREEN = (111, 196, 69)
-DARK_GREEN = (76, 145, 65)
-BROWN = (160, 120, 40)
-YELLOW = (255, 221, 45)
-RED = (231, 76, 60)
-WHITE = (255, 255, 255)
-BLACK = (0, 0, 0)
-# Set up the display
-screen = pygame.display.set_mode((WIDTH, HEIGHT))
-pygame.display.set_caption("Flappy Bird")
-clock = pygame.time.Clock()
-# Font setup
-font = pygame.font.SysFont(None, 36)
-small_font = pygame.font.SysFont(None, 24)
-class Bird:
-    def __init__(self):
-        self.x = WIDTH // 3
-        self.y = HEIGHT // 2
-        self.velocity = 0
-        self.alive = True
-        self.rotation = 0
-    def flap(self):
-        self.velocity = FLAP_STRENGTH
-    def update(self):
-        # Apply gravity
-        self.velocity += GRAVITY
-        self.y += self.velocity
-        # Rotate bird based on velocity
-        self.rotation = max(-30, min(self.velocity * 3, 90))
-        # Check if bird hits the ground or ceiling
-        if self.y >= HEIGHT - GROUND_HEIGHT - BIRD_SIZE//2:
-            self.y = HEIGHT - GROUND_HEIGHT - BIRD_SIZE//2
-            self.alive = False
-        if self.y <= 0:
-            self.y = 0
-            self.velocity = 0
-    def draw(self
---------------------------------------------------
-Prompt: Write a llm quantization algorithm.
-Generated: Here's a comprehensive implementation of LLM quantization algorithms, including post-training quantization and QLoRA-style quantization:
-```python
-import torch
-import torch.nn as nn
-from typing import Dict, Tuple, Optional
-import math
-class Quantizer:
-    """Base class for quantization operations"""
-    @staticmethod
-    def symmetric_quantize(tensor: torch.Tensor, bits: int = 8) -> Tuple[torch.Tensor, float]:
-        """
-        Symmetric quantization for weights
-        Returns quantized tensor and scale factor
-        """
-        max_val = tensor.abs().max()
-        scale = max_val / (2 ** (bits - 1) - 1)
-        # Quantize to integer values
-        quantized = torch.round(tensor / scale).clamp(-2**(bits-1), 2**(bits-1)-1)
-        return quantized.to(torch.int8), scale
-    @staticmethod
-    def asymmetric_quantize(tensor: torch.Tensor, bits: int = 8) -> Tuple[torch.Tensor, float, float]:
-        """
-        Asymmetric quantization for activations
-        Returns quantized tensor, scale, and zero point
-        """
-        min_val, max_val = tensor.min(), tensor.max()
-       scale = (max_val - min_val) / (2**bits - 1)
-        zero_point = torch.round(-min_val / scale).clamp(0, 2**bits-1)
-        # Quantize with zero point
-        quantized = torch.round(tensor / scale + zero_point).clamp(0, 2**bits-1)
-        return quantized.to(torch.uint8), scale, zero_point
-    @staticmethod
-    def dequantize(quantized: torch.Tensor, scale: float, zero_point: Optional[float] = None) -> torch.Tensor:
-        """Dequantize tensor back to floating point"""
-        if zero_point is not None:
-            return (quantized.float() - zero_point) * scale
-        else:
-            return quantized.float() * scale
-class NF4Quantizer:
-    """4-bit NormalFloat quantization (NF4)"""
-    def __init__(self):
-        # Pre-defined NF4 values normalized to [-1, 1]
-        self.norm_floats = torch.tensor([
-            -1.0, -0.6962, -0.5251, -0.3949, -0.2844,
---------------------------------------------------
-"""
 ~~~
 ### Generate the model
@@ -307,8 +102,14 @@ for n, m in block.named_modules():
         device_map.update({n: device})
 autoround = AutoRound(
-    model=model, tokenizer=tokenizer, device_map=device_map, nsamples=512,dataset="github-code-clean")
 autoround.quantize_and_save(format="auto_round", output_dir="./Qwen3-Coder-480B-A35B-Instruct-int4")
 ```

 ## Model Details
+This model is a mixed int4 model with group_size 128 and symmetric quantization of [Qwen/Qwen3-Coder-480B-A35B-Instruct](https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct), generated by the [intel/auto-round](https://github.com/intel/auto-round) algorithm.
+The `mlp.gate` layers fall back to 16 bits to ensure the model runs successfully on vLLM.
 Please follow the license of the original model.
     print(f"Generated: {decoded_outputs[i]}")
     print("-" * 50)
 ~~~
 ### Generate the model
         device_map.update({n: device})
+layer_config = {}
+for n, m in model.named_modules():
+    if "mlp.gate" in n:  # vLLM only supports 16-bit precision for this layer
+        layer_config[n] = {"bits": 16}
 autoround = AutoRound(
+    model=model, tokenizer=tokenizer, device_map=device_map, nsamples=512,dataset="github-code-clean", layer_config=layer_config)
 autoround.quantize_and_save(format="auto_round", output_dir="./Qwen3-Coder-480B-A35B-Instruct-int4")
 ```