gpt-oss-mxfp4 / README.md

Upload folder using huggingface_hub

27b6ad8 verified 20 days ago

5.73 kB

	---
	library_name: transformers
	pipeline_tag: text-generation
	inference: true
	widget:
	- text: Hello!
	  example_title: Hello world
	  group: Python
	base_model:
	- openai/gpt-oss-120b
	---

	This tiny model is for debugging only. It is randomly initialized, using a config adapted from [openai/gpt-oss-120b](https://huggingface.co/openai/gpt-oss-120b).

	Note: This model uses quantized MXFP4 FFN weights. To run it, install the Triton kernels: `pip install -U triton git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels`

	### Example usage:

	- vLLM

	```bash
	# Start a vLLM server for this model.
	vllm serve tiny-random/gpt-oss-mxfp4
	```

	- Transformers

	```python
	# Minimal chat-style generation example using the Transformers pipeline API.
	import torch
	from transformers import pipeline

	model_id = "tiny-random/gpt-oss-mxfp4"

	# Build a text-generation pipeline on the GPU; torch_dtype='auto' leaves the
	# dtype choice to the library.
	generator = pipeline("text-generation", model=model_id, torch_dtype='auto', device_map="cuda")

	# A single-turn conversation in chat-message format.
	messages = [{"role": "user", "content": "Explain quantum mechanics clearly and concisely."}]

	result = generator(messages, max_new_tokens=16)

	# Print the last entry of the returned conversation (presumably the model's reply).
	print(result[0]["generated_text"][-1])
	```

	### Code to create this repo:

	```python
	# Script that builds the tiny random MXFP4 checkpoint in `save_folder`:
	#   1. copy the processor of the source model,
	#   2. shrink the source config and write it without its quantization settings,
	#   3. create a randomly initialized bf16 model from that config and save it,
	#   4. replace the expert FFN weights with random uint8 "blocks"/"scales" tensors,
	#   5. write the quantization settings back into config.json.
	import json

	# Import the submodule explicitly: a bare `import safetensors` does not guarantee
	# that `safetensors.torch` is loaded.
	import safetensors.torch
	import torch
	from huggingface_hub import hf_hub_download
	from transformers import (
	    AutoConfig,
	    AutoModelForCausalLM,
	    AutoProcessor,
	    AutoTokenizer,
	    GenerationConfig,
	    GptOssForCausalLM,
	    pipeline,
	    set_seed,
	)

	source_model_id = "openai/gpt-oss-120b"
	save_folder = "/tmp/tiny-random/gpt-oss-mxfp4"

	# Copy the processor files of the source model unchanged.
	processor = AutoProcessor.from_pretrained(source_model_id)
	processor.save_pretrained(save_folder)

	# Start from the source model's config.json and override the size-related fields.
	with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
	    config_json = json.load(f)
	config_json.update({
	    "head_dim": 32,
	    "hidden_size": 32,  # required by Mxfp4GptOssExperts codes
	    "intermediate_size": 64,
	    "layer_types": ["sliding_attention", "full_attention"],
	    "num_attention_heads": 2,
	    "num_hidden_layers": 2,
	    "num_key_value_heads": 1,
	    "num_local_experts": 32,
	    "tie_word_embeddings": True,
	})
	# Set the quantization settings aside: the bf16 model below is built from a config
	# without them, and they are written back into config.json at the end.
	quantization_config = config_json.pop('quantization_config')
	with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
	    json.dump(config_json, f, indent=2)

	config = AutoConfig.from_pretrained(save_folder)
	print(config)

	# Build the model with bfloat16 as the default dtype, then restore float32.
	torch.set_default_dtype(torch.bfloat16)
	model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)
	torch.set_default_dtype(torch.float32)
	model.generation_config = GenerationConfig.from_pretrained(
	    source_model_id, trust_remote_code=True,
	)

	# Seeded random initialization; parameters are visited in sorted-name order so the
	# sequence of random draws does not depend on registration order.
	set_seed(42)
	with torch.no_grad():
	    for name, p in sorted(model.named_parameters()):
	        torch.nn.init.normal_(p, 0, 0.1)
	        print(name, p.shape)
	model.save_pretrained(save_folder)

	# mxfp4
	# Rewrite the checkpoint: the bf16 expert weights are replaced by random uint8
	# `*_blocks` / `*_scales` tensors.
	state_dict = model.cpu().state_dict()
	# The config sets tie_word_embeddings=True, so the separate lm_head entry is dropped
	# (presumably because it shares storage with the embedding weights -- confirm).
	del state_dict['lm_head.weight']
	for i in range(len(model.model.layers)):
	    experts = f'model.layers.{i}.mlp.experts'
	    del state_dict[f'{experts}.down_proj']
	    del state_dict[f'{experts}.gate_up_proj']
	    # NOTE(review): the last dimension of 16 bytes per 32-wide block presumably holds
	    # two 4-bit values per byte -- confirm against Mxfp4GptOssExperts.
	    # torch.randint's upper bound is exclusive, so block bytes are in [0, 254] and
	    # scale bytes in [0, 3]. The order of the four draws is kept fixed.
	    state_dict[f'{experts}.down_proj_blocks'] = torch.randint(
	        0, 255,
	        size=(config.num_local_experts, config.hidden_size, config.intermediate_size // 32, 16),
	        dtype=torch.uint8,
	    )
	    state_dict[f'{experts}.down_proj_scales'] = torch.randint(
	        0, 4,
	        size=(config.num_local_experts, config.hidden_size, config.intermediate_size // 32),
	        dtype=torch.uint8,
	    )
	    state_dict[f'{experts}.gate_up_proj_blocks'] = torch.randint(
	        0, 255,
	        size=(config.num_local_experts, 2 * config.intermediate_size, config.hidden_size // 32, 16),
	        dtype=torch.uint8,
	    )
	    state_dict[f'{experts}.gate_up_proj_scales'] = torch.randint(
	        0, 4,
	        size=(config.num_local_experts, 2 * config.intermediate_size, config.hidden_size // 32),
	        dtype=torch.uint8,
	    )
	safetensors.torch.save_file(state_dict, f"{save_folder}/model.safetensors")

	# Disabled alternative: instead of random blocks/scales, run the bf16 expert weights
	# through the Hugging Face MXFP4 quantizer (with the CUDA device capability mocked).
	# from unittest.mock import Mock
	# from transformers.quantizers.auto import AutoHfQuantizer
	# from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
	# _get_device_capability = torch.cuda.get_device_capability
	# torch.cuda.get_device_capability = Mock(return_value=(9, 0))
	# set_seed(42)
	# bf16_state_dict = model.cpu().state_dict()
	# model = AutoModelForCausalLM.from_pretrained(save_folder, torch_dtype=torch.bfloat16, quantization_config=quantization_config)
	# for i in range(len(model.model.layers)):
	#     model.model.layers[i].mlp.experts.down_proj_bottom_pad = 0
	#     model.model.layers[i].mlp.experts.down_proj_right_pad = 0
	# hf_quantizer: Mxfp4HfQuantizer = AutoHfQuantizer.from_config(quantization_config)
	# hf_quantizer.pre_quantized = False
	# ffn_keys = ['model.layers.0.mlp.experts.down_proj', 'model.layers.0.mlp.experts.gate_up_proj',
	#             'model.layers.1.mlp.experts.down_proj', 'model.layers.1.mlp.experts.gate_up_proj']
	# for key in ffn_keys:
	#     hf_quantizer.create_quantized_param(model, bf16_state_dict[key], key, "cuda", bf16_state_dict)
	# print('down_proj', model.model.layers[0].mlp.experts.down_proj)
	# print('down_proj_blocks', model.model.layers[0].mlp.experts.down_proj_blocks)
	# state_dict = model.state_dict()
	# del state_dict['lm_head.weight']
	# for key in ffn_keys:
	#     del state_dict[key]
	# for k, v in state_dict.items():
	#     if str(v.device) == 'meta':
	#         print(k, v.device, v.shape)

	# safetensors.torch.save_file(state_dict, f"{save_folder}/model.safetensors")

	# Put the quantization settings back into the saved config.json.
	with open(f"{save_folder}/config.json", "r", encoding='utf-8') as f:
	    config = json.load(f)
	config['quantization_config'] = quantization_config
	with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
	    json.dump(config, f, indent=2)
	# torch.cuda.get_device_capability = _get_device_capability
	```