yujiepan committed on
Commit ea48c64 · verified · 1 Parent(s): c685004

Upload folder using huggingface_hub
README.md ADDED
@@ -0,0 +1,264 @@
---
library_name: transformers
pipeline_tag: text-generation
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
base_model:
- stepfun-ai/step3
---

This tiny model is for debugging. It is randomly initialized with the config adapted from [stepfun-ai/step3](https://huggingface.co/stepfun-ai/step3).

Note: if you want the model version that follows transformers' weight naming, see the companion model without the "-vllm" suffix.

### Example usage:

- vLLM

```bash
vllm serve yujiepan/step3-tiny-random-vllm --trust-remote-code
```
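
Once the server is up, you can send it a request through vLLM's OpenAI-compatible API. This is a minimal sketch, assuming the default endpoint `http://localhost:8000/v1` and the `openai` Python client; since the weights are random, the reply is meaningless and only confirms the server responds.

```python
# Minimal sketch: query the vLLM server started above via its OpenAI-compatible API.
# Assumes the default endpoint http://localhost:8000/v1 and `pip install openai`.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # any placeholder key works unless the server sets one
response = client.chat.completions.create(
    model="yujiepan/step3-tiny-random-vllm",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=16,
)
print(response.choices[0].message.content)  # random weights -> meaningless text
```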

- Transformers

```python
# Note: it's more convenient to use the model without "-vllm" suffix, which follows transformers' naming. Here "key_mapping" is a workaround.

import torch
from transformers import AutoModelForCausalLM, AutoProcessor

model_id = "yujiepan/step3-tiny-random-vllm"
key_mapping = {
    "^vision_model": "model.vision_model",
    r"^model(?!\.(language_model|vision_model))": "model.language_model",
    "vit_downsampler": "model.vit_downsampler",
    "vit_downsampler2": "model.vit_downsampler2",
    "vit_large_projector": "model.vit_large_projector",
}
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda", torch_dtype=torch.bfloat16,
    trust_remote_code=True, key_mapping=key_mapping,
)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
            {"type": "text", "text": "What's in this picture?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device)
generate_ids = model.generate(**inputs, max_new_tokens=32, do_sample=False)
decoded = processor.decode(generate_ids[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=False)
print(decoded)
```
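
For reference, the `key_mapping` above simply renames vLLM-style checkpoint keys into the names the transformers implementation expects. The snippet below is a rough standalone illustration of that renaming, an approximation for clarity rather than the exact matching logic transformers applies internally.

```python
# Rough illustration (assumption: first-match regex substitution approximates what
# transformers does with `key_mapping`); shows how vLLM-style weight names map to
# transformers-style names. Not needed for inference.
import re

key_mapping = {
    "^vision_model": "model.vision_model",
    r"^model(?!\.(language_model|vision_model))": "model.language_model",
    "vit_downsampler": "model.vit_downsampler",
    "vit_downsampler2": "model.vit_downsampler2",
    "vit_large_projector": "model.vit_large_projector",
}

def remap(name: str) -> str:
    for pattern, replacement in key_mapping.items():
        new_name, n = re.subn(pattern, replacement, name)
        if n:
            return new_name
    return name

print(remap("model.layers.0.self_attn.q_proj.weight"))
# -> model.language_model.layers.0.self_attn.q_proj.weight
print(remap("vision_model.embeddings.patch_embedding.weight"))
# -> model.vision_model.embeddings.patch_embedding.weight
```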

### Code to create this repo:

```python
import json
from pathlib import Path

import accelerate
import torch
from huggingface_hub import file_exists, hf_hub_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    GenerationConfig,
    set_seed,
)

source_model_id = "stepfun-ai/step3"
save_folder = "/tmp/yujiepan/step3-tiny-random-vllm"

processor = AutoProcessor.from_pretrained(source_model_id, trust_remote_code=True)
processor.save_pretrained(save_folder)

def rewrite_automap(filepath: str, source_model_id: str, overrides: dict = None):
    # Point every auto_map entry back to the source repo so remote code is loaded from there.
    with open(filepath, 'r', encoding='utf-8') as f:
        config = json.load(f)
    for k, v in config['auto_map'].items():
        v = v.split('--')[-1]
        config['auto_map'][k] = f'{source_model_id}--{v}'
    if overrides is not None:
        config.update(overrides)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2)

rewrite_automap(f'{save_folder}/processor_config.json', source_model_id)
rewrite_automap(f'{save_folder}/tokenizer_config.json', source_model_id)

with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json = json.load(f)

for k, v in config_json['auto_map'].items():
    config_json['auto_map'][k] = f'{source_model_id}--{v}'
config_json['architectures'] = ["Step3VLForConditionalGeneration"]
config_json['text_config'].update({
    "hidden_size": 32,
    "intermediate_size": 64,
    "num_hidden_layers": 2,
    "num_attention_heads": 2,
    "num_attention_groups": 1,
    "head_dim": 256,
    "share_q_dim": 512,
    "moe_layers_enum": "1",
    "moe_num_experts": 8,
    "moe_top_k": 3,
    "moe_intermediate_size": 64,
    "share_expert_dim": 64,
    "tie_word_embeddings": True,
})
config_json['vision_config'].update({
    "hidden_size": 64,
    "output_hidden_size": 64,
    "intermediate_size": 128,
    "num_hidden_layers": 2,
    "num_attention_heads": 2
})

with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)
config = AutoConfig.from_pretrained(
    save_folder,
    trust_remote_code=True,
)
print(config)
automap = config_json['auto_map']
torch.set_default_dtype(torch.bfloat16)
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
torch.set_default_dtype(torch.float32)
if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
    model.generation_config = GenerationConfig.from_pretrained(
        source_model_id, trust_remote_code=True,
    )
set_seed(42)
model = model.cpu()  # cpu is more stable for random initialization across machines
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.2)
        print(name, p.shape)

model.save_pretrained(save_folder)

import safetensors.torch

# Rename the transformers-style keys to the vLLM-style layout expected by the "-vllm" repo.
new_tensors = {}
with safetensors.safe_open(f'{save_folder}/model.safetensors', framework='pt', device='cpu') as f:
    for k in list(f.keys()):
        v = f.get_tensor(k)
        if k.startswith('model.language_model.'):
            k = k.replace('model.language_model.', 'model.')
            new_tensors[k] = v
        elif k.startswith('model.vi'):
            k = k.replace('model.vi', 'vi')
            new_tensors[k] = v
        else:
            new_tensors[k] = v
safetensors.torch.save_file(new_tensors, f"{save_folder}/model.safetensors")

rewrite_automap(
    f'{save_folder}/config.json', source_model_id,
    overrides=dict(architectures=['Step3VLForConditionalGeneration']),
)
# Drop the local python files; the rewritten auto_map points back to the source repo for remote code.
for python_file in Path(save_folder).glob('*.py'):
    python_file.unlink()
```
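
A quick sanity check after running the script (a sketch reusing `save_folder` from above): verify that the saved checkpoint now uses vLLM-style key names, i.e. no `model.language_model.` or `model.vision_model.` prefixes remain.

```python
# Sketch: verify the renamed checkpoint layout. `save_folder` is the path used above.
import safetensors

with safetensors.safe_open(f"{save_folder}/model.safetensors", framework="pt", device="cpu") as f:
    keys = list(f.keys())

assert not any(k.startswith("model.language_model.") for k in keys)
assert not any(k.startswith("model.vision_model.") for k in keys)
assert any(k.startswith("vision_model.") for k in keys)
print(f"{len(keys)} tensors; examples: {sorted(keys)[:3]}")
```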

### Printing the model:

```text
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
```
chat_template.jinja ADDED
@@ -0,0 +1,20 @@
{% macro render_content(content) %} {% if content is string %}{{- content }}{% elif content is mapping %}{{- content['value'] if 'value' in content else content['text'] }}{% elif content is iterable %}{% for item in content %}{% if item.type == 'text' %}{{- item['value'] if 'value' in item else item['text'] }}{% elif item.type == 'image' %}<im_patch>{% endif %}{% endfor %}{% endif %} {% endmacro %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message.role == 'system' %}{{ render_content(message['content']) }}{% endif %}{% endfor %}{% if tools is defined and tools %}{% set ns = namespace(data='') %}{% for tool in tools %}{% set ns.data = ns.data + (tool | tojson(ensure_ascii=False)) + '
' %}{% endfor %}{% set tool_schemas_var = ns.data %}# Tools
You may call one or more tools to assist with the user query. You are provided with tool schemas within <tools></tools> XML tags: <tools>{{ tool_schemas_var }}</tools> When making tool calls, use XML format to invoke tools and pass parameters: <|tool_calls_begin|>
<|tool_call_begin|>
function<|tool_sep|><steptml:invoke name="tool_name0"><steptml:parameter name="parameter_name0">[parameter value]</steptml:parameter>...</steptml:invoke><|tool_call_end|>
<|tool_call_begin|>
function<|tool_sep|><steptml:invoke name="tool_name1"><steptml:parameter name="parameter_name1">[parameter value]</steptml:parameter>...</steptml:invoke><|tool_call_end|>
<|tool_calls_end|>
Note: * You can invoke one or more tools in parallel. * Each tool call must be complete and self-contained within a single <steptml:toolcall></steptml:toolcall> block. {% endif %}{% for message in messages %}{% if message.role == 'tool_description' %}{{ render_content(message['content']) }}{% elif message.role == 'user' %}{{- '<|BOT|>' + message.role + '\n' + render_content(message['content']) }}{{- '<|EOT|>' }}{% elif message.role == 'tool_response' %}<|tool_outputs_begin|>
{% for tool_output in message['content'] %}<|tool_output_begin|>
{{ render_content(tool_output) }}<|tool_output_end|>{% endfor %}
<|tool_outputs_end|>
{% else %}{{- '<|BOT|>' + message.role + '
' }}{% if message['content'] is defined %}{{- render_content(message['content']) }}{% endif %}{% if message.tool_calls is defined %}<|tool_calls_begin|>
{% for tool in message.tool_calls %}<|tool_call_begin|>
{{ tool['type'] }}<|tool_sep|>{{- '<steptml:invoke name="' + tool['function']['name'] + '">' }}{% for name, param in tool['function']['arguments'].items() %} {{- '<steptml:parameter name="' + name + '">' + param | string + '</steptml:parameter>' }}{% endfor %}</steptml:invoke><|tool_call_end|>
{% endfor %}<|tool_calls_end|>
{% endif %}<|EOT|>{% endif %}{% endfor %}{% if add_generation_prompt %}{{- '<|BOT|>assistant
<think>
' }}{% endif %}
config.json ADDED
@@ -0,0 +1,62 @@
{
  "architectures": [
    "Step3VLForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "stepfun-ai/step3--configuration_step3.Step3VLConfig",
    "AutoModelForCausalLM": "stepfun-ai/step3--modeling_step3.Step3vForConditionalGeneration"
  },
  "bos_token_id": 0,
  "eos_token_id": 128805,
  "hidden_size": 32,
  "im_end_token": "<im_end>",
  "im_patch_token": "<im_patch>",
  "im_start_token": "<im_start>",
  "image_token_id": 128001,
  "image_token_len": 169,
  "model_type": "step3_vl",
  "patch_token_len": 81,
  "projector_bias": false,
  "text_config": {
    "architectures": [
      "Step3TextForCausalLM"
    ],
    "head_dim": 256,
    "hidden_size": 32,
    "intermediate_size": 64,
    "max_position_embedding": 65536,
    "max_seq_len": 65536,
    "model_type": "step3_text",
    "moe_intermediate_size": 64,
    "moe_layers_enum": "1",
    "moe_num_experts": 8,
    "moe_top_k": 3,
    "norm_expert_weight": false,
    "num_attention_groups": 1,
    "num_attention_heads": 2,
    "num_hidden_layers": 2,
    "rms_norm_eps": 1e-05,
    "rope_scaling": null,
    "rope_theta": 500000,
    "share_expert_dim": 64,
    "share_q_dim": 512,
    "torch_dtype": "bfloat16",
    "vocab_size": 128815
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.54.1",
  "understand_projector_stride": 2,
  "vision_config": {
    "hidden_act": "quick_gelu",
    "hidden_size": 64,
    "image_size": 728,
    "intermediate_size": 128,
    "layer_norm_eps": 1e-05,
    "model_type": "step3_vision_encoder",
    "num_attention_heads": 2,
    "num_channels": 3,
    "num_hidden_layers": 2,
    "output_hidden_size": 64,
    "patch_size": 14
  }
}
generation_config.json ADDED
@@ -0,0 +1,9 @@
{
  "bos_token_id": 0,
  "do_sample": true,
  "eos_token_id": 128805,
  "temperature": 0.7,
  "top_p": 0.95,
  "transformers_version": "4.54.1",
  "trust_remote_code": true
}
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6e46470c9bafefcc89fb8189e085a3496f3500d9f7b873cef07f64b3e76c0fab
size 18610672
processor_config.json ADDED
@@ -0,0 +1,6 @@
{
  "auto_map": {
    "AutoProcessor": "stepfun-ai/step3--processing_step3.Step3VLProcessor"
  },
  "processor_class": "Step3VLProcessor"
}
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
{
  "bos_token": {
    "content": "<|begin▁of▁sentence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|EOT|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|end▁of▁sentence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff