Make `config.json` compatible with standard sliding window config
This will add `layer_types` to the loaded config class so that libraries such as vLLM can load hybrid attention models in the standard Hugging Face format.
Since we do not edit `configuration_phi4flash.py`, this change is backwards compatible.
Once this change has been merged along with https://github.com/vllm-project/vllm/pull/21927, we can update `configuration_phi4flash.py` so that the modelling code works in the standard way too.
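For illustration, here is a minimal sketch of how a standard-format loader could combine `sliding_window` with the new `layer_types` list to decide, per layer, whether sliding-window attention applies. The variable names are hypothetical; this is not the actual transformers or vLLM code.

```python
import json

# Load the raw config (standard Hugging Face format).
with open("config.json") as f:
    cfg = json.load(f)

sliding_window = cfg["sliding_window"]   # 512
layer_types = cfg["layer_types"]         # "full_attention" / "sliding_attention" per layer

# Hypothetical mapping: a layer gets the sliding window only when it is
# marked "sliding_attention"; "full_attention" layers see the whole context.
per_layer_window = [
    sliding_window if t == "sliding_attention" else None
    for t in layer_types
]

print(per_layer_window[:4])  # [None, 512, None, 512] for this model
```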
- config.json +6 -0
@@ -26,6 +26,12 @@
   "num_key_value_heads": 20,
   "resid_pdrop": 0.0,
   "sliding_window": 512,
+  "layer_types": [
+    "full_attention", "sliding_attention", "full_attention", "sliding_attention", "full_attention", "sliding_attention", "full_attention", "sliding_attention",
+    "full_attention", "sliding_attention", "full_attention", "sliding_attention", "full_attention", "sliding_attention", "full_attention", "sliding_attention",
+    "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
+    "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention"
+  ],
   "torch_dtype": "bfloat16",
   "tie_word_embeddings": true,
   "transformers_version": "4.46.1",
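As a sanity check after this change, a short snippet like the one below can confirm that the new list covers every layer and uses only the two standard type names. This assumes the config also carries the usual `num_hidden_layers` field; it is a sketch, not part of any library.

```python
import json

with open("config.json") as f:
    cfg = json.load(f)

layer_types = cfg["layer_types"]

# The list should name one attention type per transformer layer.
assert len(layer_types) == cfg["num_hidden_layers"], "layer_types length mismatch"

# Only the two standard Hugging Face type names should appear.
assert set(layer_types) <= {"full_attention", "sliding_attention"}

print(f"{layer_types.count('sliding_attention')} sliding / "
      f"{layer_types.count('full_attention')} full attention layers")
```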