This is a copy of Qwen3-8B compiled with Neuron SDK 2.25 for the Neuron workshop: https://github.com/aws-neuron/neuron-workshops
This checkpoint was generated with the following code:
```python
import os

bs = 1
seqlength = 1024

# Select the NeuronX Distributed Inference framework and save the sharded
# weights and compiler artifacts in the same folder. Set these environment
# variables before importing vllm so they are picked up reliably.
os.environ['VLLM_NEURON_FRAMEWORK'] = "neuronx-distributed-inference"
path = f"/home/ubuntu/qwen3/qwen3-8B-BS{bs}-SEQ{seqlength}"
os.environ['NEURON_COMPILED_ARTIFACTS'] = path
os.environ['BASE_COMPILE_WORK_DIR'] = path

from vllm import LLM, SamplingParams

llm = LLM(
    model="/home/ubuntu/models/Qwen3-8B",
    max_num_seqs=bs,
    max_model_len=seqlength,
    device="neuron",
    tensor_parallel_size=2,
    override_neuron_config={"save_sharded_checkpoint": True},
)

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Note that top_k must be set lower than the global_top_k defined in
# neuronx_distributed_inference.models.config.OnDeviceSamplingConfig.
sampling_params = SamplingParams(top_k=10, temperature=0.8, top_p=0.95)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```