This is a copy of Qwen3-8B compiled with Neuron SDK 2.25 for the Neuron workshop: https://github.com/aws-neuron/neuron-workshops
This checkpoint was generated with the following code:
```python
import os

bs = 1
seqlength = 1024

# Select the NeuronX Distributed Inference framework and save the sharded
# weights and compiler artifacts in the same folder. Set these environment
# variables before importing vllm so they are picked up reliably.
os.environ['VLLM_NEURON_FRAMEWORK'] = "neuronx-distributed-inference"
path = f"/home/ubuntu/qwen3/qwen3-8B-BS{bs}-SEQ{seqlength}"
os.environ['NEURON_COMPILED_ARTIFACTS'] = path
os.environ['BASE_COMPILE_WORK_DIR'] = path

from vllm import LLM, SamplingParams

llm = LLM(
    model="/home/ubuntu/models/Qwen3-8B",
    max_num_seqs=bs,
    max_model_len=seqlength,
    device="neuron",
    tensor_parallel_size=2,
    override_neuron_config={"save_sharded_checkpoint": True},
)

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Note that top_k must be set lower than the global_top_k defined in
# neuronx_distributed_inference.models.config.OnDeviceSamplingConfig.
sampling_params = SamplingParams(top_k=10, temperature=0.8, top_p=0.95)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```