pytorch
/

Qwen3-8B-INT4

Text Generation

text-generation-inference

Model card Files Files and versions

jerryzh168 committed on May 17

Commit

14c6ae2

·

verified ·

1 Parent(s): fec1414

Update README.md

Files changed (1) hide show

README.md +35 -27

README.md CHANGED Viewed

@@ -63,42 +63,50 @@ pip install accelerate
 Example:
 ```Py
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-torch.random.manual_seed(0)
-model_path = "pytorch/Qwen3-8B-int4wo-hqq"
 model = AutoModelForCausalLM.from_pretrained(
-    model_path,
-    device_map="auto",
     torch_dtype="auto",
-    trust_remote_code=True,
 )
-tokenizer = AutoTokenizer.from_pretrained(model_path)
 messages = [
-    {"role": "system", "content": "You are a helpful AI assistant."},
-    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
-    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
-    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
 ]
-pipe = pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
 )
-generation_args = {
-    "max_new_tokens": 500,
-    "return_full_text": False,
-    "temperature": 0.0,
-    "do_sample": False,
-}
-output = pipe(messages, **generation_args)
-print(output[0]['generated_text'])
 ```
 # Quantization Recipe

 Example:
 ```Py
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model_name = "pytorch/Qwen3-8B-int4wo-hqq"
+# load the tokenizer and the model
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(
+    model_name,
     torch_dtype="auto",
+    device_map="auto"
 )
+# prepare the model input
+prompt = "Give me a short introduction to large language model."
 messages = [
+    {"role": "user", "content": prompt}
 ]
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
+    enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
+)
+model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+# conduct text completion
+generated_ids = model.generate(
+    **model_inputs,
+    max_new_tokens=32768
 )
+output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+# parsing thinking content
+try:
+    # rindex finding 151668 (</think>)
+    index = len(output_ids) - output_ids[::-1].index(151668)
+except ValueError:
+    index = 0
+thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
+content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
+print("thinking content:", thinking_content)
+print("content:", content)
 ```
 # Quantization Recipe