jerryzh168 committed
Commit 14c6ae2 · verified · 1 parent: fec1414

Update README.md

Files changed (1): README.md +35 -27
README.md CHANGED
@@ -63,42 +63,50 @@ pip install accelerate
 Example:
 ```Py
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-
-torch.random.manual_seed(0)
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
-model_path = "pytorch/Qwen3-8B-int4wo-hqq"
+model_name = "pytorch/Qwen3-8B-int4wo-hqq"
 
+# load the tokenizer and the model
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(
-    model_path,
-    device_map="auto",
+    model_name,
     torch_dtype="auto",
-    trust_remote_code=True,
+    device_map="auto"
 )
-tokenizer = AutoTokenizer.from_pretrained(model_path)
 
+# prepare the model input
+prompt = "Give me a short introduction to large language model."
 messages = [
-    {"role": "system", "content": "You are a helpful AI assistant."},
-    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
-    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
-    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
+    {"role": "user", "content": prompt}
 ]
-
-pipe = pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
+    enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
+)
+model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+# conduct text completion
+generated_ids = model.generate(
+    **model_inputs,
+    max_new_tokens=32768
 )
-
-generation_args = {
-    "max_new_tokens": 500,
-    "return_full_text": False,
-    "temperature": 0.0,
-    "do_sample": False,
-}
-
-output = pipe(messages, **generation_args)
-print(output[0]['generated_text'])
+output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+
+# parsing thinking content
+try:
+    # rindex finding 151668 (</think>)
+    index = len(output_ids) - output_ids[::-1].index(151668)
+except ValueError:
+    index = 0
+
+thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
+content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
+
+print("thinking content:", thinking_content)
+print("content:", content)
 ```
 
 # Quantization Recipe
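
The added example runs with `enable_thinking=True`; per the inline comment, the same flag switches the chat template to non-thinking mode. A minimal sketch of that path, assuming `model` and `tokenizer` are loaded exactly as in the new code above (the `max_new_tokens=512` cap is illustrative, not from this commit):

```Py
# Non-thinking mode: with enable_thinking=False the chat template emits no
# <think>...</think> block, so every generated token is final content and
# the </think> (151668) lookup from the example above is unnecessary.
text = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Give me a short introduction to large language model."}],
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated_ids = model.generate(**model_inputs, max_new_tokens=512)  # illustrative cap
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
print(tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n"))
```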