gijs
/

audsemthinker

@@ -24,64 +24,75 @@ This model is built upon the `Qwen2.5-Omni-7B` multimodal foundation model and i
 To use `AudSemThinker` for audio understanding and captioning tasks, you can load it using the `transformers` library. Ensure you have `torch`, `torchaudio`, and `soundfile` installed.
 ```python
-from transformers import AutoProcessor, AutoModelForCausalLM
-import torch
-import torchaudio
 import soundfile as sf
-# Load processor and model
-processor = Qwen2_5OmniProcessor.from_pretrained("gijs/audsemthinker", trust_remote_code=True)
-model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
-    "gijs/audsemthinker",
-    torch_dtype=torch.bfloat16,
     device_map="auto",
-    trust_remote_code=True,
-    low_cpu_mem_usage=True,
 )
-# Example audio file (replace with your audio path)
-audio_file = "path/to/your/audio.wav"
 audio_input, sampling_rate = torchaudio.load(audio_file)
 if sampling_rate != processor.feature_extractor.sampling_rate:
-    audio_input = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=processor.feature_extractor.sampling_rate)(audio_input)
-audio_input = audio_input.squeeze().numpy() # Ensure mono and numpy array
-# User prompt for the task
-user_prompt_text = "You are given an audio clip. Your task is to describe the audio in detail. First, think about the audio clip and put your thoughts in <think> and </think> tags. Then reason about the semantic elements involved in the audio clip and put your reasoning in <semantic_elements> and </semantic_elements> tags. Then describe the audio clip, put your answer in <answer> and </answer> tags."
-# Construct messages in conversation format, similar to training
-messages = [
-    {"role": "system", "content": [{"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}]},
     {
         "role": "user",
         "content": [
             {"type": "audio", "audio": audio_input},
-            {"type": "text", "text": user_prompt_text}
-        ]
-    }
 ]
-# Apply chat template
-# For inference, add_generation_prompt should be True.
-text_from_chat_template = processor.apply_chat_template(
-    messages,
-    tokenize=False,
-    add_generation_prompt=True
-)
-# Prepare inputs for the model
 inputs = processor(
-    text=text_from_chat_template,
-    audio=[audio_input], # Pass audio as a list of numpy arrays
-    return_tensors="pt"
-).to(model.device)
-# Generate response
 output_ids = model.generate(**inputs, max_new_tokens=512)
-response = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
-print(response)
 # Expected output format:
 # <think>...detailed reasoning about the audio scene...</think>
 # <semantic_elements>...list of identified semantic descriptors (e.g., Who, What, How, When, Where)...</semantic_elements>

 To use `AudSemThinker` for audio understanding and captioning tasks, you can load it using the `transformers` library. Ensure you have `torch`, `torchaudio`, `soundfile`, and `qwen-omni-utils` (which provides the `qwen_omni_utils` module imported below) installed.
 ```python
 import soundfile as sf
+from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
+from qwen_omni_utils import process_mm_info
+import torchaudio
+# default: Load the model on the available device(s)
+model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
+    "gijs/audsemthinker",
+    torch_dtype="auto",
     device_map="auto",
+    trust_remote_code=True
 )
+# We recommend enabling flash_attention_2 for better acceleration and memory saving.
+# model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
+#     "gijs/audsemthinker",
+#     torch_dtype="auto",
+#     device_map="auto",
+#     attn_implementation="flash_attention_2",
+#     trust_remote_code=True
+# )
+processor = Qwen2_5OmniProcessor.from_pretrained("gijs/audsemthinker", trust_remote_code=True)
+# Load and preprocess audio
+audio_file = "path/to/your/audio.wav"
 audio_input, sampling_rate = torchaudio.load(audio_file)
 if sampling_rate != processor.feature_extractor.sampling_rate:
+    audio_input = torchaudio.transforms.Resample(
+        orig_freq=sampling_rate,
+        new_freq=processor.feature_extractor.sampling_rate
+    )(audio_input)
+audio_input = audio_input.squeeze().numpy()
+# Conversation format
+conversation = [
+    {
+        "role": "system",
+        "content": [
+            {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
+        ],
+    },
     {
         "role": "user",
         "content": [
             {"type": "audio", "audio": audio_input},
+            {"type": "text", "text": "You are given an audio clip. Your task is to describe the audio in detail. First, think about the audio clip and put your thoughts in <think> and </think> tags. Then reason about the semantic elements involved in the audio clip and put your reasoning in <semantic_elements> and </semantic_elements> tags. Then describe the audio clip, put your answer in <answer> and </answer> tags."}
+        ],
+    },
 ]
+# Preparation for inference
+text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+audios, images, videos = process_mm_info(conversation)
 inputs = processor(
+    text=text,
+    audio=audios,
+    images=images,
+    videos=videos,
+    return_tensors="pt",
+    padding=True
+)
+inputs = inputs.to(model.device).to(model.dtype)
+# Inference: Generation of the output
 output_ids = model.generate(**inputs, max_new_tokens=512)
+response = processor.batch_decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(response[0])
 # Expected output format:
 # <think>...detailed reasoning about the audio scene...</think>
 # <semantic_elements>...list of identified semantic descriptors (e.g., Who, What, How, When, Where)...</semantic_elements>