gijs committed on
Commit
b397a36
·
verified ·
1 Parent(s): f6f0bc9

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +51 -40
README.md CHANGED
@@ -24,64 +24,75 @@ This model is built upon the `Qwen2.5-Omni-7B` multimodal foundation model and i
24
  To use `AudSemThinker` for audio understanding and captioning tasks, you can load it using the `transformers` library. Ensure you have `torch`, `torchaudio`, and `soundfile` installed.
25
 
26
  ```python
27
- from transformers import AutoProcessor, AutoModelForCausalLM
28
- import torch
29
- import torchaudio
30
  import soundfile as sf
 
 
 
31
 
32
- # Load processor and model
33
- processor = Qwen2_5OmniProcessor.from_pretrained("gijs/audsemthinker", trust_remote_code=True)
34
- model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
35
- "gijs/audsemthinker",
36
- torch_dtype=torch.bfloat16,
37
  device_map="auto",
38
- trust_remote_code=True,
39
- low_cpu_mem_usage=True,
40
  )
41
 
42
- # Example audio file (replace with your audio path)
43
- audio_file = "path/to/your/audio.wav"
 
 
 
 
 
 
44
 
 
 
 
 
45
  audio_input, sampling_rate = torchaudio.load(audio_file)
46
  if sampling_rate != processor.feature_extractor.sampling_rate:
47
- audio_input = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=processor.feature_extractor.sampling_rate)(audio_input)
48
- audio_input = audio_input.squeeze().numpy() # Ensure mono and numpy array
49
-
50
- # User prompt for the task
51
- user_prompt_text = "You are given an audio clip. Your task is to describe the audio in detail. First, think about the audio clip and put your thoughts in <think> and </think> tags. Then reason about the semantic elements involved in the audio clip and put your reasoning in <semantic_elements> and </semantic_elements> tags. Then describe the audio clip, put your answer in <answer> and </answer> tags."
52
-
53
- # Construct messages in conversation format, similar to training
54
- messages = [
55
- {"role": "system", "content": [{"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}]},
 
 
 
 
 
56
  {
57
  "role": "user",
58
  "content": [
59
  {"type": "audio", "audio": audio_input},
60
- {"type": "text", "text": user_prompt_text}
61
- ]
62
- }
63
  ]
64
 
65
- # Apply chat template
66
- # For inference, add_generation_prompt should be True.
67
- text_from_chat_template = processor.apply_chat_template(
68
- messages,
69
- tokenize=False,
70
- add_generation_prompt=True
71
- )
72
-
73
- # Prepare inputs for the model
74
  inputs = processor(
75
- text=text_from_chat_template,
76
- audio=[audio_input], # Pass audio as a list of numpy arrays
77
- return_tensors="pt"
78
- ).to(model.device)
 
 
 
 
79
 
80
- # Generate response
81
  output_ids = model.generate(**inputs, max_new_tokens=512)
82
- response = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
 
83
 
84
- print(response)
85
  # Expected output format:
86
  # <think>...detailed reasoning about the audio scene...</think>
87
  # <semantic_elements>...list of identified semantic descriptors (e.g., Who, What, How, When, Where)...</semantic_elements>
 
24
  To use `AudSemThinker` for audio understanding and captioning tasks, you can load it using the `transformers` library. Ensure you have `torch`, `torchaudio`, `soundfile`, and `qwen-omni-utils` (which provides `process_mm_info`) installed.
25
 
26
  ```python
 
 
 
27
  import soundfile as sf
28
+ from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
29
+ from qwen_omni_utils import process_mm_info
30
+ import torchaudio
31
 
32
+ # default: Load the model on the available device(s)
33
+ model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
34
+ "gijs/audsemthinker",
35
+ torch_dtype="auto",
 
36
  device_map="auto",
37
+ trust_remote_code=True
 
38
  )
39
 
40
+ # We recommend enabling flash_attention_2 for better acceleration and memory saving.
41
+ # model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
42
+ # "gijs/audsemthinker",
43
+ # torch_dtype="auto",
44
+ # device_map="auto",
45
+ # attn_implementation="flash_attention_2",
46
+ # trust_remote_code=True
47
+ # )
48
 
49
+ processor = Qwen2_5OmniProcessor.from_pretrained("gijs/audsemthinker", trust_remote_code=True)
50
+
51
+ # Load and preprocess audio
52
+ audio_file = "path/to/your/audio.wav"
53
  audio_input, sampling_rate = torchaudio.load(audio_file)
54
  if sampling_rate != processor.feature_extractor.sampling_rate:
55
+ audio_input = torchaudio.transforms.Resample(
56
+ orig_freq=sampling_rate,
57
+ new_freq=processor.feature_extractor.sampling_rate
58
+ )(audio_input)
59
+ audio_input = audio_input.squeeze().numpy()
60
+
61
+ # Conversation format
62
+ conversation = [
63
+ {
64
+ "role": "system",
65
+ "content": [
66
+ {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
67
+ ],
68
+ },
69
  {
70
  "role": "user",
71
  "content": [
72
  {"type": "audio", "audio": audio_input},
73
+ {"type": "text", "text": "You are given an audio clip. Your task is to describe the audio in detail. First, think about the audio clip and put your thoughts in <think> and </think> tags. Then reason about the semantic elements involved in the audio clip and put your reasoning in <semantic_elements> and </semantic_elements> tags. Then describe the audio clip, put your answer in <answer> and </answer> tags."}
74
+ ],
75
+ },
76
  ]
77
 
78
+ # Preparation for inference
79
+ text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
80
+ audios, images, videos = process_mm_info(conversation)
 
 
 
 
 
 
81
  inputs = processor(
82
+ text=text,
83
+ audio=audios,
84
+ images=images,
85
+ videos=videos,
86
+ return_tensors="pt",
87
+ padding=True
88
+ )
89
+ inputs = inputs.to(model.device).to(model.dtype)
90
 
91
+ # Inference: Generation of the output
92
  output_ids = model.generate(**inputs, max_new_tokens=512)
93
+ response = processor.batch_decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
94
+ print(response[0])
95
 
 
96
  # Expected output format:
97
  # <think>...detailed reasoning about the audio scene...</think>
98
  # <semantic_elements>...list of identified semantic descriptors (e.g., Who, What, How, When, Where)...</semantic_elements>