tanlonghua committed on
Commit
a504d09
·
1 Parent(s): 544845b

update zero-shot TTS

Browse files
.gitattributes CHANGED
@@ -41,5 +41,10 @@ data/openai_whisper-20240930-py3-none-any.whl filter=lfs diff=lfs merge=lfs -tex
41
  data/matcha_tts-0.0.5.1-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
42
  *.mp4 filter=lfs diff=lfs merge=lfs -text
43
  *.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
44
 
45
 
 
41
  data/matcha_tts-0.0.5.1-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
42
  *.mp4 filter=lfs diff=lfs merge=lfs -text
43
  *.png filter=lfs diff=lfs merge=lfs -text
44
+ *.onnx filter=lfs diff=lfs merge=lfs -text
45
+ data/spks/prompt.wav filter=lfs diff=lfs merge=lfs -text
46
+
47
+
48
+
49
 
50
 
README.md CHANGED
@@ -1,11 +1,3 @@
1
- ---
2
- base_model:
3
- - inclusionAI/Ling-lite
4
- license: mit
5
- pipeline_tag: any-to-any
6
- library_name: transformers
7
- ---
8
-
9
  # Ming-Lite-Omni
10
 
11
  <p align="center">
@@ -216,6 +208,7 @@ pip install nvidia-cublas-cu12==12.4.5.8 # for H20
216
  Note: We tested the following examples on an NVIDIA H800-80GB GPU with CUDA 12.2. Loading inclusionAI/Ming-Lite-Omni in bfloat16 takes about 40890MB of memory.
217
 
218
 
 
219
  ```python
220
  import os
221
  import torch
@@ -275,31 +268,19 @@ messages = [
275
  To enable thinking before the response, add the following system prompt before your question:
276
 
277
  ```python
278
- cot_prompt = "SYSTEM: You are a helpful assistant. When the user asks a question, your response must include two parts: first, the reasoning process enclosed in <thinking>...</thinking> tags, then the final answer enclosed in <answer>...</answer> tags. The critical answer or key result should be placed within \\boxed{}.
279
- "
280
  # And your input message should be like this:
281
  messages = [
282
  {
283
  "role": "HUMAN",
284
  "content": [
285
  {"type": "image", "image": os.path.join(assets_path, "reasoning.png")},
286
- {"type": "text", "text": cot_prompt + "In the rectangle $A B C D$ pictured, $M_{1}$ is the midpoint of $D C, M_{2}$ the midpoint of $A M_{1}, M_{3}$ the midpoint of $B M_{2}$ and $M_{4}$ the midpoint of $C M_{3}$. Determine the ratio of the area of the quadrilateral $M_{1} M_{2} M_{3} M_{4}$ to the area of the rectangle $A B C D$.
287
- Choices:
288
- (A) $\\frac{7}{16}$
289
- (B) $\\frac{3}{16}$
290
- (C) $\\frac{7}{32}$
291
- (D) $\\frac{9}{32}$
292
- (E) $\\frac{1}{5}$"},
293
  ],
294
  },
295
  ]
296
  # Output:
297
- # \<think\>
298
- Okay, so I have this problem about a rectangle ABCD ... (thinking process omitted) ... So, the correct answer is C.
299
- \</think\>
300
- \<answer\>\\boxed{C}\</answer\>
301
-
302
-
303
  ```
304
 
305
  ```python
@@ -467,6 +448,34 @@ audio_tokens = model.talker.omni_audio_generation(
467
  waveform = audio_detokenizer.token2wav(audio_tokens, save_path='out.wav', **spk_input)
468
 
469
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
  For detailed usage of the ASR, SpeechQA, and TTS tasks, please refer to `test_audio_tasks.py`.
471
 
472
  ### Image Generation & Edit
@@ -559,4 +568,6 @@ If you find our work helpful, feel free to give us a cite.
559
  archivePrefix = {arXiv},
560
  url = {https://arxiv.org/abs/2506.09344}
561
  }
562
- ```
 
 
 
 
 
 
 
 
 
 
 
1
  # Ming-Lite-Omni
2
 
3
  <p align="center">
 
208
  Note: We tested the following examples on an NVIDIA H800-80GB GPU with CUDA 12.2. Loading inclusionAI/Ming-Lite-Omni in bfloat16 takes about 40890MB of memory.
209
 
210
 
211
+
212
  ```python
213
  import os
214
  import torch
 
268
  To enable thinking before the response, add the following system prompt before your question:
269
 
270
  ```python
271
# System prompt that asks the model to emit its reasoning first and the final
# answer second, each wrapped in its own tag pair.
cot_prompt = "SYSTEM: You are a helpful assistant. When the user asks a question, your response must include two parts: first, the reasoning process enclosed in <thinking>...</thinking> tags, then the final answer enclosed in <answer>...</answer> tags. The critical answer or key result should be placed within \\boxed{}.\n"

# And your input message should be like this:
messages = [
    {
        "role": "HUMAN",
        "content": [
            {"type": "image", "image": os.path.join(assets_path, "reasoning.png")},
            # NOTE: LaTeX commands need a doubled backslash in a normal string
            # literal; a single "\f" would be read as a form-feed character and
            # corrupt every "\frac" in the prompt.
            {"type": "text", "text": cot_prompt + "In the rectangle $A B C D$ pictured, $M_{1}$ is the midpoint of $D C, M_{2}$ the midpoint of $A M_{1}, M_{3}$ the midpoint of $B M_{2}$ and $M_{4}$ the midpoint of $C M_{3}$. Determine the ratio of the area of the quadrilateral $M_{1} M_{2} M_{3} M_{4}$ to the area of the rectangle $A B C D$.\nChoices:\n(A) $\\frac{7}{16}$\n(B) $\\frac{3}{16}$\n(C) $\\frac{7}{32}$\n(D) $\\frac{9}{32}$\n(E) $\\frac{1}{5}$"},
        ],
    },
]
282
  # Output:
283
+ # \<think\>\nOkay, so I have this problem about a rectangle ABCD ... (thinking process omitted) ... So, the correct answer is C.\n\</think\>\n\<answer\>\\boxed{C}\</answer\>\n\n
 
 
 
 
 
284
  ```
285
 
286
  ```python
 
448
  waveform = audio_detokenizer.token2wav(audio_tokens, save_path='out.wav', **spk_input)
449
 
450
  ```
451
+
452
+ ```python
453
+ # zero-shot TTS
454
+ from modeling_bailing_talker import AudioDetokenizer
455
+ from audio_detokenizer.cli.frontend import TTSFrontEnd
456
+ from hyperpyyaml import load_hyperpyyaml
457
+
458
+ model_name_or_path = model.config._name_or_path
459
+ audio_detokenizer = AudioDetokenizer(
460
+ f'{model_name_or_path}/talker/audio_detokenizer.yaml',
461
+ flow_model_path=f'{model_name_or_path}/talker/flow.pt',
462
+ hifigan_model_path=f'{model_name_or_path}/talker/hift.pt'
463
+ )
464
+
465
+ with open(f'{model_name_or_path}/talker/audio_detokenizer.yaml', 'r') as f:
466
+ configs = load_hyperpyyaml(f)
467
+ audio_frontend = TTSFrontEnd(
468
+ configs["feat_extractor"],
469
+ f'{model_name_or_path}/talker/campplus.onnx',
470
+ f'{model_name_or_path}/talker/speech_tokenizer_v1.onnx',
471
+ )
472
+
473
+ tts_text = "这是一条测试语句。"
474
+ spk_input = audio_frontend.frontend_zero_shot(prompt_text="感谢你的认可。", prompt_wav_path="data/spks/prompt.wav")
475
+ audio_tokens = model.talker.omni_audio_generation(tts_text, **spk_input)
476
+ waveform = audio_detokenizer.token2wav(audio_tokens, save_path='out.wav', **spk_input)
477
+ ```
478
+
479
  For detailed usage of the ASR, SpeechQA, and TTS tasks, please refer to `test_audio_tasks.py`.
480
 
481
  ### Image Generation & Edit
 
568
  archivePrefix = {arXiv},
569
  url = {https://arxiv.org/abs/2506.09344}
570
  }
571
+ ```
572
+
573
+
audio_detokenizer/cli/frontend.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from functools import partial
15
+ import onnxruntime
16
+ import torch
17
+ import numpy as np
18
+ import whisper
19
+ from typing import Callable
20
+ import torchaudio.compliance.kaldi as kaldi
21
+ import torchaudio
22
+ import ipdb
23
+ import sys
24
+ from hyperpyyaml import load_hyperpyyaml
25
+ from audio_detokenizer.utils.file_utils import load_wav
26
+
27
+
28
class TTSFrontEnd:
    """Turn a prompt recording into the inputs needed for zero-shot TTS.

    The class owns two ONNX Runtime sessions (``campplus_session`` for the
    speaker embedding and ``speech_tokenizer_session`` for discrete speech
    tokens) plus a caller-supplied acoustic feature extractor. The public
    entry point is :meth:`frontend_zero_shot`.
    """

    # Sample rate (Hz) at which the prompt is loaded for tokenization and
    # speaker-embedding extraction.
    _PROMPT_SAMPLE_RATE = 16000
    # Sample rate (Hz) the prompt is resampled to before acoustic features
    # are computed with ``feat_extractor``.
    _FEAT_SAMPLE_RATE = 22050
    # Longest prompt, in seconds, accepted by ``_extract_speech_token``.
    _MAX_PROMPT_SECONDS = 30

    def __init__(self,
                 feat_extractor: Callable,
                 campplus_model: str,
                 speech_tokenizer_model: str
                 ):
        """Create the ONNX sessions and remember the feature extractor.

        Args:
            feat_extractor: Callable applied to the resampled prompt waveform
                in ``_extract_speech_feat``.
            campplus_model: Path to the speaker-embedding ONNX model.
            speech_tokenizer_model: Path to the speech-tokenizer ONNX model.
        """
        self.feat_extractor = feat_extractor
        cuda_available = torch.cuda.is_available()
        self.device = torch.device('cuda' if cuda_available else 'cpu')

        option = onnxruntime.SessionOptions()
        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        option.intra_op_num_threads = 1

        # The speaker-embedding model always runs on CPU.
        self.campplus_session = onnxruntime.InferenceSession(
            campplus_model, sess_options=option, providers=["CPUExecutionProvider"])

        # Providers are tried in order. Keeping CPU as the last entry lets the
        # tokenizer still load when torch sees a GPU but ONNX Runtime's CUDA
        # provider is missing or fails to initialise.
        tokenizer_providers = ["CPUExecutionProvider"]
        if cuda_available:
            tokenizer_providers.insert(0, "CUDAExecutionProvider")
        self.speech_tokenizer_session = onnxruntime.InferenceSession(
            speech_tokenizer_model, sess_options=option, providers=tokenizer_providers)

    def _extract_speech_token(self, speech):
        """Tokenize a prompt waveform with the speech-tokenizer session.

        Args:
            speech: Waveform tensor whose second dimension is the sample
                count (assumed to be sampled at 16 kHz; the length check
                below relies on that).

        Returns:
            A ``(speech_token, speech_token_len)`` pair of int32 tensors on
            ``self.device``; the first has a leading batch dimension of 1.

        Raises:
            ValueError: If the waveform is longer than 30 seconds. This used
                to be an ``assert``, which is silently skipped under
                ``python -O``.
        """
        max_samples = self._MAX_PROMPT_SECONDS * self._PROMPT_SAMPLE_RATE
        if speech.shape[1] > max_samples:
            raise ValueError('do not support extract speech token for audio longer than 30s')

        feat = whisper.log_mel_spectrogram(speech, n_mels=128)
        # The tokenizer takes two inputs: the mel features and their length.
        feat_input, feat_len_input = self.speech_tokenizer_session.get_inputs()[:2]
        outputs = self.speech_tokenizer_session.run(
            None,
            {
                feat_input.name: feat.detach().cpu().numpy(),
                feat_len_input.name: np.array([feat.shape[2]], dtype=np.int32),
            },
        )
        token_ids = outputs[0].flatten().tolist()
        speech_token = torch.tensor([token_ids], dtype=torch.int32).to(self.device)
        speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device)
        return speech_token, speech_token_len

    def _extract_spk_embedding(self, speech):
        """Compute a speaker embedding for the prompt waveform.

        Args:
            speech: Waveform tensor passed to ``kaldi.fbank`` with
                ``sample_frequency=16000``.

        Returns:
            The flattened model output as a tensor with a leading batch
            dimension of 1, placed on ``self.device``.
        """
        feat = kaldi.fbank(speech,
                           num_mel_bins=80,
                           dither=0,
                           sample_frequency=self._PROMPT_SAMPLE_RATE)
        # Subtract the per-bin mean over frames before feeding the model.
        feat = feat - feat.mean(dim=0, keepdim=True)
        input_name = self.campplus_session.get_inputs()[0].name
        outputs = self.campplus_session.run(
            None, {input_name: feat.unsqueeze(dim=0).cpu().numpy()})
        embedding = torch.tensor([outputs[0].flatten().tolist()]).to(self.device)
        return embedding

    def _extract_speech_feat(self, speech):
        """Run the configured feature extractor on the prompt waveform.

        Args:
            speech: Waveform tensor handed to ``self.feat_extractor``.

        Returns:
            A ``(speech_feat, speech_feat_len)`` pair on ``self.device``.
            ``speech_feat`` is the extractor output with its leading
            dimension squeezed, its two remaining axes swapped, and a batch
            dimension of 1 added back; ``speech_feat_len`` is an int32
            tensor holding the size of its second axis.
        """
        speech_feat = self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
        speech_feat = speech_feat.unsqueeze(dim=0)
        speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(self.device)
        return speech_feat, speech_feat_len

    def frontend_zero_shot(self, prompt_text, prompt_wav_path):
        """Build the zero-shot prompt inputs from a reference recording.

        Args:
            prompt_text: Text associated with the prompt recording; it is
                passed through unchanged.
            prompt_wav_path: Path of the prompt wav file, loaded through
                ``load_wav`` with a target rate of 16000.

        Returns:
            A dict with the keys ``prompt_text``, ``prompt_speech_token``,
            ``prompt_speech_token_len``, ``prompt_speech_feat``,
            ``prompt_speech_feat_len`` and ``vp_emb``.

        Raises:
            ValueError: If the prompt is longer than 30 seconds.
        """
        prompt_speech_16k = load_wav(prompt_wav_path, self._PROMPT_SAMPLE_RATE)
        # Acoustic features are computed on a 22.05 kHz copy; tokens and the
        # speaker embedding use the 16 kHz waveform directly.
        resampler = torchaudio.transforms.Resample(
            orig_freq=self._PROMPT_SAMPLE_RATE, new_freq=self._FEAT_SAMPLE_RATE)
        prompt_speech_22050 = resampler(prompt_speech_16k)

        speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
        speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
        embedding = self._extract_spk_embedding(prompt_speech_16k)

        model_input = {
            'prompt_text': prompt_text,
            'prompt_speech_token': speech_token, 'prompt_speech_token_len': speech_token_len,
            'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
            'vp_emb': embedding}

        return model_input
90
+
91
+
92
if __name__ == "__main__":
    # Smoke test: build the front end from a model directory and run the
    # zero-shot pipeline once on a prompt recording.
    # Usage: python <this file> <model_dir> [prompt_wav]
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <model_dir> [prompt_wav]")
    model_dir = sys.argv[1]
    # The prompt path defaults to the previously hard-coded sample so that
    # existing invocations keep working.
    prompt_wav = sys.argv[2] if len(sys.argv) > 2 else 'db30_02.wav'

    hyper_yaml_path = f"{model_dir}/audio_detokenizer.yaml"
    with open(hyper_yaml_path, 'r') as f:
        configs = load_hyperpyyaml(f)
    frontend = TTSFrontEnd(
        configs["feat_extractor"],
        f"{model_dir}/campplus.onnx",
        f"{model_dir}/speech_tokenizer_v1.onnx")
    model_input = frontend.frontend_zero_shot("你叫什么名字", prompt_wav)
data/spks/prompt.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1611471cc6ebdda5a207802802ad12d2265b21e61e2ca43b98a2605cf981559c
3
+ size 39094
talker/audio_detokenizer.yaml CHANGED
@@ -85,3 +85,12 @@ hift: !new:.audio_detokenizer.hifigan.generator.HiFTGenerator
85
  in_channels: 80
86
  cond_channels: 512
87
 
 
 
 
 
 
 
 
 
 
 
85
  in_channels: 80
86
  cond_channels: 512
87
 
88
+ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
89
+ n_fft: 1024
90
+ num_mels: 80
91
+ sampling_rate: !ref <sample_rate>
92
+ hop_size: 256
93
+ win_size: 1024
94
+ fmin: 0
95
+ fmax: 8000
96
+ center: False
talker/campplus.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
3
+ size 28303423
talker/speech_tokenizer_v1.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23b5a723ed9143aebfd9ffda14ac4c21231f31c35ef837b6a13bb9e5488abb1e
3
+ size 522624269