Commit
·
a504d09
1
Parent(s):
544845b
update zero-shot TTS
Browse files- .gitattributes +5 -0
- README.md +35 -24
- audio_detokenizer/cli/frontend.py +101 -0
- data/spks/prompt.wav +3 -0
- talker/audio_detokenizer.yaml +9 -0
- talker/campplus.onnx +3 -0
- talker/speech_tokenizer_v1.onnx +3 -0
.gitattributes
CHANGED
@@ -41,5 +41,10 @@ data/openai_whisper-20240930-py3-none-any.whl filter=lfs diff=lfs merge=lfs -tex
|
|
41 |
data/matcha_tts-0.0.5.1-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
|
42 |
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
43 |
*.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
|
|
|
41 |
data/matcha_tts-0.0.5.1-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
|
42 |
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
43 |
*.png filter=lfs diff=lfs merge=lfs -text
|
44 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
45 |
+
data/spks/prompt.wav filter=lfs diff=lfs merge=lfs -text
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
|
50 |
|
README.md
CHANGED
@@ -1,11 +1,3 @@
|
|
1 |
-
---
|
2 |
-
base_model:
|
3 |
-
- inclusionAI/Ling-lite
|
4 |
-
license: mit
|
5 |
-
pipeline_tag: any-to-any
|
6 |
-
library_name: transformers
|
7 |
-
---
|
8 |
-
|
9 |
# Ming-Lite-Omni
|
10 |
|
11 |
<p align="center">
|
@@ -216,6 +208,7 @@ pip install nvidia-cublas-cu12==12.4.5.8 # for H20
|
|
216 |
Note: We tested the following examples on an NVIDIA H800-80GB GPU with CUDA 12.2. Loading inclusionAI/Ming-Lite-Omni in bfloat16 takes about 40890MB of memory.
|
217 |
|
218 |
|
|
|
219 |
```python
|
220 |
import os
|
221 |
import torch
|
@@ -275,31 +268,19 @@ messages = [
|
|
275 |
To enable thinking before the response, add the following system prompt before your question:
|
276 |
|
277 |
```python
|
278 |
-
cot_prompt = "SYSTEM: You are a helpful assistant. When the user asks a question, your response must include two parts: first, the reasoning process enclosed in <thinking>...</thinking> tags, then the final answer enclosed in <answer>...</answer> tags. The critical answer or key result should be placed within \\boxed{}
|
279 |
-
"
|
280 |
# And your input message should be like this:
|
281 |
messages = [
|
282 |
{
|
283 |
"role": "HUMAN",
|
284 |
"content": [
|
285 |
{"type": "image", "image": os.path.join(assets_path, "reasoning.png")},
|
286 |
-
{"type": "text", "text": cot_prompt + "In the rectangle $A B C D$ pictured, $M_{1}$ is the midpoint of $D C, M_{2}$ the midpoint of $A M_{1}, M_{3}$ the midpoint of $B M_{2}$ and $M_{4}$ the midpoint of $C M_{3}$. Determine the ratio of the area of the quadrilateral $M_{1} M_{2} M_{3} M_{4}$ to the area of the rectangle $A B C D
|
287 |
-
Choices:
|
288 |
-
(A) $\\frac{7}{16}$
|
289 |
-
(B) $\\frac{3}{16}$
|
290 |
-
(C) $\\frac{7}{32}$
|
291 |
-
(D) $\\frac{9}{32}$
|
292 |
-
(E) $\\frac{1}{5}$"},
|
293 |
],
|
294 |
},
|
295 |
]
|
296 |
# Output:
|
297 |
-
# \<think
|
298 |
-
Okay, so I have this problem about a rectangle ABCD ... (thinking process omitted) ... So, the correct answer is C.
|
299 |
-
\</think\>
|
300 |
-
\<answer\>\\boxed{C}\</answer\>
|
301 |
-
|
302 |
-
|
303 |
```
|
304 |
|
305 |
```python
|
@@ -467,6 +448,34 @@ audio_tokens = model.talker.omni_audio_generation(
|
|
467 |
waveform = audio_detokenizer.token2wav(audio_tokens, save_path='out.wav', **spk_input)
|
468 |
|
469 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
470 |
For detailed usage for ASR, SpeechQA, and TTS tasks, please refer to `test_audio_tasks.py`
|
471 |
|
472 |
### Image Generation & Edit
|
@@ -559,4 +568,6 @@ If you find our work helpful, feel free to give us a cite.
|
|
559 |
archivePrefix = {arXiv},
|
560 |
url = {https://arxiv.org/abs/2506.09344}
|
561 |
}
|
562 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# Ming-Lite-Omni
|
2 |
|
3 |
<p align="center">
|
|
|
208 |
Note: We tested the following examples on an NVIDIA H800-80GB GPU with CUDA 12.2. Loading inclusionAI/Ming-Lite-Omni in bfloat16 takes about 40890MB of memory.
|
209 |
|
210 |
|
211 |
+
|
212 |
```python
|
213 |
import os
|
214 |
import torch
|
|
|
268 |
To enable thinking before the response, add the following system prompt before your question:
|
269 |
|
270 |
```python
|
271 |
+
cot_prompt = "SYSTEM: You are a helpful assistant. When the user asks a question, your response must include two parts: first, the reasoning process enclosed in <thinking>...</thinking> tags, then the final answer enclosed in <answer>...</answer> tags. The critical answer or key result should be placed within \\boxed{}.\n"
|
|
|
272 |
# And your input message should be like this:
|
273 |
messages = [
|
274 |
{
|
275 |
"role": "HUMAN",
|
276 |
"content": [
|
277 |
{"type": "image", "image": os.path.join(assets_path, "reasoning.png")},
|
278 |
+
{"type": "text", "text": cot_prompt + "In the rectangle $A B C D$ pictured, $M_{1}$ is the midpoint of $D C, M_{2}$ the midpoint of $A M_{1}, M_{3}$ the midpoint of $B M_{2}$ and $M_{4}$ the midpoint of $C M_{3}$. Determine the ratio of the area of the quadrilateral $M_{1} M_{2} M_{3} M_{4}$ to the area of the rectangle $A B C D$.\nChoices:\n(A) $\frac{7}{16}$\n(B) $\frac{3}{16}$\n(C) $\frac{7}{32}$\n(D) $\frac{9}{32}$\n(E) $\frac{1}{5}$"},
|
|
|
|
|
|
|
|
|
|
|
|
|
279 |
],
|
280 |
},
|
281 |
]
|
282 |
# Output:
|
283 |
+
# \<think\>\nOkay, so I have this problem about a rectangle ABCD ... (thinking process omitted) ... So, the correct answer is C.\n\</think\>\n\<answer\>\\boxed{C}\</answer\>\n\n
|
|
|
|
|
|
|
|
|
|
|
284 |
```
|
285 |
|
286 |
```python
|
|
|
448 |
waveform = audio_detokenizer.token2wav(audio_tokens, save_path='out.wav', **spk_input)
|
449 |
|
450 |
```
|
451 |
+
|
452 |
+
```python
|
453 |
+
# zero-shot TTS
|
454 |
+
from modeling_bailing_talker import AudioDetokenizer
|
455 |
+
from audio_detokenizer.cli.frontend import TTSFrontEnd
|
456 |
+
from hyperpyyaml import load_hyperpyyaml
|
457 |
+
|
458 |
+
model_name_or_path = model.config._name_or_path
|
459 |
+
audio_detokenizer = AudioDetokenizer(
|
460 |
+
f'{model_name_or_path}/talker/audio_detokenizer.yaml',
|
461 |
+
flow_model_path=f'{model_name_or_path}/talker/flow.pt',
|
462 |
+
hifigan_model_path=f'{model_name_or_path}/talker/hift.pt'
|
463 |
+
)
|
464 |
+
|
465 |
+
with open(f'{model_name_or_path}/talker/audio_detokenizer.yaml', 'r') as f:
|
466 |
+
configs = load_hyperpyyaml(f)
|
467 |
+
audio_frontend = TTSFrontEnd(
|
468 |
+
configs["feat_extractor"],
|
469 |
+
f'{model_name_or_path}/talker/campplus.onnx',
|
470 |
+
f'{model_name_or_path}/talker/speech_tokenizer_v1.onnx',
|
471 |
+
)
|
472 |
+
|
473 |
+
tts_text = "这是一条测试语句。"
|
474 |
+
spk_input = audio_frontend.frontend_zero_shot(prompt_text="感谢你的认可。", prompt_wav_path="data/spks/prompt.wav")
|
475 |
+
audio_tokens = model.talker.omni_audio_generation(tts_text, **spk_input)
|
476 |
+
waveform = audio_detokenizer.token2wav(audio_tokens, save_path='out.wav', **spk_input)
|
477 |
+
```
|
478 |
+
|
479 |
For detailed usage for ASR, SpeechQA, and TTS tasks, please refer to `test_audio_tasks.py`
|
480 |
|
481 |
### Image Generation & Edit
|
|
|
568 |
archivePrefix = {arXiv},
|
569 |
url = {https://arxiv.org/abs/2506.09344}
|
570 |
}
|
571 |
+
```
|
572 |
+
|
573 |
+
|
audio_detokenizer/cli/frontend.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
from functools import partial
|
15 |
+
import onnxruntime
|
16 |
+
import torch
|
17 |
+
import numpy as np
|
18 |
+
import whisper
|
19 |
+
from typing import Callable
|
20 |
+
import torchaudio.compliance.kaldi as kaldi
|
21 |
+
import torchaudio
|
22 |
+
import ipdb
|
23 |
+
import sys
|
24 |
+
from hyperpyyaml import load_hyperpyyaml
|
25 |
+
from audio_detokenizer.utils.file_utils import load_wav
|
26 |
+
|
27 |
+
|
28 |
+
class TTSFrontEnd:
    """Build the prompt inputs for zero-shot TTS from a reference recording.

    Given a prompt transcript and a prompt wav file, ``frontend_zero_shot``
    returns a dict holding the prompt text, the prompt speech tokens, the
    prompt acoustic features and a speaker embedding, each moved to
    ``self.device``.
    """

    def __init__(self,
                 feat_extractor: Callable,
                 campplus_model: str,
                 speech_tokenizer_model: str
                 ):
        """Create the two ONNX inference sessions used by the front end.

        Args:
            feat_extractor: Callable applied to the resampled prompt waveform
                in ``_extract_speech_feat``.
            campplus_model: Path to the ONNX model used for the speaker
                embedding. It is always run with the CPU execution provider.
            speech_tokenizer_model: Path to the ONNX speech tokenizer. It is
                run with the CUDA provider when CUDA is available, otherwise
                on CPU.
        """
        self.feat_extractor = feat_extractor
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Both sessions share the same options: full graph optimisation and a
        # single intra-op thread.
        option = onnxruntime.SessionOptions()
        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        option.intra_op_num_threads = 1

        self.campplus_session = onnxruntime.InferenceSession(
            campplus_model,
            sess_options=option,
            providers=["CPUExecutionProvider"])
        self.speech_tokenizer_session = onnxruntime.InferenceSession(
            speech_tokenizer_model,
            sess_options=option,
            providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
                       "CPUExecutionProvider"])

    def _extract_speech_token(self, speech):
        """Tokenize a 16 kHz waveform with the ONNX speech tokenizer.

        Args:
            speech: Waveform tensor; ``speech.shape[1]`` is taken as the
                number of samples at 16 kHz (presumably shaped
                ``(1, num_samples)`` -- confirm against ``load_wav``).

        Returns:
            ``(speech_token, speech_token_len)``: an int32 tensor of shape
            ``(1, num_tokens)`` and an int32 tensor holding ``num_tokens``,
            both on ``self.device``.

        Raises:
            ValueError: If the audio is longer than 30 seconds.
        """
        # Explicit exception instead of ``assert`` so the length check is not
        # stripped when Python runs with -O.
        if speech.shape[1] / 16000 > 30:
            raise ValueError('do not support extract speech token for audio longer than 30s')

        feat = whisper.log_mel_spectrogram(speech, n_mels=128)
        tokenizer_inputs = self.speech_tokenizer_session.get_inputs()
        # First input is the mel feature, second is its length along dim 2.
        outputs = self.speech_tokenizer_session.run(
            None,
            {tokenizer_inputs[0].name: feat.detach().cpu().numpy(),
             tokenizer_inputs[1].name: np.array([feat.shape[2]], dtype=np.int32)})
        speech_token = outputs[0].flatten().tolist()

        speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device)
        speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device)
        return speech_token, speech_token_len

    def _extract_spk_embedding(self, speech):
        """Compute a speaker embedding for a 16 kHz waveform.

        Extracts 80-bin Kaldi fbank features without dither, subtracts their
        mean over dim 0, and runs the campplus ONNX session on the result.

        Returns:
            A tensor of shape ``(1, embedding_dim)`` on ``self.device``.
        """
        feat = kaldi.fbank(speech,
                           num_mel_bins=80,
                           dither=0,
                           sample_frequency=16000)
        # Mean normalisation over dim 0 (presumably the frame axis -- confirm).
        feat = feat - feat.mean(dim=0, keepdim=True)

        input_name = self.campplus_session.get_inputs()[0].name
        outputs = self.campplus_session.run(
            None,
            {input_name: feat.unsqueeze(dim=0).cpu().numpy()})
        embedding = outputs[0].flatten().tolist()

        embedding = torch.tensor([embedding]).to(self.device)
        return embedding

    def _extract_speech_feat(self, speech):
        """Run ``self.feat_extractor`` on a waveform and batch the result.

        The extractor output has its leading dim squeezed and its first two
        remaining dims swapped, then a batch dim of size 1 is prepended.

        Returns:
            ``(speech_feat, speech_feat_len)``: the feature tensor on
            ``self.device`` and an int32 tensor holding ``speech_feat.shape[1]``.
        """
        speech_feat = self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
        speech_feat = speech_feat.unsqueeze(dim=0)
        speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(self.device)
        return speech_feat, speech_feat_len

    def frontend_zero_shot(self, prompt_text, prompt_wav_path):
        """Assemble the zero-shot prompt inputs for one reference recording.

        Args:
            prompt_text: Transcript of the prompt recording; passed through
                unchanged.
            prompt_wav_path: Path of the prompt wav, loaded via
                ``load_wav(prompt_wav_path, 16000)``.

        Returns:
            Dict with keys ``prompt_text``, ``prompt_speech_token``,
            ``prompt_speech_token_len``, ``prompt_speech_feat``,
            ``prompt_speech_feat_len`` and ``vp_emb``.

        Raises:
            ValueError: If the prompt audio is longer than 30 seconds.
        """
        prompt_speech_16k = load_wav(prompt_wav_path, 16000)
        # Acoustic features come from a 22.05 kHz copy; tokens and the speaker
        # embedding come from the 16 kHz audio.
        prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k)

        speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
        speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
        embedding = self._extract_spk_embedding(prompt_speech_16k)

        model_input = {
            'prompt_text': prompt_text,
            'prompt_speech_token': speech_token, 'prompt_speech_token_len': speech_token_len,
            'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
            'vp_emb': embedding}

        return model_input
|
90 |
+
|
91 |
+
|
92 |
+
if __name__ == "__main__":
    # Manual smoke test: build the front end from a model directory and run
    # it on one prompt recording.
    #   usage: frontend.py MODEL_DIR [PROMPT_WAV]
    if len(sys.argv) < 2:
        # Exit with a usage hint rather than an IndexError on a missing argument.
        sys.exit(f"usage: {sys.argv[0]} MODEL_DIR [PROMPT_WAV]")
    model_dir = sys.argv[1]
    # The prompt wav used to be hard-coded; it remains the default.
    prompt_wav_path = sys.argv[2] if len(sys.argv) > 2 else 'db30_02.wav'

    hyper_yaml_path = f"{model_dir}/audio_detokenizer.yaml"
    with open(hyper_yaml_path, 'r') as f:
        configs = load_hyperpyyaml(f)
    frontend = TTSFrontEnd(
        configs["feat_extractor"],
        f"{model_dir}/campplus.onnx",
        f"{model_dir}/speech_tokenizer_v1.onnx")
    model_input = frontend.frontend_zero_shot("你叫什么名字", prompt_wav_path)
|
data/spks/prompt.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1611471cc6ebdda5a207802802ad12d2265b21e61e2ca43b98a2605cf981559c
|
3 |
+
size 39094
|
talker/audio_detokenizer.yaml
CHANGED
@@ -85,3 +85,12 @@ hift: !new:.audio_detokenizer.hifigan.generator.HiFTGenerator
|
|
85 |
in_channels: 80
|
86 |
cond_channels: 512
|
87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
in_channels: 80
|
86 |
cond_channels: 512
|
87 |
|
88 |
+
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
|
89 |
+
n_fft: 1024
|
90 |
+
num_mels: 80
|
91 |
+
sampling_rate: !ref <sample_rate>
|
92 |
+
hop_size: 256
|
93 |
+
win_size: 1024
|
94 |
+
fmin: 0
|
95 |
+
fmax: 8000
|
96 |
+
center: False
|
talker/campplus.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
|
3 |
+
size 28303423
|
talker/speech_tokenizer_v1.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:23b5a723ed9143aebfd9ffda14ac4c21231f31c35ef837b6a13bb9e5488abb1e
|
3 |
+
size 522624269
|