update zero-shot TTS

Browse files

Files changed (7) hide show

.gitattributes +5 -0
README.md +35 -24
audio_detokenizer/cli/frontend.py +101 -0
data/spks/prompt.wav +3 -0
talker/audio_detokenizer.yaml +9 -0
talker/campplus.onnx +3 -0
talker/speech_tokenizer_v1.onnx +3 -0

.gitattributes CHANGED Viewed

@@ -41,5 +41,10 @@ data/openai_whisper-20240930-py3-none-any.whl filter=lfs diff=lfs merge=lfs -tex
 data/matcha_tts-0.0.5.1-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
 *.mp4 filter=lfs diff=lfs merge=lfs -text
 *.png filter=lfs diff=lfs merge=lfs -text

 data/matcha_tts-0.0.5.1-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
 *.mp4 filter=lfs diff=lfs merge=lfs -text
 *.png filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+data/spks/prompt.wav filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,11 +1,3 @@
----
-base_model:
-- inclusionAI/Ling-lite
-license: mit
-pipeline_tag: any-to-any
-library_name: transformers
----
 # Ming-Lite-Omni
 <p align="center">
@@ -216,6 +208,7 @@ pip install nvidia-cublas-cu12==12.4.5.8  # for H20
 Note: We test following examples on hardware of NVIDIA H800-80GB with CUDA 12.2. Loading inclusionAI/Ming-Lite-Omni in bfloat16 takes about 40890MB memory.
 ```python
 import os
 import torch
@@ -275,31 +268,19 @@ messages = [
 To enable thinking before response, adding the following system prompt before your question:
 ```python
-cot_prompt = "SYSTEM: You are a helpful assistant. When the user asks a question, your response must include two parts: first, the reasoning process enclosed in <thinking>...</thinking> tags, then the final answer enclosed in <answer>...</answer> tags. The critical answer or key result should be placed within \\boxed{}.
-"
 # And your input message should be like this:
 messages = [
     {
         "role": "HUMAN",
         "content": [
             {"type": "image", "image": os.path.join(assets_path, "reasoning.png")},
-            {"type": "text", "text": cot_prompt + "In the rectangle $A B C D$ pictured, $M_{1}$ is the midpoint of $D C, M_{2}$ the midpoint of $A M_{1}, M_{3}$ the midpoint of $B M_{2}$ and $M_{4}$ the midpoint of $C M_{3}$. Determine the ratio of the area of the quadrilateral $M_{1} M_{2} M_{3} M_{4}$ to the area of the rectangle $A B C D$.
-Choices:
-(A) $\\frac{7}{16}$
-(B) $\\frac{3}{16}$
-(C) $\\frac{7}{32}$
-(D) $\\frac{9}{32}$
-(E) $\\frac{1}{5}$"},
         ],
     },
 ]
 # Output:
-# \<think\>
-Okay, so I have this problem about a rectangle ABCD ... (thinking process omitted) ... So, the correct answer is C.
-\</think\>
-\<answer\>\\boxed{C}\</answer\>
 ```
 ```python
@@ -467,6 +448,34 @@ audio_tokens = model.talker.omni_audio_generation(
 waveform = audio_detokenizer.token2wav(audio_tokens, save_path='out.wav', **spk_input)
 ```
 For detailed usage for ASR, SpeechQA, and TTS tasks, please refer to `test_audio_tasks.py`
 ### Image Generation & Edit
@@ -559,4 +568,6 @@ If you find our work helpful, feel free to give us a cite.
       archivePrefix = {arXiv},
       url = {https://arxiv.org/abs/2506.09344}
 }
-```

 # Ming-Lite-Omni
 <p align="center">
 Note: We test following examples on hardware of NVIDIA H800-80GB with CUDA 12.2. Loading inclusionAI/Ming-Lite-Omni in bfloat16 takes about 40890MB memory.
 ```python
 import os
 import torch
 To enable thinking before response, adding the following system prompt before your question:
 ```python
+cot_prompt = "SYSTEM: You are a helpful assistant. When the user asks a question, your response must include two parts: first, the reasoning process enclosed in <thinking>...</thinking> tags, then the final answer enclosed in <answer>...</answer> tags. The critical answer or key result should be placed within \\boxed{}.\n"
 # And your input message should be like this:
 messages = [
     {
         "role": "HUMAN",
         "content": [
             {"type": "image", "image": os.path.join(assets_path, "reasoning.png")},
+            {"type": "text", "text": cot_prompt + "In the rectangle $A B C D$ pictured, $M_{1}$ is the midpoint of $D C, M_{2}$ the midpoint of $A M_{1}, M_{3}$ the midpoint of $B M_{2}$ and $M_{4}$ the midpoint of $C M_{3}$. Determine the ratio of the area of the quadrilateral $M_{1} M_{2} M_{3} M_{4}$ to the area of the rectangle $A B C D$.\nChoices:\n(A) $\frac{7}{16}$\n(B) $\frac{3}{16}$\n(C) $\frac{7}{32}$\n(D) $\frac{9}{32}$\n(E) $\frac{1}{5}$"},
         ],
     },
 ]
 # Output:
+# \<think\>\nOkay, so I have this problem about a rectangle ABCD ... (thinking process omitted) ... So, the correct answer is C.\n\</think\>\n\<answer\>\\boxed{C}\</answer\>\n\n
 ```
 ```python
 waveform = audio_detokenizer.token2wav(audio_tokens, save_path='out.wav', **spk_input)
 ```
+```python
+# zero-shot TTS
+# NOTE: `model` is expected to be the already-loaded model object from the
+# earlier examples in this README; it is not created in this snippet.
+from modeling_bailing_talker import AudioDetokenizer
+from audio_detokenizer.cli.frontend import TTSFrontEnd
+from hyperpyyaml import load_hyperpyyaml
+model_name_or_path = model.config._name_or_path
+# Build the detokenizer from the YAML config plus the flow and HiFT checkpoints.
+audio_detokenizer = AudioDetokenizer(
+    f'{model_name_or_path}/talker/audio_detokenizer.yaml',
+    flow_model_path=f'{model_name_or_path}/talker/flow.pt',
+    hifigan_model_path=f'{model_name_or_path}/talker/hift.pt'
+)
+# Load the same YAML again to obtain the `feat_extractor` entry for the front end.
+with open(f'{model_name_or_path}/talker/audio_detokenizer.yaml', 'r') as f:
+    configs = load_hyperpyyaml(f)
+audio_frontend = TTSFrontEnd(
+    configs["feat_extractor"],
+    f'{model_name_or_path}/talker/campplus.onnx',
+    f'{model_name_or_path}/talker/speech_tokenizer_v1.onnx',
+)
+tts_text = "这是一条测试语句。"
+# presumably `prompt_text` is the transcript of the prompt wav — verify against your own prompt audio
+spk_input = audio_frontend.frontend_zero_shot(prompt_text="感谢你的认可。", prompt_wav_path="data/spks/prompt.wav")
+# The same prompt inputs are passed to both token generation and waveform synthesis.
+audio_tokens = model.talker.omni_audio_generation(tts_text, **spk_input)
+waveform = audio_detokenizer.token2wav(audio_tokens, save_path='out.wav', **spk_input)
+```
 For detailed usage for ASR, SpeechQA, and TTS tasks, please refer to `test_audio_tasks.py`
 ### Image Generation & Edit
       archivePrefix = {arXiv},
       url = {https://arxiv.org/abs/2506.09344}
 }
+```

audio_detokenizer/cli/frontend.py ADDED Viewed

	@@ -0,0 +1,101 @@

+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import partial
+import onnxruntime
+import torch
+import numpy as np
+import whisper
+from typing import Callable
+import torchaudio.compliance.kaldi as kaldi
+import torchaudio
+import ipdb
+import sys
+from hyperpyyaml import load_hyperpyyaml
+from audio_detokenizer.utils.file_utils import load_wav
+class TTSFrontEnd:
+    """Build the prompt inputs needed for zero-shot TTS from a reference recording.
+
+    Holds a mel-feature callable and two ONNX Runtime sessions (speaker
+    embedding and speech tokenizer). Tensors returned by the helper methods
+    are placed on CUDA when ``torch.cuda.is_available()``, otherwise on CPU.
+    """
+
+    def __init__(self,
+                 feat_extractor: Callable,
+                 campplus_model: str,
+                 speech_tokenizer_model: str
+    ):
+        """Create the ONNX sessions and store the feature extractor.
+
+        Args:
+            feat_extractor: Callable applied to the 22.05 kHz prompt waveform
+                in ``_extract_speech_feat``.
+            campplus_model: ONNX model handed to the speaker-embedding session.
+            speech_tokenizer_model: ONNX model handed to the speech-tokenizer
+                session.
+        """
+        self.feat_extractor = feat_extractor
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        # One set of session options is shared by both sessions.
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        option.intra_op_num_threads = 1
+        # The speaker-embedding session is always run on the CPU provider.
+        self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
+        # The tokenizer session uses the CUDA provider when torch reports a GPU.
+        self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
+                                                                     providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
+                                                                                "CPUExecutionProvider"])
+
+    def _extract_speech_token(self, speech):
+        """Tokenize a 16 kHz waveform with the ONNX speech tokenizer.
+
+        Args:
+            speech: Waveform tensor whose dimension 1 is the sample axis at
+                16 kHz (assumed shape ``(1, num_samples)`` — TODO confirm
+                against ``load_wav``).
+
+        Returns:
+            ``(speech_token, speech_token_len)``: an int32 tensor of shape
+            ``(1, T)`` and an int32 tensor of shape ``(1,)`` holding ``T``,
+            both on ``self.device``.
+
+        Raises:
+            ValueError: If the audio is longer than 30 seconds.
+        """
+        # Raise explicitly instead of using `assert`, which is removed when
+        # Python runs with -O and would let over-long audio through.
+        if speech.shape[1] / 16000 > 30:
+            raise ValueError('do not support extract speech token for audio longer than 30s')
+        feat = whisper.log_mel_spectrogram(speech, n_mels=128)
+        # The session takes the mel features and their frame count (dimension 2).
+        speech_token = self.speech_tokenizer_session.run(None,
+                                                         {self.speech_tokenizer_session.get_inputs()[0].name:
+                                                          feat.detach().cpu().numpy(),
+                                                          self.speech_tokenizer_session.get_inputs()[1].name:
+                                                          np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
+        speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device)
+        speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device)
+        return speech_token, speech_token_len
+
+    def _extract_spk_embedding(self, speech):
+        """Compute a speaker embedding from a 16 kHz waveform.
+
+        Extracts 80-bin Kaldi fbank features, subtracts their mean over
+        dimension 0, and runs the result through the campplus ONNX session.
+
+        Returns:
+            A tensor of shape ``(1, D)`` on ``self.device`` holding the
+            flattened session output.
+        """
+        feat = kaldi.fbank(speech,
+                           num_mel_bins=80,
+                           dither=0,
+                           sample_frequency=16000)
+        feat = feat - feat.mean(dim=0, keepdim=True)
+        embedding = self.campplus_session.run(None,
+                                              {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
+        embedding = torch.tensor([embedding]).to(self.device)
+        return embedding
+
+    def _extract_speech_feat(self, speech):
+        """Run ``self.feat_extractor`` on a waveform and add a batch dimension.
+
+        The extractor output has dimension 0 squeezed and its remaining two
+        dimensions swapped before a leading dimension is added again.
+
+        Returns:
+            ``(speech_feat, speech_feat_len)``: the feature tensor and an
+            int32 tensor of shape ``(1,)`` holding its size along dimension 1,
+            both on ``self.device``.
+        """
+        speech_feat = self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
+        speech_feat = speech_feat.unsqueeze(dim=0)
+        speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(self.device)
+        return speech_feat, speech_feat_len
+
+    def frontend_zero_shot(self, prompt_text, prompt_wav_path):
+        """Assemble the zero-shot prompt inputs for one reference recording.
+
+        Args:
+            prompt_text: Text stored unchanged under ``'prompt_text'``.
+            prompt_wav_path: Path of the prompt audio, loaded at 16 kHz.
+
+        Returns:
+            A dict with keys ``prompt_text``, ``prompt_speech_token``,
+            ``prompt_speech_token_len``, ``prompt_speech_feat``,
+            ``prompt_speech_feat_len`` and ``vp_emb``.
+
+        Raises:
+            ValueError: If the prompt audio is longer than 30 seconds.
+        """
+        prompt_speech_16k = load_wav(prompt_wav_path, 16000)
+        # Mel features come from a 22.05 kHz copy; tokens and the speaker
+        # embedding come from the 16 kHz audio.
+        prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k)
+        speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
+        speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
+        embedding = self._extract_spk_embedding(prompt_speech_16k)
+        model_input = {
+                       'prompt_text': prompt_text,
+                       'prompt_speech_token': speech_token, 'prompt_speech_token_len': speech_token_len,
+                       'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
+                       'vp_emb': embedding}
+        return model_input
+if __name__ == "__main__":
+    # Manual smoke test: build the front end from a model directory and run it
+    # once on a prompt recording.
+    # Usage: python frontend.py <model_dir> [prompt_wav]
+    if len(sys.argv) < 2:
+        # Exit with a usage message instead of an IndexError on sys.argv[1].
+        sys.exit(f"usage: {sys.argv[0]} <model_dir> [prompt_wav]")
+    model_dir = sys.argv[1]
+    # An optional second argument overrides the previously hard-coded sample file.
+    prompt_wav = sys.argv[2] if len(sys.argv) > 2 else 'db30_02.wav'
+    hyper_yaml_path = f"{model_dir}/audio_detokenizer.yaml"
+    with open(hyper_yaml_path, 'r') as f:
+        configs = load_hyperpyyaml(f)
+    frontend = TTSFrontEnd(
+        configs["feat_extractor"],
+        f'{model_dir}/campplus.onnx',
+        f'{model_dir}/speech_tokenizer_v1.onnx',
+    )
+    model_input = frontend.frontend_zero_shot("你叫什么名字", prompt_wav)

data/spks/prompt.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1611471cc6ebdda5a207802802ad12d2265b21e61e2ca43b98a2605cf981559c
+size 39094

talker/audio_detokenizer.yaml CHANGED Viewed

@@ -85,3 +85,12 @@ hift: !new:.audio_detokenizer.hifigan.generator.HiFTGenerator
         in_channels: 80
         cond_channels: 512

         in_channels: 80
         cond_channels: 512
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 80
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 8000
+    center: False

talker/campplus.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
+size 28303423

talker/speech_tokenizer_v1.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:23b5a723ed9143aebfd9ffda14ac4c21231f31c35ef837b6a13bb9e5488abb1e
+size 522624269