ancv committed on
Commit
273b7cf
·
verified ·
1 Parent(s): 3c9ffb9

Delete _utils.py

Browse files
Files changed (1) hide show
  1. _utils.py +0 -151
_utils.py DELETED
@@ -1,151 +0,0 @@
1
- # Copyright (c) 2025 SparkAudio & The HuggingFace Inc. team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- """ Utility functions for SparkTTS """
15
-
16
- import random
17
- import soxr
18
- import soundfile
19
- import torch
20
- import torchaudio
21
- import numpy as np
22
-
23
- from pathlib import Path
24
- from typing import Tuple, Dict, Any
25
- from numpy.lib.stride_tricks import sliding_window_view
26
- from omegaconf import OmegaConf # Keep if BiCodec config loading needs it
27
-
28
-
29
- # --- Token Maps (from sparktts/utils/token_parser.py) ---
30
# Task name -> special task token string.
TASK_TOKEN_MAP = {
    "vc": "<|task_vc|>",
    "tts": "<|task_tts|>",
    "asr": "<|task_asr|>",
    "s2s": "<|task_s2s|>",
    "t2s": "<|task_t2s|>",
    "understand": "<|task_understand|>",
    "caption": "<|task_cap|>",
    "controllable_tts": "<|task_controllable_tts|>",
    "prompt_tts": "<|task_prompt_tts|>",
    "speech_edit": "<|task_edit|>",
}

# Level names ordered from lowest to highest; both level maps below are
# derived from this single sequence so they cannot drift apart.
_LEVEL_NAMES = ("very_low", "low", "moderate", "high", "very_high")

# Level name -> zero-based index (0 .. 4).
LEVELS_MAP = {name: index for index, name in enumerate(_LEVEL_NAMES)}

# One-based integer (1 .. 5) -> level name.
# NOTE(review): presumably the values exposed by a UI control -- confirm.
LEVELS_MAP_UI = dict(enumerate(_LEVEL_NAMES, start=1))

# Gender label -> integer id.
GENDER_MAP = {"female": 0, "male": 1}
63
-
64
- # --- Audio Utils (from sparktts/utils/audio.py) ---
65
def audio_volume_normalize(audio: np.ndarray, coeff: float = 0.2) -> np.ndarray:
    """Scale ``audio`` so the level of its loud samples approaches ``coeff``.

    Steps, in order:

    1. If the peak magnitude is below 0.1, boost the signal so the peak
       becomes 0.1 (the divisor is floored at 1e-3).
    2. Take the magnitudes greater than 0.01 and average those between the
       90th and 99th percentile positions of that sorted set.
    3. Multiply the signal by ``coeff / average``, with the gain clipped to
       the range [0.1, 10].
    4. If the result exceeds 1 in magnitude, divide by the peak so it fits
       in [-1, 1].

    Args:
        audio: Sample array. Any number of dimensions is accepted; the
            magnitude statistics are computed over all elements together.
        coeff: Target level for the loud-sample average.

    Returns:
        The rescaled array, same shape as ``audio``. The input is returned
        unchanged when it is empty; when 10 or fewer samples exceed 0.01 in
        magnitude only step 1 is applied.
    """
    # axis=None sorts the flattened magnitudes, so multi-dimensional input
    # yields scalar statistics instead of per-row arrays (which would make
    # the comparisons below ambiguous).
    magnitudes = np.sort(np.abs(audio), axis=None)
    if magnitudes.size == 0:
        return audio

    peak = magnitudes[-1]
    if peak < 0.1:
        audio = audio / max(peak, 1e-3) * 0.1

    # The statistics below deliberately keep using the magnitudes measured
    # BEFORE the boost above; recomputing them would change the output.
    loud = magnitudes[magnitudes > 0.01]
    count = loud.shape[0]
    if count <= 10:
        return audio

    # count >= 11 guarantees a non-empty slice of values > 0.01, so the
    # mean is strictly positive and safe to divide by.
    volume = np.mean(loud[int(0.9 * count) : int(0.99 * count)])
    audio = audio * np.clip(coeff / volume, 0.1, 10)

    max_value = np.max(np.abs(audio))
    if max_value > 1:
        audio = audio / max_value
    return audio
84
-
85
def load_audio(
    adfile: Path,
    sampling_rate: int = None,
    length: int = None,
    volume_normalize: bool = False,
    segment_duration: int = None,
) -> np.ndarray:
    """Read an audio file and optionally resample, crop, normalize and pad it.

    Processing order: read -> keep first column of multi-dimensional data ->
    resample -> random segment selection -> volume normalization ->
    trim/pad to ``length``.

    Args:
        adfile: Path of the audio file, passed to ``soundfile.read``.
        sampling_rate: Target sample rate. When given and different from the
            file's rate, the audio is resampled with ``soxr``. ``None``
            keeps the file's rate.
        length: Exact number of samples to return. Longer audio is cut at
            the end, shorter audio is zero-padded at the end. ``None``
            leaves the length untouched.
        volume_normalize: When true, apply ``audio_volume_normalize``.
        segment_duration: Duration in seconds of a segment to select via
            ``random_select_audio_segment``. ``None`` disables selection.

    Returns:
        The processed samples as a one-dimensional array.

    Raises:
        IOError: The file could not be read (original error is chained).
        ValueError: The file contained no samples.
        RuntimeError: Resampling failed (original error is chained).
    """
    try:
        audio, sr = soundfile.read(adfile, dtype='float32')  # Ensure float32
    except Exception as e:
        # Chain the cause so the underlying reader error stays in the traceback.
        raise IOError(f"Could not read audio file {adfile}: {e}") from e

    if audio is None or len(audio) == 0:
        raise ValueError(f"Audio file {adfile} is empty or invalid.")

    # Multi-dimensional data: keep only column 0.
    # NOTE(review): presumably axis 1 is the channel axis -- confirm against
    # the soundfile documentation.
    if len(audio.shape) > 1:
        audio = audio[:, 0]

    if sampling_rate is not None and sr != sampling_rate:
        try:
            # Resample in float64, then convert back to float32.
            audio = audio.astype(np.float64)
            audio = soxr.resample(audio, sr, sampling_rate, quality="VHQ")
            audio = audio.astype(np.float32)
            sr = sampling_rate
        except Exception as e:
            raise RuntimeError(
                f"Failed to resample audio from {sr}Hz to {sampling_rate}Hz: {e}"
            ) from e

    if segment_duration is not None:
        # Uses the post-resampling rate to convert seconds to samples.
        seg_length = int(sr * segment_duration)
        audio = random_select_audio_segment(audio, seg_length)

    if volume_normalize:
        audio = audio_volume_normalize(audio)

    if length is not None:
        if audio.shape[0] > length:
            audio = audio[:length]
        else:
            audio = np.pad(audio, (0, int(length - audio.shape[0])), mode='constant')
    return audio
127
-
128
def random_select_audio_segment(audio: np.ndarray, length: int) -> np.ndarray:
    """Return a ``length``-sample window of ``audio``.

    Audio shorter than ``length`` is zero-padded at the end and returned
    from the start; audio of exactly ``length`` samples is returned as is;
    longer audio yields a window whose start offset is drawn uniformly at
    random with ``random.randint``.
    """
    total = audio.shape[0]
    shortfall = length - total

    if shortfall > 0:
        audio = np.pad(audio, (0, int(shortfall)), mode='constant')

    # Only audio that is strictly longer than requested gets a random
    # offset; padded or exact-length audio always starts at 0.
    offset = random.randint(0, total - length) if shortfall < 0 else 0
    return audio[offset:int(offset + length)]
139
-
140
- # --- File Utils (Minimal required) ---
141
def load_config_yaml(config_path: Path) -> Dict:
    """Load a YAML configuration file with OmegaConf as plain Python data.

    Args:
        config_path: Location of the YAML file.

    Returns:
        The configuration converted with ``OmegaConf.to_container`` using
        ``resolve=True``.

    Raises:
        FileNotFoundError: ``config_path`` is not an existing file.
        IOError: Loading or converting the file failed; the original
            exception is chained as the cause.
    """
    if not Path(config_path).is_file():
        raise FileNotFoundError(f"YAML Config file not found: {config_path}")
    try:
        config = OmegaConf.load(config_path)
        # Convert the OmegaConf object into standard Python containers.
        return OmegaConf.to_container(config, resolve=True)
    except Exception as e:
        # Chain the cause so the parser's own traceback is preserved.
        raise IOError(f"Error loading YAML config file {config_path}: {e}") from e