gosummer committed (verified)
Commit f3f17e2 · Parent(s): e625816

Upload 7 files

Files changed (7)
  1. 5_HP-Karaoke-UVR.py +132 -0
  2. deecho.py +201 -0
  3. infer.py +214 -0
  4. requirements.txt +20 -0
  5. uvr.py +132 -0
  6. 仅去和声混响.sh +58 -0
  7. 全流程一键版.sh +82 -0
5_HP-Karaoke-UVR.py ADDED
@@ -0,0 +1,132 @@
+ import os
+ import sys
+ import torch
+ import warnings
+ import hashlib
+ import math
+ import importlib
+ import numpy as np
+ from tqdm import tqdm
+ from scipy.io import wavfile
+ import librosa
+ import pdb
+ from uvr5_pack.lib_v5 import spec_utils
+ from uvr5_pack.utils import _get_name_params, inference
+ from uvr5_pack.lib_v5.model_param_init import ModelParameters
+
+
+ warnings.filterwarnings("ignore")
+
+
+ class _audio_pre_():
+     def __init__(self, model_path, device, is_half):
+         self.model_path = model_path
+         self.device = device
+         self.data = {
+             # Processing Options
+             'postprocess': False,
+             'tta': False,
+             # Constants
+             'window_size': 320,
+             'agg': 10,
+             'high_end_process': 'mirroring',
+         }
+         nn_arch_sizes = [
+             31191,  # default
+             33966, 61968, 123821, 123812, 537238  # custom
+         ]
+         self.nn_architecture = list('{}KB'.format(s) for s in nn_arch_sizes)
+         model_size = math.ceil(os.stat(model_path).st_size / 1024)
+         nn_architecture = '{}KB'.format(min(nn_arch_sizes, key=lambda x: abs(x - model_size)))
+         nets = importlib.import_module('uvr5_pack.lib_v5.nets' + f'_{nn_architecture}'.replace('_{}KB'.format(nn_arch_sizes[0]), ''), package=None)
+         model_hash = hashlib.md5(open(model_path, 'rb').read()).hexdigest()
+         param_name, model_params_d = _get_name_params(model_path, model_hash)
+
+         mp = ModelParameters(model_params_d)
+         model = nets.CascadedASPPNet(mp.param['bins'] * 2)
+         cpk = torch.load(model_path, map_location='cpu')
+         model.load_state_dict(cpk)
+         model.eval()
+         if is_half:
+             model = model.half().to(device)
+         else:
+             model = model.to(device)
+
+         self.mp = mp
+         self.model = model
+
+     def _path_audio_(self, music_file, ins_root=None, vocal_root=None):
+         if ins_root is None and vocal_root is None:
+             return "No save root."
+         name = os.path.basename(music_file)
+         if ins_root is not None:
+             os.makedirs(ins_root, exist_ok=True)
+         if vocal_root is not None:
+             os.makedirs(vocal_root, exist_ok=True)
+         X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
+         bands_n = len(self.mp.param['band'])
+         for d in range(bands_n, 0, -1):
+             bp = self.mp.param['band'][d]
+             if d == bands_n:  # load the source once for the highest band
+                 X_wave[d], _ = librosa.core.load(
+                     music_file, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
+                 if X_wave[d].ndim == 1:
+                     X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
+             else:  # resample the previous band down for the lower bands
+                 X_wave[d] = librosa.core.resample(X_wave[d + 1], self.mp.param['band'][d + 1]['sr'], bp['sr'], res_type=bp['res_type'])
+
+             X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(X_wave[d], bp['hl'], bp['n_fft'], self.mp.param['mid_side'], self.mp.param['mid_side_b2'], self.mp.param['reverse'])
+             if d == bands_n and self.data['high_end_process'] != 'none':
+                 input_high_end_h = (bp['n_fft'] // 2 - bp['crop_stop']) + (self.mp.param['pre_filter_stop'] - self.mp.param['pre_filter_start'])
+                 input_high_end = X_spec_s[d][:, bp['n_fft'] // 2 - input_high_end_h:bp['n_fft'] // 2, :]
+
+         X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
+         aggresive_set = float(self.data['agg'] / 100)
+         aggressiveness = {'value': aggresive_set, 'split_bin': self.mp.param['band'][1]['crop_stop']}
+         with torch.no_grad():
+             pred, X_mag, X_phase = inference(X_spec_m, self.device, self.model, aggressiveness, self.data)
+
+         if self.data['postprocess']:
+             pred_inv = np.clip(X_mag - pred, 0, np.inf)
+             pred = spec_utils.mask_silence(pred, pred_inv)
+
+         y_spec_m = pred * X_phase
+         v_spec_m = X_spec_m - y_spec_m
+
+         if ins_root is not None:
+             if self.data['high_end_process'].startswith('mirroring'):
+                 input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], y_spec_m, input_high_end, self.mp)
+                 wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp, input_high_end_h, input_high_end_)
+             else:
+                 wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
+             print('%s instruments done' % name)
+             # Split the file name and extension
+             file_name, ext = os.path.splitext(name)
+             wavfile.write(os.path.join(ins_root, '和声_{}{}'.format(file_name, ext)), self.mp.param['sr'], (np.array(wav_instrument) * 32768).astype(np.int16))
+         if vocal_root is not None:
+             if self.data['high_end_process'].startswith('mirroring'):
+                 input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], v_spec_m, input_high_end, self.mp)
+                 wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_)
+             else:
+                 wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
+             print('%s vocals done' % name)
+             # Split the file name and extension
+             file_name, ext = os.path.splitext(name)
+             wavfile.write(os.path.join(vocal_root, '{}{}'.format(file_name, ext)), self.mp.param['sr'], (np.array(wav_vocals) * 32768).astype(np.int16))
+
+
+ if __name__ == '__main__':
+     device = 'cuda'
+     is_half = True
+     model_path = 'uvr5_weights/5_HP-Karaoke-UVR.pth'
+     pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=is_half)
+
+     # Collect every .mp3 file in the input folder
+     audio_folder = '/mnt/workspace/input/'
+     wav_files = [os.path.join(audio_folder, file) for file in os.listdir(audio_folder) if file.endswith('.mp3')]
+
+     # Process each audio file
+     save_path = 'echo'
+     for wav_file in wav_files:
+         pre_fun._path_audio_(wav_file, save_path, save_path)
deecho.py ADDED
@@ -0,0 +1,201 @@
+ import os, sys, torch, warnings, pdb
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+ from json import load as ll
+ warnings.filterwarnings("ignore")
+ import librosa
+ import importlib
+ import numpy as np
+ import hashlib, math
+ from tqdm import tqdm
+ from uvr5_pack.lib_v5 import spec_utils
+ from uvr5_pack.utils import _get_name_params, inference
+ from uvr5_pack.lib_v5.model_param_init import ModelParameters
+ import soundfile as sf
+ from uvr5_pack.lib_v5.nets_new import CascadedNet
+ from uvr5_pack.lib_v5 import nets_61968KB as nets
+ import argparse
+
+
+ class AudioSeparator:
+     def __init__(self, agg, model_path, device, is_half, model_params):
+         self.model_path = model_path
+         self.device = device
+         self.data = {
+             # Processing Options
+             "postprocess": False,
+             "tta": False,
+             # Constants
+             "window_size": 320,
+             "agg": agg,
+             "high_end_process": "mirroring",
+         }
+         if model_params == "4band_v3":
+             mp = ModelParameters("uvr5_pack/lib_v5/modelparams/4band_v3.json")
+             nout = 64 if "DeReverb" in model_path else 48
+             model = CascadedNet(mp.param["bins"] * 2, nout)
+         if model_params == "4band_v2":
+             mp = ModelParameters("uvr5_pack/lib_v5/modelparams/4band_v2.json")
+             model = nets.CascadedASPPNet(mp.param["bins"] * 2)
+         cpk = torch.load(model_path, map_location="cpu")
+         model.load_state_dict(cpk)
+         model.eval()
+         if is_half:
+             model = model.half().to(device)
+         else:
+             model = model.to(device)
+
+         self.mp = mp
+         self.model = model
+
+     def separate(self, music_file, vocal_root=None, ins_root=None, model_params=None, format="flac"):
+         if ins_root is None and vocal_root is None:
+             return "No save root."
+         if os.path.isfile(music_file):
+             music_files = [music_file]
+         elif os.path.isdir(music_file):
+             music_files = [os.path.join(music_file, f) for f in os.listdir(music_file) if f.endswith(".wav") or f.endswith(".mp3")]
+         else:
+             return "Invalid path."
+         for music_file in music_files:
+             name = os.path.basename(music_file)
+             if ins_root is not None:
+                 os.makedirs(ins_root, exist_ok=True)
+             if vocal_root is not None:
+                 os.makedirs(vocal_root, exist_ok=True)
+             X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
+             bands_n = len(self.mp.param["band"])
+
+             for d in range(bands_n, 0, -1):
+                 bp = self.mp.param["band"][d]
+                 if d == bands_n:  # high-end band
+                     (
+                         X_wave[d],
+                         _,
+                     ) = librosa.core.load(
+                         music_file,
+                         bp["sr"],
+                         False,
+                         dtype=np.float32,
+                         res_type=bp["res_type"],
+                     )
+                     if X_wave[d].ndim == 1:
+                         X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
+                 else:  # lower bands
+                     X_wave[d] = librosa.core.resample(
+                         X_wave[d + 1],
+                         self.mp.param["band"][d + 1]["sr"],
+                         bp["sr"],
+                         res_type=bp["res_type"],
+                     )
+                 # STFT of the wave source
+                 X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
+                     X_wave[d],
+                     bp["hl"],
+                     bp["n_fft"],
+                     self.mp.param["mid_side"],
+                     self.mp.param["mid_side_b2"],
+                     self.mp.param["reverse"],
+                 )
+                 if d == bands_n and self.data["high_end_process"] != "none":
+                     input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
+                         self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
+                     )
+                     input_high_end = X_spec_s[d][
+                         :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
+                     ]
+
+             X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
+             aggresive_set = float(self.data["agg"] / 100)
+             aggressiveness = {
+                 "value": aggresive_set,
+                 "split_bin": self.mp.param["band"][1]["crop_stop"],
+             }
+             with torch.no_grad():
+                 pred, X_mag, X_phase = inference(
+                     X_spec_m, self.device, self.model, aggressiveness, self.data
+                 )
+
+             # Postprocess
+             if self.data["postprocess"]:
+                 pred_inv = np.clip(X_mag - pred, 0, np.inf)
+                 pred = spec_utils.mask_silence(pred, pred_inv)
+             y_spec_m = pred * X_phase
+             v_spec_m = X_spec_m - y_spec_m
+
+             if ins_root is not None:
+                 if self.data["high_end_process"].startswith("mirroring"):
+                     input_high_end_ = spec_utils.mirroring(
+                         self.data["high_end_process"], y_spec_m, input_high_end, self.mp
+                     )
+                     wav_instrument = spec_utils.cmb_spectrogram_to_wave(
+                         y_spec_m, self.mp, input_high_end_h, input_high_end_
+                     )
+                 else:
+                     wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
+                 print("%s instruments done" % name)
+                 if model_params == "4band_v2":
+                     sf.write(
+                         os.path.join(
+                             ins_root, "instrument_{}_{}.{}".format(name, self.data["agg"], format)
+                         ),
+                         (np.array(wav_instrument) * 32768).astype("int16"), self.mp.param["sr"],
+                     )
+                 if model_params == "4band_v3":
+                     sf.write(
+                         os.path.join(
+                             ins_root, "人声_{}".format(name)
+                         ),
+                         (np.array(wav_instrument) * 32768).astype("int16"), self.mp.param["sr"],
+                     )
+
+             if vocal_root is not None:
+                 if self.data["high_end_process"].startswith("mirroring"):
+                     input_high_end_ = spec_utils.mirroring(
+                         self.data["high_end_process"], v_spec_m, input_high_end, self.mp
+                     )
+                     wav_vocals = spec_utils.cmb_spectrogram_to_wave(
+                         v_spec_m, self.mp, input_high_end_h, input_high_end_
+                     )
+                 else:
+                     wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
+                 print("%s vocals done" % name)
+                 if model_params == "4band_v2":
+                     sf.write(
+                         os.path.join(
+                             vocal_root, "vocal_{}_{}.{}".format(name, self.data["agg"], format)
+                         ),
+                         (np.array(wav_vocals) * 32768).astype("int16"), self.mp.param["sr"],
+                     )
+                 if model_params == "4band_v3":
+                     sf.write(
+                         os.path.join(
+                             vocal_root, "混响_{}".format(name)
+                         ),
+                         (np.array(wav_vocals) * 32768).astype("int16"), self.mp.param["sr"],
+                     )
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Process audio with specified parameters.")
+     parser.add_argument("-device", choices=["cpu", "cuda"], default="cpu", help="Device for processing")
+     # argparse's type=bool treats any non-empty string (including "False") as True, so parse the flag explicitly
+     parser.add_argument("-is_half", type=lambda v: str(v).lower() in ("true", "1"), required=True, help="Use half precision (true/false)")
+     parser.add_argument("-model_path", required=True, help="Path to the model weights")
+     parser.add_argument("-agg", type=int, default=10, help="Aggressiveness parameter (0-100)")
+     parser.add_argument("-audio_path", required=True, help="Path to the audio file or folder")
+     parser.add_argument("-save_path", required=True, help="Path to save the output")
+     parser.add_argument("-model_params", choices=["4band_v3", "4band_v2"], required=True, help="Model parameter preset")
+     parser.add_argument("-format", choices=["wav", "flac"], default="wav")
+     args = parser.parse_args()
+     separator = AudioSeparator(
+         model_path=args.model_path,
+         device=args.device,
+         is_half=args.is_half,
+         agg=args.agg,
+         model_params=args.model_params
+     )
+     separator.separate(
+         args.audio_path,
+         args.save_path,
+         args.save_path,
+         args.model_params,
+         args.format
+     )
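
Usage note: a minimal sketch of how deecho.py can be invoked, adapted from the call in the two shell scripts below; the weight path and folders are illustrative, -agg falls back to its default of 10 when omitted, and with the explicit boolean parsing above "-is_half False" really disables half precision:

python deecho.py -device cuda -is_half False -model_path uvr5_weights/VR-DeEchoAggressive.pth -audio_path ./echo/ -save_path ./人声/ -model_params 4band_v3 -format wav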
infer.py ADDED
@@ -0,0 +1,214 @@
+ import soundfile as sf
+ import torch
+ import os
+ import librosa
+ import numpy as np
+ import onnxruntime as ort
+ from pathlib import Path
+ from argparse import ArgumentParser
+ from tqdm import tqdm
+
+
+ class ConvTDFNet:
+     def __init__(self, target_name, L, dim_f, dim_t, n_fft, hop=1024):
+         super(ConvTDFNet, self).__init__()
+         self.dim_c = 4
+         self.dim_f = dim_f
+         self.dim_t = 2**dim_t
+         self.n_fft = n_fft
+         self.hop = hop
+         self.n_bins = self.n_fft // 2 + 1
+         self.chunk_size = hop * (self.dim_t - 1)
+         self.window = torch.hann_window(window_length=self.n_fft, periodic=True)
+         self.target_name = target_name
+
+         out_c = self.dim_c * 4 if target_name == "*" else self.dim_c
+
+         self.freq_pad = torch.zeros([1, out_c, self.n_bins - self.dim_f, self.dim_t])
+         self.n = L // 2
+
+     def stft(self, x):
+         x = x.reshape([-1, self.chunk_size])
+         x = torch.stft(
+             x,
+             n_fft=self.n_fft,
+             hop_length=self.hop,
+             window=self.window,
+             center=True,
+             return_complex=True,
+         )
+         x = torch.view_as_real(x)
+         x = x.permute([0, 3, 1, 2])
+         x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
+             [-1, self.dim_c, self.n_bins, self.dim_t]
+         )
+         return x[:, :, : self.dim_f]
+
+     # Inverse short-time Fourier transform (iSTFT)
+     def istft(self, x, freq_pad=None):
+         freq_pad = (
+             self.freq_pad.repeat([x.shape[0], 1, 1, 1])
+             if freq_pad is None
+             else freq_pad
+         )
+         x = torch.cat([x, freq_pad], -2)
+         c = 4 * 2 if self.target_name == "*" else 2
+         x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape(
+             [-1, 2, self.n_bins, self.dim_t]
+         )
+         x = x.permute([0, 2, 3, 1])
+         x = x.contiguous()
+         x = torch.view_as_complex(x)
+         x = torch.istft(
+             x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True
+         )
+         return x.reshape([-1, c, self.chunk_size])
+
+
+ class Predictor:
+     def __init__(self, args):
+         self.args = args
+         self.model_ = ConvTDFNet(
+             target_name="vocals",
+             L=11,
+             dim_f=args["dim_f"],
+             dim_t=args["dim_t"],
+             n_fft=args["n_fft"]
+         )
+
+         if torch.cuda.is_available():
+             self.model = ort.InferenceSession(args['model_path'], providers=['CUDAExecutionProvider'])
+         else:
+             self.model = ort.InferenceSession(args['model_path'], providers=['CPUExecutionProvider'])
+
+     def demix(self, mix):
+         samples = mix.shape[-1]
+         margin = self.args["margin"]
+         chunk_size = self.args["chunks"] * 44100
+
+         assert not margin == 0, "margin cannot be zero!"
+
+         if margin > chunk_size:
+             margin = chunk_size
+
+         segmented_mix = {}
+
+         if self.args["chunks"] == 0 or samples < chunk_size:
+             chunk_size = samples
+
+         counter = -1
+         for skip in range(0, samples, chunk_size):
+             counter += 1
+             s_margin = 0 if counter == 0 else margin
+             end = min(skip + chunk_size + margin, samples)
+             start = skip - s_margin
+             segmented_mix[skip] = mix[:, start:end].copy()
+             if end == samples:
+                 break
+
+         sources = self.demix_base(segmented_mix, margin_size=margin)
+         return sources
+
+     def demix_base(self, mixes, margin_size):
+         chunked_sources = []
+         progress_bar = tqdm(total=len(mixes))
+         progress_bar.set_description("Processing")
+
+         for mix in mixes:
+             cmix = mixes[mix]
+             sources = []
+             n_sample = cmix.shape[1]
+             model = self.model_
+             trim = model.n_fft // 2
+             gen_size = model.chunk_size - 2 * trim
+             pad = gen_size - n_sample % gen_size
+             mix_p = np.concatenate(
+                 (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1
+             )
+             mix_waves = []
+             i = 0
+             while i < n_sample + pad:
+                 waves = np.array(mix_p[:, i : i + model.chunk_size])
+                 mix_waves.append(waves)
+                 i += gen_size
+
+             mix_waves = torch.tensor(np.array(mix_waves), dtype=torch.float32)
+
+             with torch.no_grad():
+                 _ort = self.model
+                 spek = model.stft(mix_waves)
+                 if self.args["denoise"]:
+                     spec_pred = (
+                         -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5
+                         + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5
+                     )
+                     tar_waves = model.istft(torch.tensor(spec_pred))
+                 else:
+                     tar_waves = model.istft(
+                         torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0])
+                     )
+                 tar_signal = (
+                     tar_waves[:, :, trim:-trim]
+                     .transpose(0, 1)
+                     .reshape(2, -1)
+                     .numpy()[:, :-pad]
+                 )
+
+                 start = 0 if mix == 0 else margin_size
+                 end = None if mix == list(mixes.keys())[::-1][0] else -margin_size
+
+                 if margin_size == 0:
+                     end = None
+
+                 sources.append(tar_signal[:, start:end])
+
+                 progress_bar.update(1)
+
+             chunked_sources.append(sources)
+         _sources = np.concatenate(chunked_sources, axis=-1)
+
+         progress_bar.close()
+         return _sources
+
+     def predict(self, file_path):
+         mix, rate = librosa.load(file_path, mono=False, sr=44100)
+
+         if mix.ndim == 1:
+             mix = np.asfortranarray([mix, mix])
+
+         mix = mix.T  # (n_samples, 2); demix expects (2, n_samples)
+         sources = self.demix(mix.T)
+         opt = sources[0].T  # separated vocal stem
+
+         return (mix - opt, opt, rate)  # (instrumental, vocals, sample rate)
+
+
+ def main():
+     parser = ArgumentParser()
+
+     parser.add_argument("files", nargs="+", type=Path, default=[], help="Source audio path")
+     parser.add_argument("-o", "--output", type=Path, default=Path("separated"), help="Output folder")
+     parser.add_argument("-m", "--model_path", type=Path, help="MDX Net ONNX Model path")
+
+     parser.add_argument("-d", "--no-denoise", dest="denoise", action="store_false", default=True, help="Disable denoising")
+     parser.add_argument("-M", "--margin", type=int, default=44100, help="Margin")
+     parser.add_argument("-c", "--chunks", type=int, default=15, help="Chunk size")
+     parser.add_argument("-F", "--n_fft", type=int, default=6144)
+     parser.add_argument("-t", "--dim_t", type=int, default=8)
+     parser.add_argument("-f", "--dim_f", type=int, default=2048)
+
+     args = parser.parse_args()
+     dict_args = vars(args)
+
+     os.makedirs(args.output, exist_ok=True)
+
+     for file_path in args.files:
+         predictor = Predictor(args=dict_args)
+         # predict() returns (instrumental, vocals, sample rate)
+         instrum, vocals, sampling_rate = predictor.predict(file_path)
+         filename = os.path.splitext(os.path.split(file_path)[-1])[0]
+         sf.write(os.path.join(args.output, filename + ".wav"), vocals, sampling_rate)
+         sf.write(os.path.join(args.output, filename + "_instrum.wav"), instrum, sampling_rate)
+
+
+ if __name__ == "__main__":
+     main()
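
Usage note: infer.py is a standalone MDX-Net (ONNX) separator and does not appear to be called by the shell scripts in this commit; a minimal sketch of an invocation, where the model filename is only a placeholder for whichever MDX-Net vocal ONNX model is available:

python infer.py /mnt/workspace/input/song.mp3 -m uvr5_weights/UVR-MDX-NET-Voc_FT.onnx -o separated

With the defaults this writes song.wav (the vocal stem) and song_instrum.wav (the accompaniment) into the output folder.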
requirements.txt ADDED
@@ -0,0 +1,20 @@
+ joblib>=1.1.0
+ numba==0.56.4
+ numpy==1.23.5
+ scipy==1.9.3
+ librosa>=0.9.1
+ llvmlite==0.39.0
+ pydub>=0.25.1
+ soundfile>=0.12.1
+ praat-parselmouth>=0.4.2
+ Pillow==9.5.0
+ resampy>=0.4.2
+ scikit-learn
+ starlette>=0.25.0
+ tqdm>=4.63.1
+ audioread==3.0.0
+ soundstretch==1.2
+ demucs
+ pyyaml
+ ml_collections
+ rotary_embedding_torch
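
A note on the pins: torch and onnxruntime, which uvr.py, deecho.py, and infer.py import, are not listed here and are presumably expected from the base environment; the rest would typically be installed with:

pip install -r requirements.txt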
uvr.py ADDED
@@ -0,0 +1,132 @@
+ import os
+ import sys
+ import torch
+ import warnings
+ import hashlib
+ import math
+ import importlib
+ import numpy as np
+ from tqdm import tqdm
+ from scipy.io import wavfile
+ import librosa
+ import pdb
+ from uvr5_pack.lib_v5 import spec_utils
+ from uvr5_pack.utils import _get_name_params, inference
+ from uvr5_pack.lib_v5.model_param_init import ModelParameters
+
+
+ warnings.filterwarnings("ignore")
+
+
+ class _audio_pre_():
+     def __init__(self, model_path, device, is_half):
+         self.model_path = model_path
+         self.device = device
+         self.data = {
+             # Processing Options
+             'postprocess': False,
+             'tta': False,
+             # Constants
+             'window_size': 320,
+             'agg': 10,
+             'high_end_process': 'mirroring',
+         }
+         nn_arch_sizes = [
+             31191,  # default
+             33966, 61968, 123821, 123812, 537238  # custom
+         ]
+         self.nn_architecture = list('{}KB'.format(s) for s in nn_arch_sizes)
+         model_size = math.ceil(os.stat(model_path).st_size / 1024)
+         nn_architecture = '{}KB'.format(min(nn_arch_sizes, key=lambda x: abs(x - model_size)))
+         nets = importlib.import_module('uvr5_pack.lib_v5.nets' + f'_{nn_architecture}'.replace('_{}KB'.format(nn_arch_sizes[0]), ''), package=None)
+         model_hash = hashlib.md5(open(model_path, 'rb').read()).hexdigest()
+         param_name, model_params_d = _get_name_params(model_path, model_hash)
+
+         mp = ModelParameters(model_params_d)
+         model = nets.CascadedASPPNet(mp.param['bins'] * 2)
+         cpk = torch.load(model_path, map_location='cpu')
+         model.load_state_dict(cpk)
+         model.eval()
+         if is_half:
+             model = model.half().to(device)
+         else:
+             model = model.to(device)
+
+         self.mp = mp
+         self.model = model
+
+     def _path_audio_(self, music_file, ins_root=None, vocal_root=None):
+         if ins_root is None and vocal_root is None:
+             return "No save root."
+         name = os.path.basename(music_file)
+         if ins_root is not None:
+             os.makedirs(ins_root, exist_ok=True)
+         if vocal_root is not None:
+             os.makedirs(vocal_root, exist_ok=True)
+         X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
+         bands_n = len(self.mp.param['band'])
+         for d in range(bands_n, 0, -1):
+             bp = self.mp.param['band'][d]
+             if d == bands_n:  # load the source once for the highest band
+                 X_wave[d], _ = librosa.core.load(
+                     music_file, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
+                 if X_wave[d].ndim == 1:
+                     X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
+             else:  # resample the previous band down for the lower bands
+                 X_wave[d] = librosa.core.resample(X_wave[d + 1], self.mp.param['band'][d + 1]['sr'], bp['sr'], res_type=bp['res_type'])
+
+             X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(X_wave[d], bp['hl'], bp['n_fft'], self.mp.param['mid_side'], self.mp.param['mid_side_b2'], self.mp.param['reverse'])
+             if d == bands_n and self.data['high_end_process'] != 'none':
+                 input_high_end_h = (bp['n_fft'] // 2 - bp['crop_stop']) + (self.mp.param['pre_filter_stop'] - self.mp.param['pre_filter_start'])
+                 input_high_end = X_spec_s[d][:, bp['n_fft'] // 2 - input_high_end_h:bp['n_fft'] // 2, :]
+
+         X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
+         aggresive_set = float(self.data['agg'] / 100)
+         aggressiveness = {'value': aggresive_set, 'split_bin': self.mp.param['band'][1]['crop_stop']}
+         with torch.no_grad():
+             pred, X_mag, X_phase = inference(X_spec_m, self.device, self.model, aggressiveness, self.data)
+
+         if self.data['postprocess']:
+             pred_inv = np.clip(X_mag - pred, 0, np.inf)
+             pred = spec_utils.mask_silence(pred, pred_inv)
+
+         y_spec_m = pred * X_phase
+         v_spec_m = X_spec_m - y_spec_m
+
+         if ins_root is not None:
+             if self.data['high_end_process'].startswith('mirroring'):
+                 input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], y_spec_m, input_high_end, self.mp)
+                 wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp, input_high_end_h, input_high_end_)
+             else:
+                 wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
+             print('%s instruments done' % name)
+             # Split the file name and extension
+             file_name, ext = os.path.splitext(name)
+             wavfile.write(os.path.join(ins_root, '和声_{}{}'.format(file_name, ext)), self.mp.param['sr'], (np.array(wav_instrument) * 32768).astype(np.int16))
+         if vocal_root is not None:
+             if self.data['high_end_process'].startswith('mirroring'):
+                 input_high_end_ = spec_utils.mirroring(self.data['high_end_process'], v_spec_m, input_high_end, self.mp)
+                 wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp, input_high_end_h, input_high_end_)
+             else:
+                 wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
+             print('%s vocals done' % name)
+             # Split the file name and extension
+             file_name, ext = os.path.splitext(name)
+             wavfile.write(os.path.join(vocal_root, '{}{}'.format(file_name, ext)), self.mp.param['sr'], (np.array(wav_vocals) * 32768).astype(np.int16))
+
+
+ if __name__ == '__main__':
+     device = 'cuda'
+     is_half = True
+     model_path = 'uvr5_weights/5_HP-Karaoke-UVR.pth'
+     pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=is_half)
+
+     # Collect the path of every .wav file in the output (reverb) folder
+     audio_folder = 'output'
+     wav_files = [os.path.join(audio_folder, file) for file in os.listdir(audio_folder) if file.endswith('.wav')]
+
+     # Process each audio file
+     save_path = 'echo'
+     for wav_file in wav_files:
+         pre_fun._path_audio_(wav_file, save_path, save_path)
仅去和声混响.sh ADDED
@@ -0,0 +1,58 @@
+ #!/bin/bash
+
+ # Change directory
+ cd /mnt/workspace/uvr5
+
+ # Run 5_HP-Karaoke-UVR.py
+ python 5_HP-Karaoke-UVR.py
+
+ # Rename the .mp3 outputs to .wav (the separated files already contain WAV data; only the extension is wrong)
+ source_folder="/mnt/workspace/uvr5/echo"
+ for mp3_file in $(ls $source_folder/*.mp3)
+ do
+     wav_file="${mp3_file%.mp3}.wav"
+     mv "$mp3_file" "$wav_file"
+ done
+
+ # Move and convert files
+ target_folder="/mnt/workspace/uvr5/伴奏"
+ for file_name in $(ls $source_folder)
+ do
+     if [[ $file_name == *"和声_"* && $file_name == *.wav ]]
+     then
+         file_path="$source_folder/$file_name"
+         target_path="$target_folder/$file_name"
+         mv "$file_path" "$target_path"
+         mp3_file_path="${target_path%.wav}.mp3"
+         ffmpeg -i "$target_path" -vn -ar 44100 -ac 2 -b:a 192k -loglevel panic "$mp3_file_path"
+         rm "$target_path"
+     fi
+ done
+
+ # Run deecho.py
+ python deecho.py -d cuda -model_path /mnt/workspace/uvr5/uvr5_weights/VR-DeEchoAggressive.pth -audio_path /mnt/workspace/uvr5/echo/ -is_half False -save_path /mnt/workspace/uvr5/人声/ -model_params 4band_v3
+
+ # Convert files to mp3 and remove wav files
+ input_folder='/mnt/workspace/uvr5/人声/'
+ output_folder='/mnt/workspace/uvr5/人声/'
+ for file in $(ls $input_folder)
+ do
+     if [[ $file == *"人声_"* && $file == *.wav ]]
+     then
+         input_path="$input_folder/$file"
+         output_file="${file%.wav}.mp3"
+         output_path="$output_folder/$output_file"
+         ffmpeg -i "$input_path" -vn -ar 44100 -ac 2 -b:a 192k -loglevel panic "$output_path"
+     fi
+ done
+
+ # Remove wav files
+ for file in $(ls $input_folder)
+ do
+     if [[ $file == *.wav ]]
+     then
+         file_path="$input_folder/$file"
+         rm "$file_path"
+     fi
+ done
+ echo -e "\033[32m> Harmonies and reverb have been removed from all audio in the input folder; vocals are in /mnt/workspace/uvr5/人声/ and non-vocal stems are in /mnt/workspace/uvr5/伴奏/. \033[0m"
全流程一键版.sh ADDED
@@ -0,0 +1,82 @@
+ #!/bin/bash
+
+ # Change directory
+ cd /mnt/workspace/uvr5/MDX23v24
+
+ # Set the HF_ENDPOINT environment variable
+ export HF_ENDPOINT="https://hf-mirror.com"
+
+ input_folder="/mnt/workspace/input/"
+ output_folder="/mnt/workspace/input"
+
+ for file_path in $(ls -1 $input_folder); do
+     filename=$(basename "$file_path" | cut -d. -f1)
+     # Call inference.py (it picks up the exported HF_ENDPOINT)
+     python inference.py --vocals_only --large_gpu --use_InstVoc --use_VitLarge --use_BSRoformer --input_audio "$input_folder/$file_path" --output_folder "$output_folder"
+ done
+
+ # Move and convert files
+ source_folder="/mnt/workspace/uvr5/output"
+ target_folder="/mnt/workspace/uvr5/伴奏"
+ for file_name in $(ls $source_folder)
+ do
+     if [[ $file_name == *"_instrum.wav" ]]
+     then
+         file_path="$source_folder/$file_name"
+         target_path="$target_folder/$file_name"
+         mv "$file_path" "$target_path"
+         mp3_file_path="${target_path%.wav}.mp3"
+         ffmpeg -i "$target_path" -vn -ar 44100 -ac 2 -b:a 320k -loglevel panic "$mp3_file_path"
+         rm "$target_path"
+     fi
+ done
+
+ # Change directory
+ cd /mnt/workspace/uvr5
+
+ # Run uvr.py
+ python uvr.py
+
+ # Move and convert files
+ source_folder="/mnt/workspace/uvr5/echo"
+ target_folder="/mnt/workspace/uvr5/伴奏"
+ for file_name in $(ls $source_folder)
+ do
+     if [[ $file_name == *"和声_"* && $file_name == *.wav ]]
+     then
+         file_path="$source_folder/$file_name"
+         target_path="$target_folder/$file_name"
+         mv "$file_path" "$target_path"
+         mp3_file_path="${target_path%.wav}.mp3"
+         ffmpeg -i "$target_path" -vn -ar 44100 -ac 2 -b:a 320k -loglevel panic "$mp3_file_path"
+         rm "$target_path"
+     fi
+ done
+
+ # Run deecho.py
+ python deecho.py -d cuda -model_path /mnt/workspace/uvr5/uvr5_weights/VR-DeEchoAggressive.pth -audio_path /mnt/workspace/uvr5/echo/ -is_half False -save_path /mnt/workspace/uvr5/人声/ -model_params 4band_v3
+
+ # Convert files to mp3 and remove wav files
+ input_folder='/mnt/workspace/uvr5/人声/'
+ output_folder='/mnt/workspace/uvr5/人声/'
+ for file in $(ls $input_folder)
+ do
+     if [[ $file == *"人声_"* && $file == *.wav ]]
+     then
+         input_path="$input_folder/$file"
+         output_file="${file%.wav}.mp3"
+         output_path="$output_folder/$output_file"
+         ffmpeg -i "$input_path" -vn -ar 44100 -ac 2 -b:a 320k -loglevel panic "$output_path"
+     fi
+ done
+
+ # Remove wav files
+ for file in $(ls $input_folder)
+ do
+     if [[ $file == *.wav ]]
+     then
+         file_path="$input_folder/$file"
+         rm "$file_path"
+     fi
+ done
+ echo -e "\033[32m> All audio in the input folder has been separated; vocals are in /mnt/workspace/uvr5/人声/ and the accompaniment and harmonies are in /mnt/workspace/uvr5/伴奏/. \033[0m"