nevreal committed
Commit f728a0f · verified · 1 Parent(s): cf0e2fd

Update vc_infer_pipeline.py

Files changed (1)
  1. vc_infer_pipeline.py +413 -181
vc_infer_pipeline.py CHANGED
@@ -1,225 +1,457 @@
-import numpy as np, parselmouth, torch, pdb
 from time import time as ttime
 import torch.nn.functional as F
-from config import x_pad, x_query, x_center, x_max
-from sklearn.cluster import KMeans
-
-def resize2d(x, target_len, is1):
-    minn = 1 if is1 == True else 0
-    ss = np.array(x).astype("float32")
-    ss[ss <= minn] = np.nan
-    target = np.interp(np.arange(0, len(ss) * target_len, len(ss)) / target_len, np.arange(0, len(ss)), ss)
-    res = np.nan_to_num(target)
-    return res
-
-class VC(object):
-    def __init__(self, tgt_sr, device, is_half):
-        self.sr = 16000  # hubert input sampling rate
-        self.window = 160  # samples per frame
-        self.t_pad = self.sr * x_pad  # padding length before/after each segment
-        self.t_pad_tgt = tgt_sr * x_pad
-        self.t_pad2 = self.t_pad * 2
-        self.t_query = self.sr * x_query  # query window around each cut point
-        self.t_center = self.sr * x_center  # spacing between candidate cut points
-        self.t_max = self.sr * x_max  # duration threshold below which no cut search is needed
-        self.device = device
-        self.is_half = is_half
-
-    def get_f0(self, x, p_len, f0_up_key=0, inp_f0=None):
         time_step = self.window / self.sr * 1000
         f0_min = 50
         f0_max = 1100
         f0_mel_min = 1127 * np.log(1 + f0_min / 700)
         f0_mel_max = 1127 * np.log(1 + f0_max / 700)
-        f0 = parselmouth.Sound(x, self.sr).to_pitch_ac(
-            time_step=time_step / 1000, voicing_threshold=0.6,
-            pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array["frequency"]
-        pad_size = (p_len - len(f0) + 1) // 2
-        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
-            f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
         f0 *= pow(2, f0_up_key / 12)
         # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
-        tf0 = self.sr // self.window  # f0 points per second
-        if inp_f0 is not None:
-            delta_t = np.round((inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1).astype("int16")
-            replace_f0 = np.interp(list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1])
-            shape = f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)].shape[0]
-            f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)] = replace_f0[:shape]
         # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
         f0bak = f0.copy()
         f0_mel = 1127 * np.log(1 + f0 / 700)
-        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
         f0_mel[f0_mel <= 1] = 1
         f0_mel[f0_mel > 255] = 255
-        f0_coarse = np.rint(f0_mel).astype(np.int)
-        return f0_coarse, f0bak  # 1-0
 
-    def vc(self, model, net_g, dv, audio0, pitch, pitchf, times):
         feats = torch.from_numpy(audio0)
-        if self.is_half == True: feats = feats.half()
-        else: feats = feats.float()
         if feats.dim() == 2:  # double channels
             feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
-        padding_mask = torch.BoolTensor(feats.shape).fill_(False)
 
         inputs = {
             "source": feats.to(self.device),
-            "padding_mask": padding_mask.to(self.device),
-            "output_layer": 9,  # layer 9
         }
         t0 = ttime()
         with torch.no_grad():
             logits = model.extract_features(**inputs)
-            feats = model.final_proj(logits[0])
         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
         t1 = ttime()
-        p_len = audio0.shape[0] // self.window
-        if feats.shape[1] < p_len:
-            p_len = feats.shape[1]
-            pitch = pitch[:, :p_len]
-            pitchf = pitchf[:, :p_len]
-        p_len = torch.LongTensor([p_len]).to(self.device)
-        with torch.no_grad():
-            audio1 = (net_g.infer(feats, p_len, pitch, pitchf, dv)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
-        del feats, p_len, padding_mask
-        torch.cuda.empty_cache()
-        t2 = ttime()
-        times[0] += (t1 - t0)
-        times[2] += (t2 - t1)
-        return audio1
-
-    def vc_km(self, model, net_g, dv, audio0, pitch, pitchf, times):
-        kmeans = KMeans(500)
-
-        def get_cluster_result(x):
-            """x: np.array [t, 256]"""
-            return kmeans.predict(x)
-
-        checkpoint = torch.load("lulu_contentvec_kmeans_500.pt")
-        kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"]
-        kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"]
-        kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"]
-        feats = torch.from_numpy(audio0).float()
-        if feats.dim() == 2:  # double channels
-            feats = feats.mean(-1)
-        assert feats.dim() == 1, feats.dim()
-        feats = feats.view(1, -1)
-        padding_mask = torch.BoolTensor(feats.shape).fill_(False)
-        inputs = {
-            "source": feats.half().to(self.device),
-            "padding_mask": padding_mask.to(self.device),
-            "output_layer": 9,  # layer 9
-        }
-        torch.cuda.synchronize()
-        t0 = ttime()
-        with torch.no_grad():
-            logits = model.extract_features(**inputs)
-            feats = model.final_proj(logits[0])
-        feats = get_cluster_result(feats.cpu().numpy()[0].astype("float32"))
-        feats = torch.from_numpy(feats).to(self.device)
-        feats = F.interpolate(feats.half().unsqueeze(0).unsqueeze(0), scale_factor=2).long().squeeze(0)
-        t1 = ttime()
-        p_len = audio0.shape[0] // self.window
-        if feats.shape[1] < p_len:
-            p_len = feats.shape[1]
-            pitch = pitch[:, :p_len]
-            pitchf = pitchf[:, :p_len]
-        p_len = torch.LongTensor([p_len]).to(self.device)
         with torch.no_grad():
-            audio1 = (net_g.infer(feats, p_len, pitch, pitchf, dv)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
-        del feats, p_len, padding_mask
-        torch.cuda.empty_cache()
         t2 = ttime()
-        times[0] += (t1 - t0)
-        times[2] += (t2 - t1)
         return audio1
 
-    def pipeline(self, model, net_g, dv, audio, times, f0_up_key, f0_file=None):
-        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
-        opt_ts = []
-        if audio_pad.shape[0] > self.t_max:
-            audio_sum = np.zeros_like(audio)
-            for i in range(self.window):
-                audio_sum += audio_pad[i : i - self.window]
-            for t in range(self.t_center, audio.shape[0], self.t_center):
-                opt_ts.append(t - self.t_query + np.where(np.abs(audio_sum[t - self.t_query : t + self.t_query]) == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min())[0][0])
-        s = 0
-        audio_opt = []
-        t = None
-        t1 = ttime()
-        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
-        p_len = audio_pad.shape[0] // self.window
-        inp_f0 = None
-        if hasattr(f0_file, "name"):
             try:
-                with open(f0_file.name, "r") as f:
-                    lines = f.read().strip("\n").split("\n")
-                    inp_f0 = []
-                    for line in lines:
-                        inp_f0.append([float(i) for i in line.split(",")])
-                    inp_f0 = np.array(inp_f0, dtype="float32")
             except:
                 traceback.print_exc()
-        pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, inp_f0)
-
-        pitch = pitch[:p_len]
-        pitchf = pitchf[:p_len]
-        # if(inp_f0 is None):
-        #     pitch = pitch[:p_len]
-        #     pitchf = pitchf[:p_len]
-        # else:
-        #     pitch = resize2d(pitch, p_len, is1=True)
-        #     pitchf = resize2d(pitchf, p_len, is1=False)
-        pitch = torch.LongTensor(pitch).unsqueeze(0).to(self.device)
-        pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(self.device)
-        t2 = ttime()
-        times[1] += (t2 - t1)
-        for t in opt_ts:
-            t = t // self.window * self.window
-            audio_opt.append(self.vc(model, net_g, dv, audio_pad[s : t + self.t_pad2 + self.window], pitch[:, s // self.window : (t + self.t_pad2) // self.window], pitchf[:, s // self.window : (t + self.t_pad2) // self.window], times)[self.t_pad_tgt : -self.t_pad_tgt])
-            s = t
-        audio_opt.append(self.vc(model, net_g, dv, audio_pad[t:], pitch[:, t // self.window :] if t is not None else pitch, pitchf[:, t // self.window :] if t is not None else pitchf, times)[self.t_pad_tgt : -self.t_pad_tgt])
-        audio_opt = np.concatenate(audio_opt)
-        del pitch, pitchf
-        return audio_opt
-
-    def pipeline_km(self, model, net_g, dv, audio, times, f0_up_key, f0_file=None):
-        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
         opt_ts = []
-        if audio_pad.shape[0] > self.t_max:
             audio_sum = np.zeros_like(audio)
-            for i in range(self.window):
-                audio_sum += audio_pad[i : i - self.window]
-            for t in range(self.t_center, audio.shape[0], self.t_center):
-                opt_ts.append(t - self.t_query + np.where(np.abs(audio_sum[t - self.t_query : t + self.t_query]) == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min())[0][0])
         s = 0
-        audio_opt = []
-        t = None
-        t1 = ttime()
-        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
-        p_len = audio_pad.shape[0] // self.window
-        inp_f0 = None
-        if hasattr(f0_file, "name"):
             try:
-                with open(f0_file.name, "r") as f:
-                    lines = f.read().strip("\n").split("\n")
-                    inp_f0 = []
-                    for line in lines:
-                        inp_f0.append([float(i) for i in line.split(",")])
-                    inp_f0 = np.array(inp_f0, dtype="float32")
             except:
                 traceback.print_exc()
-        pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, inp_f0)
-
-        pitch = pitch[:p_len]
-        pitchf = pitchf[:p_len]
-        # if(inp_f0 is None):
-        #     pitch = pitch[:p_len]
-        #     pitchf = pitchf[:p_len]
-        # else:
-        #     pitch = resize2d(pitch, p_len, is1=True)
-        #     pitchf = resize2d(pitchf, p_len, is1=False)
-        pitch = torch.LongTensor(pitch).unsqueeze(0).to(self.device)
-        pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(self.device)
-        t2 = ttime()
-        times[1] += (t2 - t1)
         for t in opt_ts:
-            t = t // self.window * self.window
-            audio_opt.append(self.vc_km(model, net_g, dv, audio_pad[s : t + self.t_pad2 + self.window], pitch[:, s // self.window : (t + self.t_pad2) // self.window], pitchf[:, s // self.window : (t + self.t_pad2) // self.window], times)[self.t_pad_tgt : -self.t_pad_tgt])
             s = t
-        audio_opt.append(self.vc_km(model, net_g, dv, audio_pad[t:], pitch[:, t // self.window :] if t is not None else pitch, pitchf[:, t // self.window :] if t is not None else pitchf, times)[self.t_pad_tgt : -self.t_pad_tgt])
-        audio_opt = np.concatenate(audio_opt)
-        del pitch, pitchf
-        return audio_opt
+import os
+import sys
+import traceback
+import logging
+
+logger = logging.getLogger(__name__)
+
+from functools import lru_cache
 from time import time as ttime
+
+import faiss
+import librosa
+import numpy as np
+import parselmouth
+import pyworld
+import torch
 import torch.nn.functional as F
+import torchcrepe
+from scipy import signal
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
+
+input_audio_path2wav = {}
+
+
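Note: the module now front-loads a 5th-order Butterworth high-pass at 48 Hz (designed for 16 kHz input), which pipeline() later applies with zero-phase signal.filtfilt to strip DC offset and low-frequency rumble before the cut-point search. A minimal sketch of that filtering step, with the input array invented for illustration:

    import numpy as np
    from scipy import signal

    bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
    audio = np.random.randn(16000)          # stand-in for 1 s of 16 kHz audio
    audio = signal.filtfilt(bh, ah, audio)  # forward-backward pass, no phase shift
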
+@lru_cache
+def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
+    audio = input_audio_path2wav[input_audio_path]
+    f0, t = pyworld.harvest(
+        audio,
+        fs=fs,
+        f0_ceil=f0max,
+        f0_floor=f0min,
+        frame_period=frame_period,
+    )
+    f0 = pyworld.stonemask(audio, f0, t, fs)
+    return f0
+
+
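Note: cache_harvest_f0 is memoized with lru_cache, but numpy waveforms are not hashable, so the audio travels through the module-level input_audio_path2wav dict and only the path plus scalar parameters form the cache key; repeated Harvest runs on the same file are then free. A sketch of the same pattern with illustrative names (with the inherent caveat that a changed file under the same path would serve stale results):

    from functools import lru_cache
    import numpy as np

    _path2wav = {}  # side channel for the unhashable array

    @lru_cache
    def slow_f0(path, fs):              # hashable arguments only
        wav = _path2wav[path]           # array fetched by key, not passed in
        return np.abs(wav).mean() * fs  # stand-in for pyworld.harvest + stonemask

    _path2wav["a.wav"] = np.ones(16000)
    slow_f0("a.wav", 16000)  # computed
    slow_f0("a.wav", 16000)  # served from the cache
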
+def change_rms(data1, sr1, data2, sr2, rate):  # 1 is the input audio, 2 is the output audio, rate is the weight given to 2
+    # print(data1.max(), data2.max())
+    rms1 = librosa.feature.rms(
+        y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
+    )  # one point every half second
+    rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
+    rms1 = torch.from_numpy(rms1)
+    rms1 = F.interpolate(
+        rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
+    ).squeeze()
+    rms2 = torch.from_numpy(rms2)
+    rms2 = F.interpolate(
+        rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
+    ).squeeze()
+    rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
+    data2 *= (
+        torch.pow(rms1, torch.tensor(1 - rate))
+        * torch.pow(rms2, torch.tensor(rate - 1))
+    ).numpy()
+    return data2
+
+
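Note: the factor rms1**(1 - rate) * rms2**(rate - 1) rescales the output so its envelope becomes roughly rms1**(1 - rate) * rms2**rate, a geometric interpolation: rate=1 keeps the converted audio's own loudness contour, rate=0 transplants the input's. A quick numeric check with made-up RMS values:

    rms1, rms2, rate = 0.5, 0.1, 0.0
    factor = rms1 ** (1 - rate) * rms2 ** (rate - 1)  # 0.5 * 10 = 5.0
    print(rms2 * factor)  # 0.5 -> this stretch of output now carries the input's RMS
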
+class Pipeline(object):
+    def __init__(self, tgt_sr, config):
+        self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
+            config.x_pad,
+            config.x_query,
+            config.x_center,
+            config.x_max,
+            config.is_half,
+        )
+        self.sr = 16000  # hubert input sampling rate
+        self.window = 160  # samples per frame
+        self.t_pad = self.sr * self.x_pad  # padding length before/after each segment
+        self.t_pad_tgt = tgt_sr * self.x_pad
+        self.t_pad2 = self.t_pad * 2
+        self.t_query = self.sr * self.x_query  # query window around each cut point
+        self.t_center = self.sr * self.x_center  # spacing between candidate cut points
+        self.t_max = self.sr * self.x_max  # duration threshold below which no cut search is needed
+        self.device = config.device
+
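Note: all the timing constants are sample counts at the 16 kHz HuBERT rate, so with a 160-sample window every frame is 10 ms and there are exactly 100 feature/f0 frames per second; only t_pad_tgt lives at the output rate, which is why it is what gets trimmed from each converted chunk. Assuming x_pad = 1 and tgt_sr = 40000 purely for illustration (the real values come from config):

    sr, window = 16000, 160
    x_pad, tgt_sr = 1, 40000       # assumed, not from this file
    t_pad = sr * x_pad             # 16000 samples of reflect-pad per side
    t_pad_tgt = tgt_sr * x_pad     # 40000 output samples trimmed per side
    frames_per_sec = sr // window  # 100 -> one frame every 10 ms
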
+    def get_f0(
+        self,
+        input_audio_path,
+        x,
+        p_len,
+        f0_up_key,
+        f0_method,
+        filter_radius,
+        inp_f0=None,
+    ):
+        global input_audio_path2wav
         time_step = self.window / self.sr * 1000
         f0_min = 50
         f0_max = 1100
         f0_mel_min = 1127 * np.log(1 + f0_min / 700)
         f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+        if f0_method == "pm":
+            f0 = (
+                parselmouth.Sound(x, self.sr)
+                .to_pitch_ac(
+                    time_step=time_step / 1000,
+                    voicing_threshold=0.6,
+                    pitch_floor=f0_min,
+                    pitch_ceiling=f0_max,
+                )
+                .selected_array["frequency"]
+            )
+            pad_size = (p_len - len(f0) + 1) // 2
+            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                f0 = np.pad(
+                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                )
+        elif f0_method == "harvest":
+            input_audio_path2wav[input_audio_path] = x.astype(np.double)
+            f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
+            if filter_radius > 2:
+                f0 = signal.medfilt(f0, 3)
+        elif f0_method == "crepe":
+            model = "full"
+            # Pick a batch size that doesn't cause memory errors on your gpu
+            batch_size = 512
+            # Compute pitch on the configured device
+            audio = torch.tensor(np.copy(x))[None].float()
+            f0, pd = torchcrepe.predict(
+                audio,
+                self.sr,
+                self.window,
+                f0_min,
+                f0_max,
+                model,
+                batch_size=batch_size,
+                device=self.device,
+                return_periodicity=True,
+            )
+            pd = torchcrepe.filter.median(pd, 3)
+            f0 = torchcrepe.filter.mean(f0, 3)
+            f0[pd < 0.1] = 0
+            f0 = f0[0].cpu().numpy()
+        elif f0_method == "rmvpe":
+            if not hasattr(self, "model_rmvpe"):
+                from infer.lib.rmvpe import RMVPE
+
+                logger.info(
+                    "Loading rmvpe model,%s" % "%s/rmvpe.pt" % os.environ["rmvpe_root"]
+                )
+                self.model_rmvpe = RMVPE(
+                    "%s/rmvpe.pt" % os.environ["rmvpe_root"],
+                    is_half=self.is_half,
+                    device=self.device,
+                )
+            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+
+            if "privateuseone" in str(self.device):  # clean onnxruntime memory
+                del self.model_rmvpe.model
+                del self.model_rmvpe
+                logger.info("Cleaning onnxruntime memory")
+
         f0 *= pow(2, f0_up_key / 12)
         # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
+        tf0 = self.sr // self.window  # f0 points per second
+        if inp_f0 is not None:
+            delta_t = np.round(
+                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
+            ).astype("int16")
+            replace_f0 = np.interp(
+                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
+            )
+            shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
+            f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
+                :shape
+            ]
         # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
         f0bak = f0.copy()
         f0_mel = 1127 * np.log(1 + f0 / 700)
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+            f0_mel_max - f0_mel_min
+        ) + 1
         f0_mel[f0_mel <= 1] = 1
         f0_mel[f0_mel > 255] = 255
+        f0_coarse = np.rint(f0_mel).astype(np.int32)
+        return f0_coarse, f0bak  # 1-0
 
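Note: whichever estimator runs, get_f0 ends by compressing f0 onto a 1..255 grid that is uniform in mel rather than in Hz, so low pitches get finer resolution; unvoiced frames (f0 = 0) collapse to bin 1, and the unquantized curve is returned alongside as f0bak. The same mapping for a single illustrative pitch:

    import numpy as np

    f0_min, f0_max = 50, 1100
    mel_min = 1127 * np.log(1 + f0_min / 700)
    mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = 220.0  # A3, chosen for illustration
    mel = 1127 * np.log(1 + f0 / 700)
    coarse = int(np.rint((mel - mel_min) * 254 / (mel_max - mel_min) + 1))
    # coarse is an integer in 1..255; 0 Hz frames stay at bin 1
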
+    def vc(
+        self,
+        model,
+        net_g,
+        sid,
+        audio0,
+        pitch,
+        pitchf,
+        times,
+        index,
+        big_npy,
+        index_rate,
+        version,
+        protect,
+    ):  # ,file_index,file_big_npy
         feats = torch.from_numpy(audio0)
+        if self.is_half:
+            feats = feats.half()
+        else:
+            feats = feats.float()
         if feats.dim() == 2:  # double channels
             feats = feats.mean(-1)
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
+        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
 
         inputs = {
             "source": feats.to(self.device),
+            "padding_mask": padding_mask,
+            "output_layer": 9 if version == "v1" else 12,
         }
         t0 = ttime()
         with torch.no_grad():
             logits = model.extract_features(**inputs)
+            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
+            if protect < 0.5 and pitch is not None and pitchf is not None:
+                feats0 = feats.clone()
+        if (
+            not isinstance(index, type(None))
+            and not isinstance(big_npy, type(None))
+            and index_rate != 0
+        ):
+            npy = feats[0].cpu().numpy()
+            if self.is_half:
+                npy = npy.astype("float32")
+
+            # _, I = index.search(npy, 1)
+            # npy = big_npy[I.squeeze()]
+
+            score, ix = index.search(npy, k=8)
+            weight = np.square(1 / score)
+            weight /= weight.sum(axis=1, keepdims=True)
+            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+
+            if self.is_half:
+                npy = npy.astype("float16")
+            feats = (
+                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+                + (1 - index_rate) * feats
+            )
+
         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+        if protect < 0.5 and pitch is not None and pitchf is not None:
+            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                0, 2, 1
+            )
         t1 = ttime()
+        p_len = audio0.shape[0] // self.window
+        if feats.shape[1] < p_len:
+            p_len = feats.shape[1]
+            if pitch is not None and pitchf is not None:
+                pitch = pitch[:, :p_len]
+                pitchf = pitchf[:, :p_len]
+
+        if protect < 0.5 and pitch is not None and pitchf is not None:
+            pitchff = pitchf.clone()
+            pitchff[pitchf > 0] = 1
+            pitchff[pitchf < 1] = protect
+            pitchff = pitchff.unsqueeze(-1)
+            feats = feats * pitchff + feats0 * (1 - pitchff)
+            feats = feats.to(feats0.dtype)
+        p_len = torch.tensor([p_len], device=self.device).long()
         with torch.no_grad():
+            hasp = pitch is not None and pitchf is not None
+            arg = (feats, p_len, pitch, pitchf, sid) if hasp else (feats, p_len, sid)
+            audio1 = (net_g.infer(*arg)[0][0, 0]).data.cpu().float().numpy()
+            del hasp, arg
+        del feats, p_len, padding_mask
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
         t2 = ttime()
+        times[0] += t1 - t0
+        times[2] += t2 - t1
         return audio1
 
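Note: the retrieval step is the heart of the new feature blending: each HuBERT frame queries the faiss index for its 8 nearest training vectors, weights them by inverse squared distance, and the weighted average is mixed back with the live features at index_rate. A self-contained sketch against a random feature bank (dimensions and index_rate are illustrative; exact-match distances of 0 would divide by zero here, just as in the committed code):

    import faiss
    import numpy as np

    dim = 256
    big_npy = np.random.rand(1000, dim).astype("float32")  # stand-in training bank
    index = faiss.IndexFlatL2(dim)
    index.add(big_npy)

    feats = np.random.rand(100, dim).astype("float32")     # frames to convert
    score, ix = index.search(feats, 8)                     # squared L2 distances
    weight = np.square(1 / score)                          # nearer -> heavier
    weight /= weight.sum(axis=1, keepdims=True)
    retrieved = np.sum(big_npy[ix] * weight[..., None], axis=1)

    index_rate = 0.75                                      # illustrative blend ratio
    mixed = index_rate * retrieved + (1 - index_rate) * feats
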
+    def pipeline(
+        self,
+        model,
+        net_g,
+        sid,
+        audio,
+        input_audio_path,
+        times,
+        f0_up_key,
+        f0_method,
+        file_index,
+        index_rate,
+        if_f0,
+        filter_radius,
+        tgt_sr,
+        resample_sr,
+        rms_mix_rate,
+        version,
+        protect,
+        f0_file=None,
+    ):
+        if (
+            file_index != ""
+            # and file_big_npy != ""
+            # and os.path.exists(file_big_npy) == True
+            and os.path.exists(file_index)
+            and index_rate != 0
+        ):
             try:
+                index = faiss.read_index(file_index)
+                # big_npy = np.load(file_big_npy)
+                big_npy = index.reconstruct_n(0, index.ntotal)
             except:
                 traceback.print_exc()
+                index = big_npy = None
+        else:
+            index = big_npy = None
+        audio = signal.filtfilt(bh, ah, audio)
+        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
         opt_ts = []
+        if audio_pad.shape[0] > self.t_max:
             audio_sum = np.zeros_like(audio)
+            for i in range(self.window):
+                audio_sum += np.abs(audio_pad[i : i - self.window])
+            for t in range(self.t_center, audio.shape[0], self.t_center):
+                opt_ts.append(
+                    t
+                    - self.t_query
+                    + np.where(
+                        audio_sum[t - self.t_query : t + self.t_query]
+                        == audio_sum[t - self.t_query : t + self.t_query].min()
+                    )[0][0]
+                )
         s = 0
+        audio_opt = []
+        t = None
+        t1 = ttime()
+        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+        p_len = audio_pad.shape[0] // self.window
+        inp_f0 = None
+        if hasattr(f0_file, "name"):
             try:
+                with open(f0_file.name, "r") as f:
+                    lines = f.read().strip("\n").split("\n")
+                inp_f0 = []
+                for line in lines:
+                    inp_f0.append([float(i) for i in line.split(",")])
+                inp_f0 = np.array(inp_f0, dtype="float32")
             except:
                 traceback.print_exc()
+        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
+        pitch, pitchf = None, None
+        if if_f0 == 1:
+            pitch, pitchf = self.get_f0(
+                input_audio_path,
+                audio_pad,
+                p_len,
+                f0_up_key,
+                f0_method,
+                filter_radius,
+                inp_f0,
+            )
+            pitch = pitch[:p_len]
+            pitchf = pitchf[:p_len]
+            if "mps" not in str(self.device) or "xpu" not in str(self.device):
+                pitchf = pitchf.astype(np.float32)
+            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
+            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
+        t2 = ttime()
+        times[1] += t2 - t1
         for t in opt_ts:
+            t = t // self.window * self.window
+            if if_f0 == 1:
+                audio_opt.append(
+                    self.vc(
+                        model,
+                        net_g,
+                        sid,
+                        audio_pad[s : t + self.t_pad2 + self.window],
+                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                        times,
+                        index,
+                        big_npy,
+                        index_rate,
+                        version,
+                        protect,
+                    )[self.t_pad_tgt : -self.t_pad_tgt]
+                )
+            else:
+                audio_opt.append(
+                    self.vc(
+                        model,
+                        net_g,
+                        sid,
+                        audio_pad[s : t + self.t_pad2 + self.window],
+                        None,
+                        None,
+                        times,
+                        index,
+                        big_npy,
+                        index_rate,
+                        version,
+                        protect,
+                    )[self.t_pad_tgt : -self.t_pad_tgt]
+                )
             s = t
+        if if_f0 == 1:
+            audio_opt.append(
+                self.vc(
+                    model,
+                    net_g,
+                    sid,
+                    audio_pad[t:],
+                    pitch[:, t // self.window :] if t is not None else pitch,
+                    pitchf[:, t // self.window :] if t is not None else pitchf,
+                    times,
+                    index,
+                    big_npy,
+                    index_rate,
+                    version,
+                    protect,
+                )[self.t_pad_tgt : -self.t_pad_tgt]
+            )
+        else:
+            audio_opt.append(
+                self.vc(
+                    model,
+                    net_g,
+                    sid,
+                    audio_pad[t:],
+                    None,
+                    None,
+                    times,
+                    index,
+                    big_npy,
+                    index_rate,
+                    version,
+                    protect,
+                )[self.t_pad_tgt : -self.t_pad_tgt]
+            )
+        audio_opt = np.concatenate(audio_opt)
+        if rms_mix_rate != 1:
+            audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
+        if tgt_sr != resample_sr >= 16000:
+            audio_opt = librosa.resample(
+                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
+            )
+        audio_max = np.abs(audio_opt).max() / 0.99
+        max_int16 = 32768
+        if audio_max > 1:
+            max_int16 /= audio_max
+        audio_opt = (audio_opt * max_int16).astype(np.int16)
+        del pitch, pitchf, sid
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        return audio_opt
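
Note: the entry point has grown from pipeline(model, net_g, dv, audio, times, f0_up_key) to an eighteen-parameter signature, and the class itself was renamed from VC to Pipeline, with a config object replacing the loose device/is_half arguments. A hypothetical invocation, where every object (hubert_model, net_g, config, audio) is assumed to be prepared elsewhere:

    # config must expose x_pad, x_query, x_center, x_max, is_half, device
    pipe = Pipeline(tgt_sr=40000, config=config)
    out = pipe.pipeline(
        hubert_model, net_g, sid=0, audio=audio,  # audio: 1-D 16 kHz waveform
        input_audio_path="input.wav", times=[0, 0, 0],
        f0_up_key=0, f0_method="pm",              # "pm" needs no extra models
        file_index="", index_rate=0,              # empty index -> no retrieval
        if_f0=1, filter_radius=3, tgt_sr=40000,
        resample_sr=0, rms_mix_rate=1, version="v2",
        protect=0.33, f0_file=None,
    )  # returns int16 audio at tgt_sr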