dangtr0408 committed on
Commit 0b61e28 (verified)
1 Parent(s): c96b495

Upload 33 files
.gitattributes ADDED
@@ -0,0 +1,36 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ Modules/__pycache__/__init__.cpython-311.pyc
2
+ Modules/__pycache__/hifigan.cpython-311.pyc
3
+ Modules/__pycache__/utils.cpython-311.pyc
4
+ Modules/__pycache__/__init__.cpython-311.pyc
5
+ Modules/__pycache__/hifigan.cpython-311.pyc
6
+ Modules/__pycache__/utils.cpython-311.pyc
7
+ __pycache__/inference.cpython-311.pyc
8
+ __pycache__/models.cpython-311.pyc
Audio/10_michael.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:733023e56be0434c66ac3b855c9aaac29d64f3a060c295a75e700ecfd34c16f0
3
+ size 620444
Audio/11_fenrir.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abde72631473e48455d54cf585a0b1f229e6e77e9748ed1acef5678a40b08c08
3
+ size 537644
Audio/12_puck.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:409cc59612472a0d4bb717613f539dafdb334411ed651ab6988f7fca8b922905
3
+ size 619244
Audio/13_echo.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6925e6737a67fcbf8dce32d22d29d086d81627b82c6edbfc92b3706f27479ff
3
+ size 524444
Audio/14_eric.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97b8bbf6a880e46730387ee7bb4bfba6c049ed58c4ec8680ec44f83df669eff1
3
+ size 573644
Audio/15_liam.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95842cfe6d1093deb37447b0e5993b6c18f7e5591c3fb1fb3dd230641925de44
3
+ size 541244
Audio/16_onyx.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25487ea7634b470392d787bfefb79da0a6a56dc26087ab27b62fa70aac43554d
3
+ size 514844
Audio/17_santa.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80bc56619904ccbd93ed813fc54491f7b83eb8b8fd6c8a1626bd9177f96a23cd
3
+ size 583244
Audio/18_adam.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b84a1b122273a45d98b5cbf725f4633e4cccb4a0788b8a46cc9faa4b8612419b
3
+ size 517244
Audio/1_heart.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:978b285ff24f274a1f4fe4551b0d57a5df704ca5ce83284e839ffe96c2dc3dfd
3
+ size 547244
Audio/2_belle.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:459a64fa12dfb530320e8dab2f4057d7868ae4c020b447e8df3402149fa2be59
3
+ size 357644
Audio/3_kore.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e55fc5c463d01d46c090be5457c59727ee52f2ecbeba8be9b38862850418c0c3
3
+ size 276044
Audio/4_sarah.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ae7416f410104b0cedc1cc9c7365a89fd16a1599733f8f416e7618943d0acb8
3
+ size 640844
Audio/5_nova.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:252c20a3f55bfe0ea7f42fbd638f6d4113ade7918630d1d37e166e11143f74f8
3
+ size 336044
Audio/6_sky.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc985eb31aa7e2088f852c55282ec6ff72365486478a627bcd56ce2387a8d5b2
3
+ size 502844
Audio/7_alloy.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd7868816449f2139e21661dcbc13d3d553c558627d4c50fada1f7c22ce7f86c
3
+ size 632444
Audio/8_jessica.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8d7573154905c901281e767f25be2dbceae731c891da409f5b7c0be3096bd5d
3
+ size 477644
Audio/9_river.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75a3b2fc9d4e93ded21f28cccc6ae7bf7a39bf04fed7f2d4d36e59db0792eedd
3
+ size 472844
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Aaron (Yinghao) Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
Models/base_model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:821deb4efee549b7024f37236e86b4bcb023870baf0ddb9f407fb514253340d1
3
+ size 1692092384
Models/config.yaml ADDED
@@ -0,0 +1,70 @@
1
+ log_dir: ./Models/Finetune
2
+ save_freq: 1
3
+ log_interval: 5
4
+ device: cuda
5
+ epochs: 50
6
+ batch_size: 2
7
+ max_len: 310 # maximum number of frames
8
+ pretrained_model: ./Models/Finetune/base_model.pth
9
+ load_only_params: false # set to true if do not want to load epoch numbers and optimizer parameters
10
+
11
+ data_params:
12
+ train_data: ../../Data_Speech/LibriTTS/train.txt
13
+ val_data: ../../Data_Speech/LibriTTS/val.txt
14
+ root_path: ../../Data_Speech/
15
+
16
+ preprocess_params:
17
+ sr: 24000
18
+ spect_params:
19
+ n_fft: 2048
20
+ win_length: 1200
21
+ hop_length: 300
22
+
23
+ model_params:
24
+ dim_in: 64
25
+ hidden_dim: 512
26
+ max_conv_dim: 512
27
+ n_layer: 3
28
+ n_mels: 80
29
+
30
+ n_token: 178 # number of phoneme tokens
31
+ max_dur: 50 # maximum duration of a single phoneme
32
+ style_dim: 128 # style vector size
33
+
34
+ dropout: 0.2
35
+
36
+ ASR_params:
37
+ input_dim: 80
38
+ hidden_dim: 256
39
+ n_token: 178 # number of phoneme tokens
40
+ n_layers: 6
41
+ token_embedding_dim: 512
42
+
43
+ JDC_params:
44
+ num_class: 1
45
+ seq_len: 192
46
+
47
+ # config for decoder
48
+ decoder:
49
+ type: hifigan # either hifigan or istftnet
50
+ resblock_kernel_sizes: [3,7,11]
51
+ upsample_rates : [10,5,3,2]
52
+ upsample_initial_channel: 512
53
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
54
+ upsample_kernel_sizes: [20,10,6,4]
55
+
56
+ loss_params:
57
+ lambda_mel: 5. # mel reconstruction loss
58
+ lambda_gen: 1. # generator loss
59
+
60
+ lambda_mono: 1. # monotonic alignment loss (TMA)
61
+ lambda_s2s: 1. # sequence-to-sequence loss (TMA)
62
+
63
+ lambda_F0: 1. # F0 reconstruction loss
64
+ lambda_norm: 1. # norm reconstruction loss
65
+ lambda_dur: 1. # duration loss
66
+ lambda_ce: 20. # duration predictor probability output CE loss
67
+
68
+ optimizer_params:
69
+ lr: 0.0001 # general learning rate
70
+ ft_lr: 0.00001 # learning rate for acoustic modules
Models/inference/model.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2763d7b6c5477502d3f2a870eda76bbedae671f0107b15a1060fb4e6771ed634
3
+ size 359997166
Modules/__init__.py ADDED
@@ -0,0 +1 @@
1
+
Modules/hifigan.py ADDED
@@ -0,0 +1,477 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.nn as nn
4
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
5
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
6
+ from .utils import init_weights, get_padding
7
+
8
+ import math
9
+ import random
10
+ import numpy as np
11
+
12
+ LRELU_SLOPE = 0.1
13
+
14
+ class AdaIN1d(nn.Module):
15
+ def __init__(self, style_dim, num_features):
16
+ super().__init__()
17
+ self.norm = nn.InstanceNorm1d(num_features, affine=False)
18
+ self.fc = nn.Linear(style_dim, num_features*2)
19
+
20
+ def forward(self, x, s):
21
+ h = self.fc(s)
22
+ h = h.view(h.size(0), h.size(1), 1)
23
+ gamma, beta = torch.chunk(h, chunks=2, dim=1)
24
+ return (1 + gamma) * self.norm(x) + beta
25
+
26
+ class AdaINResBlock1(torch.nn.Module):
27
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
28
+ super(AdaINResBlock1, self).__init__()
29
+ self.convs1 = nn.ModuleList([
30
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
31
+ padding=get_padding(kernel_size, dilation[0]))),
32
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
33
+ padding=get_padding(kernel_size, dilation[1]))),
34
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
35
+ padding=get_padding(kernel_size, dilation[2])))
36
+ ])
37
+ self.convs1.apply(init_weights)
38
+
39
+ self.convs2 = nn.ModuleList([
40
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
41
+ padding=get_padding(kernel_size, 1))),
42
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
43
+ padding=get_padding(kernel_size, 1))),
44
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
45
+ padding=get_padding(kernel_size, 1)))
46
+ ])
47
+ self.convs2.apply(init_weights)
48
+
49
+ self.adain1 = nn.ModuleList([
50
+ AdaIN1d(style_dim, channels),
51
+ AdaIN1d(style_dim, channels),
52
+ AdaIN1d(style_dim, channels),
53
+ ])
54
+
55
+ self.adain2 = nn.ModuleList([
56
+ AdaIN1d(style_dim, channels),
57
+ AdaIN1d(style_dim, channels),
58
+ AdaIN1d(style_dim, channels),
59
+ ])
60
+
61
+ self.alpha1 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs1))])
62
+ self.alpha2 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs2))])
63
+
64
+
65
+ def forward(self, x, s):
66
+ for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2):
67
+ xt = n1(x, s)
68
+ xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2) # Snake1D
69
+ xt = c1(xt)
70
+ xt = n2(xt, s)
71
+ xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2) # Snake1D
72
+ xt = c2(xt)
73
+ x = xt + x
74
+ return x
75
+
76
+ def remove_weight_norm(self):
77
+ for l in self.convs1:
78
+ remove_weight_norm(l)
79
+ for l in self.convs2:
80
+ remove_weight_norm(l)
81
+
82
+ class SineGen(torch.nn.Module):
83
+ """ Definition of sine generator
84
+ SineGen(samp_rate, harmonic_num = 0,
85
+ sine_amp = 0.1, noise_std = 0.003,
86
+ voiced_threshold = 0,
87
+ flag_for_pulse=False)
88
+ samp_rate: sampling rate in Hz
89
+ harmonic_num: number of harmonic overtones (default 0)
90
+ sine_amp: amplitude of sine-waveform (default 0.1)
91
+ noise_std: std of Gaussian noise (default 0.003)
92
+ voiced_threshold: F0 threshold for U/V classification (default 0)
93
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
94
+ Note: when flag_for_pulse is True, the first time step of a voiced
95
+ segment is always sin(np.pi) or cos(0)
96
+ """
97
+
98
+ def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
99
+ sine_amp=0.1, noise_std=0.003,
100
+ voiced_threshold=0,
101
+ flag_for_pulse=False):
102
+ super(SineGen, self).__init__()
103
+ self.sine_amp = sine_amp
104
+ self.noise_std = noise_std
105
+ self.harmonic_num = harmonic_num
106
+ self.dim = self.harmonic_num + 1
107
+ self.sampling_rate = samp_rate
108
+ self.voiced_threshold = voiced_threshold
109
+ self.flag_for_pulse = flag_for_pulse
110
+ self.upsample_scale = upsample_scale
111
+
112
+ def _f02uv(self, f0):
113
+ # generate uv signal
114
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
115
+ return uv
116
+
117
+ def _f02sine(self, f0_values):
118
+ """ f0_values: (batchsize, length, dim)
119
+ where dim indicates fundamental tone and overtones
120
+ """
121
+ # convert to F0 in rad. The integer part n can be ignored
122
+ # because 2 * np.pi * n doesn't affect phase
123
+ rad_values = (f0_values / self.sampling_rate) % 1
124
+
125
+ # initial phase noise (no noise for fundamental component)
126
+ rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \
127
+ device=f0_values.device)
128
+ rand_ini[:, 0] = 0
129
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
130
+
131
+ # instantaneous phase sine[t] = sin(2*pi \sum_{i=1}^{t} rad)
132
+ if not self.flag_for_pulse:
133
+ # # for normal case
134
+
135
+ # # To prevent torch.cumsum numerical overflow,
136
+ # # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
137
+ # # Buffer tmp_over_one_idx indicates the time step to add -1.
138
+ # # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
139
+ # tmp_over_one = torch.cumsum(rad_values, 1) % 1
140
+ # tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
141
+ # cumsum_shift = torch.zeros_like(rad_values)
142
+ # cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
143
+
144
+ # phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
145
+ rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
146
+ scale_factor=1/self.upsample_scale,
147
+ mode="linear").transpose(1, 2)
148
+
149
+ # tmp_over_one = torch.cumsum(rad_values, 1) % 1
150
+ # tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
151
+ # cumsum_shift = torch.zeros_like(rad_values)
152
+ # cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
153
+
154
+ phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
155
+ phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
156
+ scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
157
+ sines = torch.sin(phase)
158
+
159
+ else:
160
+ # If necessary, make sure that the first time step of every
161
+ # voiced segments is sin(pi) or cos(0)
162
+ # This is used for pulse-train generation
163
+
164
+ # identify the last time step in unvoiced segments
165
+ uv = self._f02uv(f0_values)
166
+ uv_1 = torch.roll(uv, shifts=-1, dims=1)
167
+ uv_1[:, -1, :] = 1
168
+ u_loc = (uv < 1) * (uv_1 > 0)
169
+
170
+ # get the instantaneous phase
171
+ tmp_cumsum = torch.cumsum(rad_values, dim=1)
172
+ # different batch needs to be processed differently
173
+ for idx in range(f0_values.shape[0]):
174
+ temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
175
+ temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
176
+ # stores the accumulation of i.phase within
177
+ # each voiced segments
178
+ tmp_cumsum[idx, :, :] = 0
179
+ tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
180
+
181
+ # rad_values - tmp_cumsum: remove the accumulation of i.phase
182
+ # within the previous voiced segment.
183
+ i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
184
+
185
+ # get the sines
186
+ sines = torch.cos(i_phase * 2 * np.pi)
187
+ return sines
188
+
189
+ def forward(self, f0):
190
+ """ sine_tensor, uv = forward(f0)
191
+ input F0: tensor(batchsize=1, length, dim=1)
192
+ f0 for unvoiced steps should be 0
193
+ output sine_tensor: tensor(batchsize=1, length, dim)
194
+ output uv: tensor(batchsize=1, length, 1)
195
+ """
196
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
197
+ device=f0.device)
198
+ # fundamental component
199
+ fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
200
+
201
+ # generate sine waveforms
202
+ sine_waves = self._f02sine(fn) * self.sine_amp
203
+
204
+ # generate uv signal
205
+ # uv = torch.ones(f0.shape)
206
+ # uv = uv * (f0 > self.voiced_threshold)
207
+ uv = self._f02uv(f0)
208
+
209
+ # noise: for unvoiced should be similar to sine_amp
210
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
211
+ # . for voiced regions is self.noise_std
212
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
213
+ noise = noise_amp * torch.randn_like(sine_waves)
214
+
215
+ # first: set the unvoiced part to 0 by uv
216
+ # then: additive noise
217
+ sine_waves = sine_waves * uv + noise
218
+ return sine_waves, uv, noise
219
+
220
+
221
+ class SourceModuleHnNSF(torch.nn.Module):
222
+ """ SourceModule for hn-nsf
223
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
224
+ add_noise_std=0.003, voiced_threshod=0)
225
+ sampling_rate: sampling_rate in Hz
226
+ harmonic_num: number of harmonic above F0 (default: 0)
227
+ sine_amp: amplitude of sine source signal (default: 0.1)
228
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
229
+ note that amplitude of noise in unvoiced is decided
230
+ by sine_amp
231
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
232
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
233
+ F0_sampled (batchsize, length, 1)
234
+ Sine_source (batchsize, length, 1)
235
+ noise_source (batchsize, length 1)
236
+ uv (batchsize, length, 1)
237
+ """
238
+
239
+ def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
240
+ add_noise_std=0.003, voiced_threshod=0):
241
+ super(SourceModuleHnNSF, self).__init__()
242
+
243
+ self.sine_amp = sine_amp
244
+ self.noise_std = add_noise_std
245
+
246
+ # to produce sine waveforms
247
+ self.l_sin_gen = SineGen(sampling_rate, upsample_scale, harmonic_num,
248
+ sine_amp, add_noise_std, voiced_threshod)
249
+
250
+ # to merge source harmonics into a single excitation
251
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
252
+ self.l_tanh = torch.nn.Tanh()
253
+
254
+ def forward(self, x):
255
+ """
256
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
257
+ F0_sampled (batchsize, length, 1)
258
+ Sine_source (batchsize, length, 1)
259
+ noise_source (batchsize, length 1)
260
+ """
261
+ # source for harmonic branch
262
+ with torch.no_grad():
263
+ sine_wavs, uv, _ = self.l_sin_gen(x)
264
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
265
+
266
+ # source for noise branch, in the same shape as uv
267
+ noise = torch.randn_like(uv) * self.sine_amp / 3
268
+ return sine_merge, noise, uv
269
+ def padDiff(x):
270
+ return F.pad(F.pad(x, (0,0,-1,1), 'constant', 0) - x, (0,0,0,-1), 'constant', 0)
271
+
272
+ class Generator(torch.nn.Module):
273
+ def __init__(self, style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes):
274
+ super(Generator, self).__init__()
275
+ self.num_kernels = len(resblock_kernel_sizes)
276
+ self.num_upsamples = len(upsample_rates)
277
+ resblock = AdaINResBlock1
278
+
279
+ self.m_source = SourceModuleHnNSF(
280
+ sampling_rate=24000,
281
+ upsample_scale=np.prod(upsample_rates),
282
+ harmonic_num=8, voiced_threshod=10)
283
+
284
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
285
+ self.noise_convs = nn.ModuleList()
286
+ self.ups = nn.ModuleList()
287
+ self.noise_res = nn.ModuleList()
288
+
289
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
290
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
291
+
292
+ self.ups.append(weight_norm(ConvTranspose1d(upsample_initial_channel//(2**i),
293
+ upsample_initial_channel//(2**(i+1)),
294
+ k, u, padding=(u//2 + u%2), output_padding=u%2)))
295
+
296
+ if i + 1 < len(upsample_rates): #
297
+ stride_f0 = np.prod(upsample_rates[i + 1:])
298
+ self.noise_convs.append(Conv1d(
299
+ 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2))
300
+ self.noise_res.append(resblock(c_cur, 7, [1,3,5], style_dim))
301
+ else:
302
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
303
+ self.noise_res.append(resblock(c_cur, 11, [1,3,5], style_dim))
304
+
305
+ self.resblocks = nn.ModuleList()
306
+
307
+ self.alphas = nn.ParameterList()
308
+ self.alphas.append(nn.Parameter(torch.ones(1, upsample_initial_channel, 1)))
309
+
310
+ for i in range(len(self.ups)):
311
+ ch = upsample_initial_channel//(2**(i+1))
312
+ self.alphas.append(nn.Parameter(torch.ones(1, ch, 1)))
313
+
314
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
315
+ self.resblocks.append(resblock(ch, k, d, style_dim))
316
+
317
+ self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
318
+ self.ups.apply(init_weights)
319
+ self.conv_post.apply(init_weights)
320
+
321
+ def forward(self, x, s, f0):
322
+
323
+ f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
324
+
325
+ har_source, noi_source, uv = self.m_source(f0)
326
+ har_source = har_source.transpose(1, 2)
327
+
328
+ for i in range(self.num_upsamples):
329
+ x = x + (1 / self.alphas[i]) * (torch.sin(self.alphas[i] * x) ** 2)
330
+ x_source = self.noise_convs[i](har_source)
331
+ x_source = self.noise_res[i](x_source, s)
332
+
333
+ x = self.ups[i](x)
334
+ x = x + x_source
335
+
336
+ xs = None
337
+ for j in range(self.num_kernels):
338
+ if xs is None:
339
+ xs = self.resblocks[i*self.num_kernels+j](x, s)
340
+ else:
341
+ xs += self.resblocks[i*self.num_kernels+j](x, s)
342
+ x = xs / self.num_kernels
343
+ x = x + (1 / self.alphas[i+1]) * (torch.sin(self.alphas[i+1] * x) ** 2)
344
+ x = self.conv_post(x)
345
+ x = torch.tanh(x)
346
+
347
+ return x
348
+
349
+ def remove_weight_norm(self):
350
+ print('Removing weight norm...')
351
+ for l in self.ups:
352
+ remove_weight_norm(l)
353
+ for l in self.resblocks:
354
+ l.remove_weight_norm()
355
+ if hasattr(self, 'conv_pre'): remove_weight_norm(self.conv_pre)  # this Generator defines no conv_pre; the guard avoids an AttributeError
356
+ remove_weight_norm(self.conv_post)
357
+
358
+
359
+ class AdainResBlk1d(nn.Module):
360
+ def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
361
+ upsample='none', dropout_p=0.0):
362
+ super().__init__()
363
+ self.actv = actv
364
+ self.upsample_type = upsample
365
+ self.upsample = UpSample1d(upsample)
366
+ self.learned_sc = dim_in != dim_out
367
+ self._build_weights(dim_in, dim_out, style_dim)
368
+ self.dropout = nn.Dropout(dropout_p)
369
+
370
+ if upsample == 'none':
371
+ self.pool = nn.Identity()
372
+ else:
373
+ self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
374
+
375
+
376
+ def _build_weights(self, dim_in, dim_out, style_dim):
377
+ self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
378
+ self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
379
+ self.norm1 = AdaIN1d(style_dim, dim_in)
380
+ self.norm2 = AdaIN1d(style_dim, dim_out)
381
+ if self.learned_sc:
382
+ self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
383
+
384
+ def _shortcut(self, x):
385
+ x = self.upsample(x)
386
+ if self.learned_sc:
387
+ x = self.conv1x1(x)
388
+ return x
389
+
390
+ def _residual(self, x, s):
391
+ x = self.norm1(x, s)
392
+ x = self.actv(x)
393
+ x = self.pool(x)
394
+ x = self.conv1(self.dropout(x))
395
+ x = self.norm2(x, s)
396
+ x = self.actv(x)
397
+ x = self.conv2(self.dropout(x))
398
+ return x
399
+
400
+ def forward(self, x, s):
401
+ out = self._residual(x, s)
402
+ out = (out + self._shortcut(x)) / math.sqrt(2)
403
+ return out
404
+
405
+ class UpSample1d(nn.Module):
406
+ def __init__(self, layer_type):
407
+ super().__init__()
408
+ self.layer_type = layer_type
409
+
410
+ def forward(self, x):
411
+ if self.layer_type == 'none':
412
+ return x
413
+ else:
414
+ return F.interpolate(x, scale_factor=2, mode='nearest')
415
+
416
+ class Decoder(nn.Module):
417
+ def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80,
418
+ resblock_kernel_sizes = [3,7,11],
419
+ upsample_rates = [10,5,3,2],
420
+ upsample_initial_channel=512,
421
+ resblock_dilation_sizes=[[1,3,5], [1,3,5], [1,3,5]],
422
+ upsample_kernel_sizes=[20,10,6,4]):
423
+ super().__init__()
424
+
425
+ self.decode = nn.ModuleList()
426
+
427
+ self.encode = AdainResBlk1d(dim_in + 2, 1024, style_dim)
428
+
429
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
430
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
431
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
432
+ self.decode.append(AdainResBlk1d(1024 + 2 + 64, 512, style_dim, upsample=True))
433
+
434
+ self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
435
+
436
+ self.N_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
437
+
438
+ self.asr_res = nn.Sequential(
439
+ weight_norm(nn.Conv1d(512, 64, kernel_size=1)),
440
+ )
441
+
442
+
443
+ self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes)
444
+
445
+
446
+ def forward(self, asr, F0_curve, N, s):
447
+ if self.training:
448
+ downlist = [0, 3, 7]
449
+ F0_down = downlist[random.randint(0, 2)]
450
+ downlist = [0, 3, 7, 15]
451
+ N_down = downlist[random.randint(0, 3)]
452
+ if F0_down:
453
+ F0_curve = nn.functional.conv1d(F0_curve.unsqueeze(1), torch.ones(1, 1, F0_down).to(asr.device), padding=F0_down//2).squeeze(1) / F0_down
454
+ if N_down:
455
+ N = nn.functional.conv1d(N.unsqueeze(1), torch.ones(1, 1, N_down).to(asr.device), padding=N_down//2).squeeze(1) / N_down
456
+
457
+
458
+ F0 = self.F0_conv(F0_curve.unsqueeze(1))
459
+ N = self.N_conv(N.unsqueeze(1))
460
+
461
+ x = torch.cat([asr, F0, N], axis=1)
462
+ x = self.encode(x, s)
463
+
464
+ asr_res = self.asr_res(asr)
465
+
466
+ res = True
467
+ for block in self.decode:
468
+ if res:
469
+ x = torch.cat([x, asr_res, F0, N], axis=1)
470
+ x = block(x, s)
471
+ if block.upsample_type != "none":
472
+ res = False
473
+
474
+ x = self.generator(x, s, F0_curve)
475
+ return x
476
+
477
+
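Not part of the upload, but as a rough smoke test of the `Decoder` defined above: the sketch below instantiates it with the hyperparameters from `Models/config.yaml`, the same way `inference.py` wires it up, and pushes random tensors through it. The dummy shapes (100 aligned frames, a 200-step F0/energy curve) are assumptions chosen only to satisfy the stride-2 `F0_conv`/`N_conv` downsampling.

```python
# Rough smoke test for Modules/hifigan.Decoder (a sketch, not project code).
# Hyperparameters follow Models/config.yaml; the dummy tensor shapes are
# assumptions: the F0/N curves are twice as long as the aligned text features
# because F0_conv and N_conv downsample them by a factor of 2.
import torch
from Modules.hifigan import Decoder

decoder = Decoder(dim_in=512, style_dim=128, dim_out=80,
                  resblock_kernel_sizes=[3, 7, 11],
                  upsample_rates=[10, 5, 3, 2],
                  upsample_initial_channel=512,
                  resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                  upsample_kernel_sizes=[20, 10, 6, 4]).eval()

asr = torch.randn(1, 512, 100)       # aligned text features (B, dim_in, T)
f0  = torch.rand(1, 200) * 200       # F0 curve, length 2*T
n   = torch.randn(1, 200)            # energy/norm curve, length 2*T
s   = torch.randn(1, 128)            # style vector (style_dim)

with torch.no_grad():
    wav = decoder(asr, f0, n, s)     # (1, 1, 60000) = len(f0) * prod(upsample_rates)

print(wav.shape)
```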
Modules/utils.py ADDED
@@ -0,0 +1,14 @@
1
+ from torch.nn.utils import weight_norm  # needed by apply_weight_norm below
+
+ def init_weights(m, mean=0.0, std=0.01):
2
+ classname = m.__class__.__name__
3
+ if classname.find("Conv") != -1:
4
+ m.weight.data.normal_(mean, std)
5
+
6
+
7
+ def apply_weight_norm(m):
8
+ classname = m.__class__.__name__
9
+ if classname.find("Conv") != -1:
10
+ weight_norm(m)
11
+
12
+
13
+ def get_padding(kernel_size, dilation=1):
14
+ return int((kernel_size*dilation - dilation)/2)
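A quick worked check of `get_padding` (a sketch, not part of the upload): for kernel size k and dilation d, the padding (k·d − d)/2 keeps a stride-1 `Conv1d` output the same length as its input, which is how `Modules/hifigan.py` uses it.

```python
# Worked check of get_padding (sketch only): with kernel_size=7, dilation=3 the
# effective receptive field is (7-1)*3 + 1 = 19, so padding 9 per side preserves
# the sequence length of a stride-1 Conv1d, as used throughout Modules/hifigan.py.
import torch
from torch.nn import Conv1d
from Modules.utils import get_padding

x = torch.randn(1, 8, 100)                       # (batch, channels, time)
conv = Conv1d(8, 8, kernel_size=7, dilation=3,
              padding=get_padding(7, 3))         # get_padding(7, 3) == 9
assert conv(x).shape[-1] == x.shape[-1]          # output length == input length
```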
README.md ADDED
@@ -0,0 +1,88 @@
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ base_model:
6
+ - yl4579/StyleTTS2-LibriTTS
7
+ pipeline_tag: text-to-speech
8
+ ---
9
+
10
+ # StyleTTS 2 - lite
11
+
12
+ ## Online Demo
13
+ Explore the model on Hugging Face Spaces:
14
+ https://huggingface.co/spaces/dangtr0408/StyleTTS2-lite-space
15
+
16
+ ## Fine-tune
17
+ https://github.com/dangtr0408/StyleTTS2-lite
18
+
19
+ ## Training Details
20
+
21
+ 1. **Base Checkpoint:** Initialized from the official StyleTTS 2 weights pre-trained on LibriTTS.
22
+ 2. **Components Removal:** PLBert, Diffusion, Prosodic Encoder, SLM, and Spectral Normalization.
23
+ 3. **Training Data:** LibriTTS corpus.
24
+ 4. **Training Schedule:** Trained for 100,000 steps.
25
+
26
+ ## Model Architecture
27
+
28
+ | Component | Parameters |
29
+ | -------------- | ------------- |
30
+ | Decoder | 54,289,492 |
31
+ | Predictor | 16,194,612 |
32
+ | Style Encoder | 13,845,440 |
33
+ | Text Encoder | 5,612,320 |
34
+ | **Total** | **89,941,576** |
35
+
36
+ ## Prerequisites
37
+
38
+ - **Python:** Version 3.7 or higher
39
+ - **Git:** To clone the repository
40
+
41
+ ## Installation & Setup
42
+
43
+ 1. Clone the repository
44
+
45
+ ```bash
46
+
47
+ git clone https://huggingface.co/dangtr0408/StyleTTS2-lite
48
+
49
+ cd StyleTTS2-lite
50
+
51
+ ```
52
+
53
+ 2. Install dependencies:
54
+
55
+ ```bash
56
+
57
+ pip install -r requirements.txt
58
+
59
+ ```
60
+
61
+
62
+
63
+ 3. On **Linux**, manually install espeak-ng:
64
+
65
+ ```bash
66
+
67
+ sudo apt-get install espeak-ng
68
+
69
+ ```
70
+
71
+ ## Usage Example
72
+
73
+ See the run.ipynb file, or the minimal sketch after this README.
74
+
75
+ ## Disclaimer
76
+
77
+ **By using these pre-trained models, you agree to inform listeners that the speech samples are synthesized, unless you have permission to use the voice being synthesized. That is, you agree to clone only voices whose speakers have granted permission, either directly or by license, before publishing synthesized audio; otherwise, you must publicly state that the voices are synthesized.**
78
+
79
+
80
+ ## References
81
+
82
+ - [yl4579/StyleTTS2](https://arxiv.org/abs/2306.07691)
83
+
84
+ - [jik876/hifi-gan](https://github.com/jik876/hifi-gan)
85
+
86
+ ## License
87
+
88
+ **Code: MIT License**
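Since the README defers usage to run.ipynb, here is a rough, hedged sketch of how the `StyleTTS2` wrapper defined in `inference.py` (added below) can be driven. The reference clip, the phonemized sample text, the output filename, and the use of `soundfile` for saving are assumptions for illustration, not the notebook's exact code.

```python
# Rough usage sketch based on the StyleTTS2 class in inference.py (not the
# run.ipynb notebook). The reference clip, sample text, output path, and the
# soundfile dependency are assumptions for illustration.
import soundfile as sf
from inference import StyleTTS2

model = StyleTTS2(config_path="Models/config.yaml",
                  models_path="Models/inference/model.pth").eval()

# Reference speaker: a short clip (roughly 4-20 s); 'speed' scales durations.
speaker = {"path": "Audio/1_heart.wav", "speed": 1.0}
style = model.get_styles(speaker, denoise=0.3, avg_style=True)

# generate() expects phonemized text (symbols from meldataset.py); plain
# English letters are also in the symbol table, so raw text roughly works too.
text = "həlˈoʊ wˈɜːld, ðɪs ɪz ɐ tˈɛst."
wav = model.generate(text, style, stabilize=True, n_merge=16)

sf.write("output.wav", wav, 24000)  # the model operates at 24 kHz
```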
inference.py ADDED
@@ -0,0 +1,262 @@
1
+ import re
2
+ import yaml
3
+ from munch import Munch
4
+ import numpy as np
5
+ import librosa
6
+ import noisereduce as nr
7
+ from meldataset import TextCleaner
8
+ import torch
9
+ import torchaudio
10
+ from nltk.tokenize import word_tokenize
11
+ import nltk
12
+ nltk.download('punkt_tab')
13
+
14
+ from models import ProsodyPredictor, TextEncoder, StyleEncoder
15
+ from Modules.hifigan import Decoder
16
+
17
+ class Preprocess:
18
+ def __text_normalize(self, text):
19
+ punctuation = [",", "、", "،", ";", "(", ".", "。", "…", "!", "–", ":", "?"]
20
+ map_to = "."
21
+ punctuation_pattern = re.compile(f"[{''.join(re.escape(p) for p in punctuation)}]")
22
+ #replace punctuation that acts like a comma or period
23
+ text = punctuation_pattern.sub(map_to, text)
24
+ #replace consecutive whitespace chars with a single space and strip leading/trailing spaces
25
+ text = re.sub(r'\s+', ' ', text).strip()
26
+ return text
27
+ def __merge_fragments(self, texts, n):
28
+ merged = []
29
+ i = 0
30
+ while i < len(texts):
31
+ fragment = texts[i]
32
+ j = i + 1
33
+ while len(fragment.split()) < n and j < len(texts):
34
+ fragment += ", " + texts[j]
35
+ j += 1
36
+ merged.append(fragment)
37
+ i = j
38
+ if len(merged[-1].split()) < n and len(merged) > 1: #handle last sentence
39
+ merged[-2] = merged[-2] + ", " + merged[-1]
40
+ del merged[-1]
41
+ else:
42
+ merged[-1] = merged[-1]
43
+ return merged
44
+ def wave_preprocess(self, wave):
45
+ to_mel = torchaudio.transforms.MelSpectrogram(n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
46
+ mean, std = -4, 4
47
+ wave_tensor = torch.from_numpy(wave).float()
48
+ mel_tensor = to_mel(wave_tensor)
49
+ mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
50
+ return mel_tensor
51
+ def text_preprocess(self, text, n_merge=12):
52
+ text_norm = self.__text_normalize(text).split(".")#split by sentences.
53
+ text_norm = [s.strip() for s in text_norm]
54
+ text_norm = list(filter(lambda x: x != '', text_norm)) #filter empty index
55
+ text_norm = self.__merge_fragments(text_norm, n=n_merge) #merge if a sentence has fewer than n words
56
+ return text_norm
57
+ def length_to_mask(self, lengths):
58
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
59
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
60
+ return mask
61
+
62
+ #For inference only
63
+ class StyleTTS2(torch.nn.Module):
64
+ def __init__(self, config_path, models_path):
65
+ super().__init__()
66
+ self.register_buffer("get_device", torch.empty(0))
67
+ self.preprocess = Preprocess()
68
+
69
+ config = yaml.safe_load(open(config_path))
70
+ args = self.__recursive_munch(config['model_params'])
71
+
72
+ assert args.decoder.type in ['hifigan'], 'Decoder type unknown'
73
+
74
+ self.decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
75
+ resblock_kernel_sizes = args.decoder.resblock_kernel_sizes,
76
+ upsample_rates = args.decoder.upsample_rates,
77
+ upsample_initial_channel=args.decoder.upsample_initial_channel,
78
+ resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
79
+ upsample_kernel_sizes=args.decoder.upsample_kernel_sizes)
80
+ self.predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
81
+ self.text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
82
+ self.style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)# acoustic style encoder
83
+
84
+ self.__load_models(models_path)
85
+
86
+ def __recursive_munch(self, d):
87
+ if isinstance(d, dict):
88
+ return Munch((k, self.__recursive_munch(v)) for k, v in d.items())
89
+ elif isinstance(d, list):
90
+ return [self.__recursive_munch(v) for v in d]
91
+ else:
92
+ return d
93
+
94
+ def __replace_outliers_zscore(self, tensor, threshold=3.0, factor=0.95):
95
+ mean = tensor.mean()
96
+ std = tensor.std()
97
+ z = (tensor - mean) / std
98
+
99
+ # Identify outliers
100
+ outlier_mask = torch.abs(z) > threshold
101
+ # Compute replacement value, respecting sign
102
+ sign = torch.sign(tensor - mean)
103
+ replacement = mean + sign * (threshold * std * factor)
104
+
105
+ result = tensor.clone()
106
+ result[outlier_mask] = replacement[outlier_mask]
107
+
108
+ return result
109
+
110
+ def __load_models(self, models_path):
111
+ module_params = []
112
+ model = {'decoder':self.decoder, 'predictor':self.predictor, 'text_encoder':self.text_encoder, 'style_encoder':self.style_encoder}
113
+
114
+ params_whole = torch.load(models_path, map_location='cpu')
115
+ params = params_whole['net']
116
+ params = {key: value for key, value in params.items() if key in model.keys()}
117
+
118
+ for key in model:
119
+ try:
120
+ model[key].load_state_dict(params[key])
121
+ except:
122
+ from collections import OrderedDict
123
+ state_dict = params[key]
124
+ new_state_dict = OrderedDict()
125
+ for k, v in state_dict.items():
126
+ name = k[7:] # remove `module.`
127
+ new_state_dict[name] = v
128
+ model[key].load_state_dict(new_state_dict, strict=False)
129
+
130
+ total_params = sum(p.numel() for p in model[key].parameters())
131
+ print(key,":",total_params)
132
+ module_params.append(total_params)
133
+
134
+ print('\nTotal',":",sum(module_params))
135
+
136
+ def __compute_style(self, path, denoise, split_dur):
137
+ device = self.get_device.device
138
+ denoise = min(denoise, 1)
139
+ if split_dur != 0: split_dur = max(int(split_dur), 1)
140
+ max_samples = 24000*20 #max 20 seconds ref audio
141
+ print("Computing the style for:", path)
142
+
143
+ wave, sr = librosa.load(path, sr=24000)
144
+ audio, index = librosa.effects.trim(wave, top_db=30)
145
+ if sr != 24000:
146
+ audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)
147
+ if len(audio) > max_samples:
148
+ audio = audio[:max_samples]
149
+
150
+ if denoise > 0.0:
151
+ audio_denoise = nr.reduce_noise(y=audio, sr=sr, n_fft=2048, win_length=1200, hop_length=300)
152
+ audio = audio*(1-denoise) + audio_denoise*denoise
153
+
154
+ with torch.no_grad():
155
+ if split_dur>0 and len(audio)/sr>=4: #Only effective if audio length is >= 4s
156
+ #This option will split the ref audio to multiple parts, calculate styles and average them
157
+ count = 0
158
+ ref_s = None
159
+ jump = sr*split_dur
160
+ total_len = len(audio)
161
+
162
+ #Need to init before the loop
163
+ mel_tensor = self.preprocess.wave_preprocess(audio[0:jump]).to(device)
164
+ ref_s = self.style_encoder(mel_tensor.unsqueeze(1))
165
+ count += 1
166
+ for i in range(jump, total_len, jump):
167
+ if i+jump >= total_len:
168
+ left_dur = (total_len-i)/sr
169
+ if left_dur >= 1: #Still count if left over dur is >= 1s
170
+ mel_tensor = self.preprocess.wave_preprocess(audio[i:total_len]).to(device)
171
+ ref_s += self.style_encoder(mel_tensor.unsqueeze(1))
172
+ count += 1
173
+ continue
174
+ mel_tensor = self.preprocess.wave_preprocess(audio[i:i+jump]).to(device)
175
+ ref_s += self.style_encoder(mel_tensor.unsqueeze(1))
176
+ count += 1
177
+ ref_s /= count
178
+ else:
179
+ mel_tensor = self.preprocess.wave_preprocess(audio).to(device)
180
+ ref_s = self.style_encoder(mel_tensor.unsqueeze(1))
181
+
182
+ return ref_s
183
+
184
+ def __inference(self, phonem, ref_s, speed=1, prev_d_mean=0, t=0.1):
185
+ device = self.get_device.device
186
+ speed = min(max(speed, 0.0001), 2) #speed range [0, 2]
187
+
188
+ phonem = ' '.join(word_tokenize(phonem))
189
+ tokens = TextCleaner()(phonem)
190
+ tokens.insert(0, 0)
191
+ tokens.append(0)
192
+ tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
193
+
194
+ with torch.no_grad():
195
+ input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
196
+ text_mask = self.preprocess.length_to_mask(input_lengths).to(device)
197
+
198
+ # encode
199
+ t_en = self.text_encoder(tokens, input_lengths, text_mask)
200
+ s = ref_s.to(device)
201
+
202
+ # cal alignment
203
+ d = self.predictor.text_encoder(t_en, s, input_lengths, text_mask)
204
+ x, _ = self.predictor.lstm(d)
205
+ duration = self.predictor.duration_proj(x)
206
+ duration = torch.sigmoid(duration).sum(axis=-1)
207
+
208
+ if prev_d_mean != 0:#Stabilize speaking speed between splits
209
+ dur_stats = torch.empty(duration.shape).normal_(mean=prev_d_mean, std=duration.std()).to(device)
210
+ else:
211
+ dur_stats = torch.empty(duration.shape).normal_(mean=duration.mean(), std=duration.std()).to(device)
212
+ duration = duration*(1-t) + dur_stats*t
213
+ duration[:,1:-2] = self.__replace_outliers_zscore(duration[:,1:-2]) #Normalize outlier
214
+
215
+ duration /= speed
216
+
217
+ pred_dur = torch.round(duration.squeeze()).clamp(min=1)
218
+ pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
219
+ c_frame = 0
220
+ for i in range(pred_aln_trg.size(0)):
221
+ pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
222
+ c_frame += int(pred_dur[i].data)
223
+ alignment = pred_aln_trg.unsqueeze(0).to(device)
224
+
225
+ # encode prosody
226
+ en = (d.transpose(-1, -2) @ alignment)
227
+ F0_pred, N_pred = self.predictor.F0Ntrain(en, s)
228
+ asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
229
+
230
+ out = self.decoder(asr, F0_pred, N_pred, s)
231
+
232
+ return out.squeeze().cpu().numpy(), duration.mean()
233
+
234
+ def get_styles(self, speaker, denoise=0.3, avg_style=True):
235
+ if avg_style: split_dur = 3
236
+ else: split_dur = 0
237
+ style = {}
238
+ ref_s = self.__compute_style(speaker['path'], denoise=denoise, split_dur=split_dur)
239
+ style = {
240
+ 'style': ref_s,
241
+ 'path': speaker['path'],
242
+ 'speed': speaker['speed'],
243
+ }
244
+ return style
245
+
246
+ def generate(self, phonem, style, stabilize=True, n_merge=16):
247
+ if stabilize: smooth_value=0.2
248
+ else: smooth_value=0
249
+
250
+ list_wav = []
251
+ prev_d_mean = 0
252
+
253
+ print("Generating Audio...")
254
+ text_norm = self.preprocess.text_preprocess(phonem, n_merge=n_merge)
255
+ for sentence in text_norm:
256
+ wav, prev_d_mean = self.__inference(sentence, style['style'], speed=style['speed'], prev_d_mean=prev_d_mean, t=smooth_value)
257
+ wav = wav[4000:-4000] #Remove weird pulse and silent tokens
258
+ list_wav.append(wav)
259
+
260
+ final_wav = np.concatenate(list_wav)
261
+ final_wav = np.concatenate([np.zeros([4000]), final_wav, np.zeros([4000])], axis=0) # add padding
262
+ return final_wav
meldataset.py ADDED
@@ -0,0 +1,218 @@
1
+ #coding: utf-8
2
+ import os
3
+ import os.path as osp
4
+ import time
5
+ import random
6
+ import numpy as np
7
+ import random
8
+ import soundfile as sf
9
+ import librosa
10
+
11
+ import torch
12
+ from torch import nn
13
+ import torch.nn.functional as F
14
+ import torchaudio
15
+ from torch.utils.data import DataLoader
16
+
17
+ import logging
18
+ logger = logging.getLogger(__name__)
19
+ logger.setLevel(logging.DEBUG)
20
+
21
+ import pandas as pd
22
+
23
+ ##########################################################
24
+ _pad = "$"
25
+ _punctuation = ';:,.!?¡¿—…"«»“” '
26
+ _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
27
+ _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
28
+ _extend = "" #ADD MORE SYMBOLS HERE
29
+
30
+ # Export all symbols:
31
+ symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) + list(_extend)
32
+
33
+ dicts = {}
34
+ for i in range(len((symbols))):
35
+ dicts[symbols[i]] = i
36
+
37
+ # To check the total number of symbols, copy this block elsewhere and run print(len(dicts) + 1)
38
+ ##########################################################
39
+
40
+ class TextCleaner:
41
+ def __init__(self, dummy=None):
42
+ self.word_index_dictionary = dicts
43
+ def __call__(self, text):
44
+ indexes = []
45
+ for char in text:
46
+ try:
47
+ indexes.append(self.word_index_dictionary[char])
48
+ except KeyError as e:
49
+ #print(char)
50
+ continue
51
+ return indexes
52
+
53
+ np.random.seed(1)
54
+ random.seed(1)
55
+ SPECT_PARAMS = {
56
+ "n_fft": 2048,
57
+ "win_length": 1200,
58
+ "hop_length": 300
59
+ }
60
+ MEL_PARAMS = {
61
+ "n_mels": 80,
62
+ }
63
+
64
+ to_mel = torchaudio.transforms.MelSpectrogram(
65
+ n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
66
+ mean, std = -4, 4
67
+
68
+ def preprocess(wave):
69
+ wave_tensor = torch.from_numpy(wave).float()
70
+ mel_tensor = to_mel(wave_tensor)
71
+ mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
72
+ return mel_tensor
73
+
74
+ class FilePathDataset(torch.utils.data.Dataset):
75
+ def __init__(self,
76
+ data_list,
77
+ root_path,
78
+ sr=24000,
79
+ data_augmentation=False,
80
+ validation=False
81
+ ):
82
+
83
+ spect_params = SPECT_PARAMS
84
+ mel_params = MEL_PARAMS
85
+
86
+ _data_list = [l.strip().split('|') for l in data_list]
87
+ self.data_list = _data_list #[data if len(data) == 3 else (*data, 0) for data in _data_list] #append speakerid=0 for all
88
+ self.text_cleaner = TextCleaner()
89
+ self.sr = sr
90
+
91
+ self.df = pd.DataFrame(self.data_list)
92
+
93
+ self.to_melspec = torchaudio.transforms.MelSpectrogram(**MEL_PARAMS)
94
+
95
+ self.mean, self.std = -4, 4
96
+ self.data_augmentation = data_augmentation and (not validation)
97
+ self.max_mel_length = 192
98
+
99
+ self.root_path = root_path
100
+
101
+ def __len__(self):
102
+ return len(self.data_list)
103
+
104
+ def __getitem__(self, idx):
105
+ data = self.data_list[idx]
106
+ path = data[0]
107
+
108
+ wave, text_tensor = self._load_tensor(data)
109
+
110
+ mel_tensor = preprocess(wave).squeeze()
111
+
112
+ acoustic_feature = mel_tensor.squeeze()
113
+ length_feature = acoustic_feature.size(1)
114
+ acoustic_feature = acoustic_feature[:, :(length_feature - length_feature % 2)]
115
+
116
+ return acoustic_feature, text_tensor, path, wave
117
+
118
+ def _load_tensor(self, data):
119
+ wave_path, text = data
120
+ wave, sr = sf.read(osp.join(self.root_path, wave_path))
121
+ if wave.shape[-1] == 2:
122
+ wave = wave[:, 0].squeeze()
123
+ if sr != 24000:
124
+ wave = librosa.resample(wave, orig_sr=sr, target_sr=24000)
125
+ print(wave_path, sr)
126
+
127
+ # Adding half a second padding.
128
+ wave = np.concatenate([np.zeros([12000]), wave, np.zeros([12000])], axis=0)
129
+
130
+ text = self.text_cleaner(text)
131
+
132
+ text.insert(0, 0)
133
+ text.append(0)
134
+
135
+ text = torch.LongTensor(text)
136
+
137
+ return wave, text
138
+
139
+ def _load_data(self, data):
140
+ wave, text_tensor = self._load_tensor(data)
141
+ mel_tensor = preprocess(wave).squeeze()
142
+
143
+ mel_length = mel_tensor.size(1)
144
+ if mel_length > self.max_mel_length:
145
+ random_start = np.random.randint(0, mel_length - self.max_mel_length)
146
+ mel_tensor = mel_tensor[:, random_start:random_start + self.max_mel_length]
147
+
148
+ return mel_tensor
149
+
150
+
151
+ class Collater(object):
152
+ """
153
+ Args:
154
+ adaptive_batch_size (bool): if true, decrease batch size when long data comes.
155
+ """
156
+
157
+ def __init__(self, return_wave=False):
158
+ self.text_pad_index = 0
159
+ self.min_mel_length = 192
160
+ self.max_mel_length = 192
161
+ self.return_wave = return_wave
162
+
163
+
164
+ def __call__(self, batch):
165
+ batch_size = len(batch)
166
+
167
+ # sort by mel length
168
+ lengths = [b[0].shape[1] for b in batch]
169
+ batch_indexes = np.argsort(lengths)[::-1]
170
+ batch = [batch[bid] for bid in batch_indexes]
171
+
172
+ nmels = batch[0][0].size(0)
173
+ max_mel_length = max([b[0].shape[1] for b in batch])
174
+ max_text_length = max([b[1].shape[0] for b in batch])
175
+
176
+ mels = torch.zeros((batch_size, nmels, max_mel_length)).float()
177
+ texts = torch.zeros((batch_size, max_text_length)).long()
178
+
179
+ input_lengths = torch.zeros(batch_size).long()
180
+ output_lengths = torch.zeros(batch_size).long()
181
+ paths = ['' for _ in range(batch_size)]
182
+ waves = [None for _ in range(batch_size)]
183
+
184
+ for bid, (mel, text, path, wave) in enumerate(batch):
185
+ mel_size = mel.size(1)
186
+ text_size = text.size(0)
187
+ mels[bid, :, :mel_size] = mel
188
+ texts[bid, :text_size] = text
189
+ input_lengths[bid] = text_size
190
+ output_lengths[bid] = mel_size
191
+ paths[bid] = path
192
+
193
+ waves[bid] = wave
194
+
195
+ return waves, texts, input_lengths, mels, output_lengths
196
+
197
+
198
+
199
+ def build_dataloader(path_list,
200
+ root_path,
201
+ validation=False,
202
+ batch_size=4,
203
+ num_workers=1,
204
+ device='cpu',
205
+ collate_config={},
206
+ dataset_config={}):
207
+
208
+ dataset = FilePathDataset(path_list, root_path, validation=validation, **dataset_config)
209
+ collate_fn = Collater(**collate_config)
210
+ data_loader = DataLoader(dataset,
211
+ batch_size=batch_size,
212
+ shuffle=(not validation),
213
+ num_workers=num_workers,
214
+ drop_last=(not validation),
215
+ collate_fn=collate_fn,
216
+ pin_memory=(device != 'cpu'))
217
+
218
+ return data_loader
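As a small illustration (not part of the upload) of the symbol table and `TextCleaner` defined at the top of `meldataset.py`: each character of a phonemized string maps to its index in `symbols`, unknown characters are silently skipped, and `len(symbols) + 1` is the count the in-file comment suggests checking against `n_token` in `Models/config.yaml`. The sample string is an assumption.

```python
# Small sketch: TextCleaner maps characters of a (phonemized) string to indices
# in the symbols table; characters outside the table are skipped. The sample
# string is an arbitrary assumption.
from meldataset import TextCleaner, symbols

cleaner = TextCleaner()
tokens = cleaner("hˈɛloʊ.")
print(tokens)              # indices into `symbols` for h, ˈ, ɛ, l, o, ʊ, .
print(len(symbols) + 1)    # compare with n_token (178) in Models/config.yaml
```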
models.py ADDED
@@ -0,0 +1,532 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from torch.nn.utils import weight_norm
6
+
7
+ from munch import Munch
8
+
9
+ class LearnedDownSample(nn.Module):
10
+ def __init__(self, layer_type, dim_in):
11
+ super().__init__()
12
+ self.layer_type = layer_type
13
+
14
+ if self.layer_type == 'none':
15
+ self.conv = nn.Identity()
16
+ elif self.layer_type == 'timepreserve':
17
+ self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0))
18
+ elif self.layer_type == 'half':
19
+ self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1)
20
+ else:
21
+ raise RuntimeError('Got unexpected downsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
22
+
23
+ def forward(self, x):
24
+ return self.conv(x)
25
+
26
+ class LearnedUpSample(nn.Module):
27
+ def __init__(self, layer_type, dim_in):
28
+ super().__init__()
29
+ self.layer_type = layer_type
30
+
31
+ if self.layer_type == 'none':
32
+ self.conv = nn.Identity()
33
+ elif self.layer_type == 'timepreserve':
34
+ self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
35
+ elif self.layer_type == 'half':
36
+ self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
37
+ else:
38
+ raise RuntimeError('Got unexpected upsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
39
+
40
+
41
+ def forward(self, x):
42
+ return self.conv(x)
43
+
44
+ class DownSample(nn.Module):
45
+ def __init__(self, layer_type):
46
+ super().__init__()
47
+ self.layer_type = layer_type
48
+
49
+ def forward(self, x):
50
+ if self.layer_type == 'none':
51
+ return x
52
+ elif self.layer_type == 'timepreserve':
53
+ return F.avg_pool2d(x, (2, 1))
54
+ elif self.layer_type == 'half':
55
+ if x.shape[-1] % 2 != 0:
56
+ x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
57
+ return F.avg_pool2d(x, 2)
58
+ else:
59
+ raise RuntimeError('Got unexpected downsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
60
+
61
+
62
+ class UpSample(nn.Module):
63
+ def __init__(self, layer_type):
64
+ super().__init__()
65
+ self.layer_type = layer_type
66
+
67
+ def forward(self, x):
68
+ if self.layer_type == 'none':
69
+ return x
70
+ elif self.layer_type == 'timepreserve':
71
+ return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
72
+ elif self.layer_type == 'half':
73
+ return F.interpolate(x, scale_factor=2, mode='nearest')
74
+ else:
75
+ raise RuntimeError('Got unexpected upsample type %s, expected one of [none, timepreserve, half]' % self.layer_type)
76
+
77
+
78
+ class ResBlk(nn.Module):
79
+ def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
80
+ normalize=False, downsample='none'):
81
+ super().__init__()
82
+ self.actv = actv
83
+ self.normalize = normalize
84
+ self.downsample = DownSample(downsample)
85
+ self.downsample_res = LearnedDownSample(downsample, dim_in)
86
+ self.learned_sc = dim_in != dim_out
87
+ self._build_weights(dim_in, dim_out)
88
+
89
+ def _build_weights(self, dim_in, dim_out):
90
+ self.conv1 = nn.Conv2d(dim_in, dim_in, 3, 1, 1)
91
+ self.conv2 = nn.Conv2d(dim_in, dim_out, 3, 1, 1)
92
+ if self.normalize:
93
+ self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
94
+ self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
95
+ if self.learned_sc:
96
+ self.conv1x1 = nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False)
97
+
98
+ def _shortcut(self, x):
99
+ if self.learned_sc:
100
+ x = self.conv1x1(x)
101
+ if self.downsample:
102
+ x = self.downsample(x)
103
+ return x
104
+
105
+ def _residual(self, x):
106
+ if self.normalize:
107
+ x = self.norm1(x)
108
+ x = self.actv(x)
109
+ x = self.conv1(x)
110
+ x = self.downsample_res(x)
111
+ if self.normalize:
112
+ x = self.norm2(x)
113
+ x = self.actv(x)
114
+ x = self.conv2(x)
115
+ return x
116
+
117
+ def forward(self, x):
118
+ x = self._shortcut(x) + self._residual(x)
119
+ return x / math.sqrt(2) # unit variance
120
+
121
+ class StyleEncoder(nn.Module):
122
+ def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
123
+ super().__init__()
124
+ blocks = []
125
+ blocks += [nn.Conv2d(1, dim_in, 3, 1, 1)]
126
+
127
+ repeat_num = 4
128
+ for _ in range(repeat_num):
129
+ dim_out = min(dim_in*2, max_conv_dim)
130
+ blocks += [ResBlk(dim_in, dim_out, downsample='half')]
131
+ dim_in = dim_out
132
+
133
+ blocks += [nn.LeakyReLU(0.2)]
134
+ blocks += [nn.Conv2d(dim_out, dim_out, 5, 1, 0)]
135
+ blocks += [nn.AdaptiveAvgPool2d(1)]
136
+ blocks += [nn.LeakyReLU(0.2)]
137
+ self.shared = nn.Sequential(*blocks)
138
+
139
+ self.unshared = nn.Linear(dim_out, style_dim)
140
+
141
+ def forward(self, x):
142
+ h = self.shared(x)
143
+ h = h.view(h.size(0), -1)
144
+ s = self.unshared(h)
145
+
146
+ return s
147
+
148
+ class LinearNorm(torch.nn.Module):
149
+ def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
150
+ super(LinearNorm, self).__init__()
151
+ self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
152
+
153
+ torch.nn.init.xavier_uniform_(
154
+ self.linear_layer.weight,
155
+ gain=torch.nn.init.calculate_gain(w_init_gain))
156
+
157
+ def forward(self, x):
158
+ return self.linear_layer(x)
159
+
+ class ResBlk1d(nn.Module):
+     def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
+                  normalize=False, downsample='none', dropout_p=0.2):
+         super().__init__()
+         self.actv = actv
+         self.normalize = normalize
+         self.downsample_type = downsample
+         self.learned_sc = dim_in != dim_out
+         self._build_weights(dim_in, dim_out)
+         self.dropout_p = dropout_p
+
+         if self.downsample_type == 'none':
+             self.pool = nn.Identity()
+         else:
+             self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1))
+
+     def _build_weights(self, dim_in, dim_out):
+         self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1))
+         self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
+         if self.normalize:
+             self.norm1 = nn.InstanceNorm1d(dim_in, affine=True)
+             self.norm2 = nn.InstanceNorm1d(dim_in, affine=True)
+         if self.learned_sc:
+             self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
+
+     def downsample(self, x):
+         if self.downsample_type == 'none':
+             return x
+         else:
+             if x.shape[-1] % 2 != 0:
+                 x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
+             return F.avg_pool1d(x, 2)
+
+     def _shortcut(self, x):
+         if self.learned_sc:
+             x = self.conv1x1(x)
+         x = self.downsample(x)
+         return x
+
+     def _residual(self, x):
+         if self.normalize:
+             x = self.norm1(x)
+         x = self.actv(x)
+         x = F.dropout(x, p=self.dropout_p, training=self.training)
+
+         x = self.conv1(x)
+         x = self.pool(x)
+         if self.normalize:
+             x = self.norm2(x)
+
+         x = self.actv(x)
+         x = F.dropout(x, p=self.dropout_p, training=self.training)
+
+         x = self.conv2(x)
+         return x
+
+     def forward(self, x):
+         x = self._shortcut(x) + self._residual(x)
+         return x / math.sqrt(2)  # unit variance
+
+ class LayerNorm(nn.Module):
+     def __init__(self, channels, eps=1e-5):
+         super().__init__()
+         self.channels = channels
+         self.eps = eps
+
+         self.gamma = nn.Parameter(torch.ones(channels))
+         self.beta = nn.Parameter(torch.zeros(channels))
+
+     def forward(self, x):
+         x = x.transpose(1, -1)
+         x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+         return x.transpose(1, -1)
+
+ class TextEncoder(nn.Module):
+     def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
+         super().__init__()
+         self.embedding = nn.Embedding(n_symbols, channels)
+
+         padding = (kernel_size - 1) // 2
+         self.cnn = nn.ModuleList()
+         for _ in range(depth):
+             self.cnn.append(nn.Sequential(
+                 weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
+                 LayerNorm(channels),
+                 actv,
+                 nn.Dropout(0.2),
+             ))
+         # self.cnn = nn.Sequential(*self.cnn)
+
+         self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True)
+
+     def forward(self, x, input_lengths, m):
+         x = self.embedding(x)  # [B, T, emb]
+         x = x.transpose(1, 2)  # [B, emb, T]
+         m = m.to(input_lengths.device).unsqueeze(1)
+         x.masked_fill_(m, 0.0)
+
+         for c in self.cnn:
+             x = c(x)
+             x.masked_fill_(m, 0.0)
+
+         x = x.transpose(1, 2)  # [B, T, chn]
+
+         input_lengths = input_lengths.cpu().numpy()
+         x = nn.utils.rnn.pack_padded_sequence(
+             x, input_lengths, batch_first=True, enforce_sorted=False)
+
+         self.lstm.flatten_parameters()
+         x, _ = self.lstm(x)
+         x, _ = nn.utils.rnn.pad_packed_sequence(
+             x, batch_first=True)
+
+         x = x.transpose(-1, -2)
+         x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
+
+         x_pad[:, :, :x.shape[-1]] = x
+         x = x_pad.to(x.device)
+
+         x.masked_fill_(m, 0.0)
+
+         return x
+
+     def inference(self, x):
+         x = self.embedding(x)
+         x = x.transpose(1, 2)
+         x = self.cnn(x)
+         x = x.transpose(1, 2)
+         self.lstm.flatten_parameters()
+         x, _ = self.lstm(x)
+         return x
+
+     def length_to_mask(self, lengths):
+         mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+         mask = torch.gt(mask+1, lengths.unsqueeze(1))
+         return mask
+
+
+
+ class AdaIN1d(nn.Module):
+     def __init__(self, style_dim, num_features):
+         super().__init__()
+         self.norm = nn.InstanceNorm1d(num_features, affine=False)
+         self.fc = nn.Linear(style_dim, num_features*2)
+
+     def forward(self, x, s):
+         h = self.fc(s)
+         h = h.view(h.size(0), h.size(1), 1)
+         gamma, beta = torch.chunk(h, chunks=2, dim=1)
+         return (1 + gamma) * self.norm(x) + beta
+
+ class UpSample1d(nn.Module):
+     def __init__(self, layer_type):
+         super().__init__()
+         self.layer_type = layer_type
+
+     def forward(self, x):
+         if self.layer_type == 'none':
+             return x
+         else:
+             return F.interpolate(x, scale_factor=2, mode='nearest')
+
+ class AdainResBlk1d(nn.Module):
+     def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
+                  upsample='none', dropout_p=0.0):
+         super().__init__()
+         self.actv = actv
+         self.upsample_type = upsample
+         self.upsample = UpSample1d(upsample)
+         self.learned_sc = dim_in != dim_out
+         self._build_weights(dim_in, dim_out, style_dim)
+         self.dropout = nn.Dropout(dropout_p)
+
+         if upsample == 'none':
+             self.pool = nn.Identity()
+         else:
+             self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
+
+     def _build_weights(self, dim_in, dim_out, style_dim):
+         self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
+         self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
+         self.norm1 = AdaIN1d(style_dim, dim_in)
+         self.norm2 = AdaIN1d(style_dim, dim_out)
+         if self.learned_sc:
+             self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
+
+     def _shortcut(self, x):
+         x = self.upsample(x)
+         if self.learned_sc:
+             x = self.conv1x1(x)
+         return x
+
+     def _residual(self, x, s):
+         x = self.norm1(x, s)
+         x = self.actv(x)
+         x = self.pool(x)
+         x = self.conv1(self.dropout(x))
+         x = self.norm2(x, s)
+         x = self.actv(x)
+         x = self.conv2(self.dropout(x))
+         return x
+
+     def forward(self, x, s):
+         out = self._residual(x, s)
+         out = (out + self._shortcut(x)) / math.sqrt(2)
+         return out
+
+ class AdaLayerNorm(nn.Module):
+     def __init__(self, style_dim, channels, eps=1e-5):
+         super().__init__()
+         self.channels = channels
+         self.eps = eps
+
+         self.fc = nn.Linear(style_dim, channels*2)
+
+     def forward(self, x, s):
+         x = x.transpose(-1, -2)
+         x = x.transpose(1, -1)
+
+         h = self.fc(s)
+         h = h.view(h.size(0), h.size(1), 1)
+         gamma, beta = torch.chunk(h, chunks=2, dim=1)
+         gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1)
+
+         x = F.layer_norm(x, (self.channels,), eps=self.eps)
+         x = (1 + gamma) * x + beta
+         return x.transpose(1, -1).transpose(-1, -2)
+
+ class ProsodyPredictor(nn.Module):
+
+     def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
+         super().__init__()
+
+         self.text_encoder = DurationEncoder(sty_dim=style_dim,
+                                             d_model=d_hid,
+                                             nlayers=nlayers,
+                                             dropout=dropout)
+
+         self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
+         self.duration_proj = LinearNorm(d_hid, max_dur)
+
+         self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
+         self.F0 = nn.ModuleList()
+         self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
+         self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
+         self.F0.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
+
+         self.N = nn.ModuleList()
+         self.N.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
+         self.N.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
+         self.N.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
+
+         self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
+         self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
+
+     def forward(self, texts, style, text_lengths, alignment, m):
+         d = self.text_encoder(texts, style, text_lengths, m)
+
+         batch_size = d.shape[0]
+         text_size = d.shape[1]
+
+         # predict duration
+         input_lengths = text_lengths.cpu().numpy()
+         x = nn.utils.rnn.pack_padded_sequence(
+             d, input_lengths, batch_first=True, enforce_sorted=False)
+
+         m = m.to(text_lengths.device).unsqueeze(1)
+
+         self.lstm.flatten_parameters()
+         x, _ = self.lstm(x)
+         x, _ = nn.utils.rnn.pad_packed_sequence(
+             x, batch_first=True)
+
+         x_pad = torch.zeros([x.shape[0], m.shape[-1], x.shape[-1]])
+
+         x_pad[:, :x.shape[1], :] = x
+         x = x_pad.to(x.device)
+
+         duration = self.duration_proj(nn.functional.dropout(x, 0.5, training=self.training))
+
+         en = (d.transpose(-1, -2) @ alignment)
+
+         return duration.squeeze(-1), en
+
+     def F0Ntrain(self, x, s):
+         x, _ = self.shared(x.transpose(-1, -2))
+
+         F0 = x.transpose(-1, -2)
+         for block in self.F0:
+             F0 = block(F0, s)
+         F0 = self.F0_proj(F0)
+
+         N = x.transpose(-1, -2)
+         for block in self.N:
+             N = block(N, s)
+         N = self.N_proj(N)
+
+         return F0.squeeze(1), N.squeeze(1)
+
+     def length_to_mask(self, lengths):
+         mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+         mask = torch.gt(mask+1, lengths.unsqueeze(1))
+         return mask
+
+ class DurationEncoder(nn.Module):
+
+     def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
+         super().__init__()
+         self.lstms = nn.ModuleList()
+         for _ in range(nlayers):
+             self.lstms.append(nn.LSTM(d_model + sty_dim,
+                                       d_model // 2,
+                                       num_layers=1,
+                                       batch_first=True,
+                                       bidirectional=True,
+                                       dropout=dropout))
+             self.lstms.append(AdaLayerNorm(sty_dim, d_model))
+
+         self.dropout = dropout
+         self.d_model = d_model
+         self.sty_dim = sty_dim
+
+     def forward(self, x, style, text_lengths, m):
+         masks = m.to(text_lengths.device)
+
+         x = x.permute(2, 0, 1)
+         s = style.expand(x.shape[0], x.shape[1], -1)
+         x = torch.cat([x, s], axis=-1)
+         x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
+
+         x = x.transpose(0, 1)
+         input_lengths = text_lengths.cpu().numpy()
+         x = x.transpose(-1, -2)
+
+         for block in self.lstms:
+             if isinstance(block, AdaLayerNorm):
+                 x = block(x.transpose(-1, -2), style).transpose(-1, -2)
+                 x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
+                 x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
+             else:
+                 x = x.transpose(-1, -2)
+                 x = nn.utils.rnn.pack_padded_sequence(
+                     x, input_lengths, batch_first=True, enforce_sorted=False)
+                 block.flatten_parameters()
+                 x, _ = block(x)
+                 x, _ = nn.utils.rnn.pad_packed_sequence(
+                     x, batch_first=True)
+                 x = F.dropout(x, p=self.dropout, training=self.training)
+                 x = x.transpose(-1, -2)
+
+                 x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
+
+                 x_pad[:, :, :x.shape[-1]] = x
+                 x = x_pad.to(x.device)
+
+         return x.transpose(-1, -2)
+
+     def inference(self, x, style):
+         x = self.embedding(x.transpose(-1, -2)) * math.sqrt(self.d_model)
+         style = style.expand(x.shape[0], x.shape[1], -1)
+         x = torch.cat([x, style], axis=-1)
+         src = self.pos_encoder(x)
+         output = self.transformer_encoder(src).transpose(0, 1)
+         return output
+
+     def length_to_mask(self, lengths):
+         mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+         mask = torch.gt(mask+1, lengths.unsqueeze(1))
+         return mask
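A quick way to check the blocks above is a shape test. The sketch below is illustrative only: it assumes the classes are importable from models.py next to that file's existing torch imports, and the dimensions (style_dim=128, channels=512, n_symbols=178, 80 mel bins) are placeholders rather than the values in Models/config.yaml.

# Minimal shape sketch for the modules defined above (assumed hyperparameters).
import torch
from models import StyleEncoder, TextEncoder  # assumed module path

style_encoder = StyleEncoder(dim_in=48, style_dim=128, max_conv_dim=384)
text_encoder = TextEncoder(channels=512, kernel_size=5, depth=3, n_symbols=178)

# StyleEncoder maps a mel spectrogram [batch, 1, n_mels, frames] to one style
# vector per utterance.
mel = torch.randn(2, 1, 80, 120)
style = style_encoder(mel)
print(style.shape)    # torch.Size([2, 128])

# TextEncoder takes padded token ids, their lengths, and the padding mask
# produced by its own length_to_mask helper.
tokens = torch.randint(0, 178, (2, 50))
lengths = torch.tensor([50, 42])
mask = text_encoder.length_to_mask(lengths)
hidden = text_encoder(tokens, lengths, mask)
print(hidden.shape)   # torch.Size([2, 512, 50])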
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ torch
+ torchaudio
+ numpy
+ PyYAML
+ munch
+ nltk
+ librosa
+ noisereduce
+ phonemizer
+ espeakng-loader
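After installing the list above (for example, pip install -r requirements.txt), a quick import check can confirm the environment. The snippet below is only a convenience sketch: note that PyYAML imports as yaml and espeakng-loader as espeakng_loader, and that phonemizer's eSpeak backend still needs the espeak-ng library itself (run.ipynb loads it through espeakng_loader on Windows).

# Sanity-check that every dependency listed above can be imported.
import importlib

packages = ["torch", "torchaudio", "numpy", "yaml", "munch",
            "nltk", "librosa", "noisereduce", "phonemizer", "espeakng_loader"]

for name in packages:
    try:
        module = importlib.import_module(name)
        print(f"{name:16s} OK  ({getattr(module, '__version__', 'version n/a')})")
    except ImportError as exc:
        print(f"{name:16s} MISSING -> {exc}")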
run.ipynb ADDED
@@ -0,0 +1,176 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "5a3ddcc8",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from inference import StyleTTS2\n",
+     "\n",
+     "import librosa\n",
+     "import IPython.display as ipd\n",
+     "import torch.cuda\n",
+     "\n",
+     "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "092cfb69",
+    "metadata": {},
+    "source": [
+     "### Load G2P"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "a152ec13",
+    "metadata": {},
+    "source": [
+     "If you did not use eSpeak for your language, please add your own G2P (a minimal example sketch follows after this notebook)."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "ca224f37",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import sys\n",
+     "import phonemizer\n",
+     "if sys.platform.startswith(\"win\"):\n",
+     "    try:\n",
+     "        from phonemizer.backend.espeak.wrapper import EspeakWrapper\n",
+     "        import espeakng_loader\n",
+     "        EspeakWrapper.set_library(espeakng_loader.get_library_path())\n",
+     "    except Exception as e:\n",
+     "        print(e)\n",
+     "\n",
+     "def get_phoneme(text, lang):\n",
+     "    try:\n",
+     "        my_phonemizer = phonemizer.backend.EspeakBackend(language=lang, preserve_punctuation=True, with_stress=True, language_switch='remove-flags')\n",
+     "        return my_phonemizer.phonemize([text])[0]\n",
+     "    except Exception as e:\n",
+     "        print(e)"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "7b9cecbe",
+    "metadata": {},
+    "source": [
+     "### Load models"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "e7b9c01d",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "config_path = \"Models/config.yaml\"\n",
+     "models_path = \"Models/inference/model.pth\""
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "b803110e",
+    "metadata": {},
+    "source": [
+     "### Synthesize speech\n",
+     "\n",
+     "Note: The reference audio has a huge impact on the result. It is best to select a clip around 10 seconds long that is consistent in both tone and speed."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "78396f70",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "speaker = {\n",
+     "    \"path\": \"./Audio/1_heart.wav\",  # Ref audio path\n",
+     "    \"speed\": 1.0,  # Speaking speed\n",
+     "}\n",
+     "\n",
+     "max_samples = 24000*20  # max 20 seconds ref audio\n",
+     "print(speaker['path'])\n",
+     "wave, sr = librosa.load(speaker['path'], sr=24000)\n",
+     "audio, index = librosa.effects.trim(wave, top_db=30)\n",
+     "if sr != 24000: audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)\n",
+     "if len(audio) > max_samples: audio = audio[:max_samples]\n",
+     "display(ipd.Audio(audio, rate=24000, normalize=True))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "395959f1",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "text = '''\n",
+     "Nearly 300 scholars currently working in the United States have applied for positions at Aix-Marseille University in France, which has announced a program to provide a haven for academics affected by the Trump administration's policies.\n",
+     "Aix-Marseille launched the \"Safe Place for Science\" initiative earlier this year, offering three-year funded placements for approximately 20 researchers. The program aims to support scholars facing budget cuts and policy changes that have disrupted U.S. academic institutions.\n",
+     "'''"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "16194211",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "model = StyleTTS2(config_path, models_path).eval().to(device)\n",
+     "avg_style = True   # BOOL  Split the ref audio and calculate the avg styles.\n",
+     "stabilize = False  # BOOL  Stabilize speaking speed.\n",
+     "denoise = 0.3      # FLOAT Adjust the strength of the denoiser. Value range is [0, 1].\n",
+     "n_merge = 16       # INT   Avoid short sentences by merging when a sentence has fewer than n words."
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "980c6fbb",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "with torch.no_grad():\n",
+     "    phonemes = get_phoneme(text=text, lang=\"en-us\")\n",
+     "\n",
+     "    styles = model.get_styles(speaker, denoise, avg_style)\n",
+     "    r = model.generate(phonemes, styles, stabilize, n_merge)\n",
+     "\n",
+     "print('Synthesized:')\n",
+     "display(ipd.Audio(r, rate=24000, normalize=True))"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "base",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.7"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
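Referring to the "Load G2P" note above: get_phoneme can be swapped for any G2P with the same text-in, phoneme-string-out interface. The sketch below is a hypothetical stand-in, not part of this repository; the lexicon entries are invented for illustration, and a real G2P would come from a pronunciation dictionary or a trained model for the target language.

# Hypothetical drop-in replacement for the notebook's get_phoneme(text, lang).
LEXICON = {
    "hello": "həloʊ",   # invented example entries
    "world": "wɜːld",
}

def get_phoneme(text, lang=None):
    # Look each word up in the lexicon and fall back to the raw word, so
    # unknown tokens remain visible in the output instead of disappearing.
    words = text.lower().split()
    return " ".join(LEXICON.get(w.strip(".,!?"), w) for w in words)

# The rest of the notebook stays unchanged:
# phonemes = get_phoneme(text=text, lang="my-lang")
# r = model.generate(phonemes, styles, stabilize, n_merge)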