Upload 33 files
- .gitattributes +36 -0
- .gitignore +8 -0
- Audio/10_michael.wav +3 -0
- Audio/11_fenrir.wav +3 -0
- Audio/12_puck.wav +3 -0
- Audio/13_echo.wav +3 -0
- Audio/14_eric.wav +3 -0
- Audio/15_liam.wav +3 -0
- Audio/16_onyx.wav +3 -0
- Audio/17_santa.wav +3 -0
- Audio/18_adam.wav +3 -0
- Audio/1_heart.wav +3 -0
- Audio/2_belle.wav +3 -0
- Audio/3_kore.wav +3 -0
- Audio/4_sarah.wav +3 -0
- Audio/5_nova.wav +3 -0
- Audio/6_sky.wav +3 -0
- Audio/7_alloy.wav +3 -0
- Audio/8_jessica.wav +3 -0
- Audio/9_river.wav +3 -0
- LICENSE +21 -0
- Models/base_model.pth +3 -0
- Models/config.yaml +70 -0
- Models/inference/model.pth +3 -0
- Modules/__init__.py +1 -0
- Modules/hifigan.py +477 -0
- Modules/utils.py +14 -0
- README.md +88 -0
- inference.py +262 -0
- meldataset.py +218 -0
- models.py +532 -0
- requirements.txt +10 -0
- run.ipynb +176 -0
.gitattributes
ADDED
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,8 @@
+Modules/__pycache__/__init__.cpython-311.pyc
+Modules/__pycache__/hifigan.cpython-311.pyc
+Modules/__pycache__/utils.cpython-311.pyc
+Modules/__pycache__/__init__.cpython-311.pyc
+Modules/__pycache__/hifigan.cpython-311.pyc
+Modules/__pycache__/utils.cpython-311.pyc
+__pycache__/inference.cpython-311.pyc
+__pycache__/models.cpython-311.pyc
Audio/10_michael.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:733023e56be0434c66ac3b855c9aaac29d64f3a060c295a75e700ecfd34c16f0
+size 620444
Audio/11_fenrir.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abde72631473e48455d54cf585a0b1f229e6e77e9748ed1acef5678a40b08c08
+size 537644
Audio/12_puck.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:409cc59612472a0d4bb717613f539dafdb334411ed651ab6988f7fca8b922905
+size 619244
Audio/13_echo.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6925e6737a67fcbf8dce32d22d29d086d81627b82c6edbfc92b3706f27479ff
+size 524444
Audio/14_eric.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97b8bbf6a880e46730387ee7bb4bfba6c049ed58c4ec8680ec44f83df669eff1
+size 573644
Audio/15_liam.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95842cfe6d1093deb37447b0e5993b6c18f7e5591c3fb1fb3dd230641925de44
+size 541244
Audio/16_onyx.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25487ea7634b470392d787bfefb79da0a6a56dc26087ab27b62fa70aac43554d
+size 514844
Audio/17_santa.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80bc56619904ccbd93ed813fc54491f7b83eb8b8fd6c8a1626bd9177f96a23cd
+size 583244
Audio/18_adam.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b84a1b122273a45d98b5cbf725f4633e4cccb4a0788b8a46cc9faa4b8612419b
+size 517244
Audio/1_heart.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:978b285ff24f274a1f4fe4551b0d57a5df704ca5ce83284e839ffe96c2dc3dfd
+size 547244
Audio/2_belle.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:459a64fa12dfb530320e8dab2f4057d7868ae4c020b447e8df3402149fa2be59
+size 357644
Audio/3_kore.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e55fc5c463d01d46c090be5457c59727ee52f2ecbeba8be9b38862850418c0c3
+size 276044
Audio/4_sarah.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ae7416f410104b0cedc1cc9c7365a89fd16a1599733f8f416e7618943d0acb8
+size 640844
Audio/5_nova.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:252c20a3f55bfe0ea7f42fbd638f6d4113ade7918630d1d37e166e11143f74f8
+size 336044
Audio/6_sky.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc985eb31aa7e2088f852c55282ec6ff72365486478a627bcd56ce2387a8d5b2
+size 502844
Audio/7_alloy.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd7868816449f2139e21661dcbc13d3d553c558627d4c50fada1f7c22ce7f86c
+size 632444
Audio/8_jessica.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8d7573154905c901281e767f25be2dbceae731c891da409f5b7c0be3096bd5d
+size 477644
Audio/9_river.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75a3b2fc9d4e93ded21f28cccc6ae7bf7a39bf04fed7f2d4d36e59db0792eedd
+size 472844
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Aaron (Yinghao) Li
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
Models/base_model.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:821deb4efee549b7024f37236e86b4bcb023870baf0ddb9f407fb514253340d1
+size 1692092384
Models/config.yaml
ADDED
@@ -0,0 +1,70 @@
+log_dir: ./Models/Finetune
+save_freq: 1
+log_interval: 5
+device: cuda
+epochs: 50
+batch_size: 2
+max_len: 310 # maximum number of frames
+pretrained_model: ./Models/Finetune/base_model.pth
+load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
+
+data_params:
+  train_data: ../../Data_Speech/LibriTTS/train.txt
+  val_data: ../../Data_Speech/LibriTTS/val.txt
+  root_path: ../../Data_Speech/
+
+preprocess_params:
+  sr: 24000
+  spect_params:
+    n_fft: 2048
+    win_length: 1200
+    hop_length: 300
+
+model_params:
+  dim_in: 64
+  hidden_dim: 512
+  max_conv_dim: 512
+  n_layer: 3
+  n_mels: 80
+
+  n_token: 178 # number of phoneme tokens
+  max_dur: 50 # maximum duration of a single phoneme
+  style_dim: 128 # style vector size
+
+  dropout: 0.2
+
+  ASR_params:
+    input_dim: 80
+    hidden_dim: 256
+    n_token: 178 # number of phoneme tokens
+    n_layers: 6
+    token_embedding_dim: 512
+
+  JDC_params:
+    num_class: 1
+    seq_len: 192
+
+  # config for decoder
+  decoder:
+    type: hifigan # either hifigan or istftnet
+    resblock_kernel_sizes: [3, 7, 11]
+    upsample_rates: [10, 5, 3, 2]
+    upsample_initial_channel: 512
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    upsample_kernel_sizes: [20, 10, 6, 4]
+
+loss_params:
+  lambda_mel: 5. # mel reconstruction loss
+  lambda_gen: 1. # generator loss
+
+  lambda_mono: 1. # monotonic alignment loss (TMA)
+  lambda_s2s: 1. # sequence-to-sequence loss (TMA)
+
+  lambda_F0: 1. # F0 reconstruction loss
+  lambda_norm: 1. # norm reconstruction loss
+  lambda_dur: 1. # duration loss
+  lambda_ce: 20. # duration predictor probability output CE loss
+
+optimizer_params:
+  lr: 0.0001 # general learning rate
+  ft_lr: 0.00001 # learning rate for acoustic modules
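Note: inference.py (below) consumes this file with yaml.safe_load and wraps model_params for attribute-style access, which is why decoder sits nested under model_params above. A minimal sketch of that loading path, mirroring the repo's __recursive_munch helper:

```python
import yaml
from munch import Munch

def recursive_munch(d):
    # Recursively wrap dicts so nested keys are attribute-accessible.
    if isinstance(d, dict):
        return Munch((k, recursive_munch(v)) for k, v in d.items())
    if isinstance(d, list):
        return [recursive_munch(v) for v in d]
    return d

config = yaml.safe_load(open("Models/config.yaml"))
args = recursive_munch(config["model_params"])
print(args.decoder.type)            # "hifigan"
print(args.decoder.upsample_rates)  # [10, 5, 3, 2]
```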
Models/inference/model.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2763d7b6c5477502d3f2a870eda76bbedae671f0107b15a1060fb4e6771ed634
+size 359997166
Modules/__init__.py
ADDED
@@ -0,0 +1 @@
+
Modules/hifigan.py
ADDED
@@ -0,0 +1,477 @@
+import torch
+import torch.nn.functional as F
+import torch.nn as nn
+from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from .utils import init_weights, get_padding
+
+import math
+import random
+import numpy as np
+
+LRELU_SLOPE = 0.1
+
+class AdaIN1d(nn.Module):
+    def __init__(self, style_dim, num_features):
+        super().__init__()
+        self.norm = nn.InstanceNorm1d(num_features, affine=False)
+        self.fc = nn.Linear(style_dim, num_features*2)
+
+    def forward(self, x, s):
+        h = self.fc(s)
+        h = h.view(h.size(0), h.size(1), 1)
+        gamma, beta = torch.chunk(h, chunks=2, dim=1)
+        return (1 + gamma) * self.norm(x) + beta
+
+class AdaINResBlock1(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
+        super(AdaINResBlock1, self).__init__()
+        self.convs1 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                               padding=get_padding(kernel_size, dilation[0]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                               padding=get_padding(kernel_size, dilation[1]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+                               padding=get_padding(kernel_size, dilation[2])))
+        ])
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1)))
+        ])
+        self.convs2.apply(init_weights)
+
+        self.adain1 = nn.ModuleList([
+            AdaIN1d(style_dim, channels),
+            AdaIN1d(style_dim, channels),
+            AdaIN1d(style_dim, channels),
+        ])
+
+        self.adain2 = nn.ModuleList([
+            AdaIN1d(style_dim, channels),
+            AdaIN1d(style_dim, channels),
+            AdaIN1d(style_dim, channels),
+        ])
+
+        self.alpha1 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs1))])
+        self.alpha2 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs2))])
+
+    def forward(self, x, s):
+        for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2):
+            xt = n1(x, s)
+            xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2)  # Snake1D
+            xt = c1(xt)
+            xt = n2(xt, s)
+            xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2)  # Snake1D
+            xt = c2(xt)
+            x = xt + x
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            remove_weight_norm(l)
+        for l in self.convs2:
+            remove_weight_norm(l)
+
+class SineGen(torch.nn.Module):
+    """ Definition of sine generator
+    SineGen(samp_rate, harmonic_num = 0,
+            sine_amp = 0.1, noise_std = 0.003,
+            voiced_threshold = 0,
+            flag_for_pulse=False)
+    samp_rate: sampling rate in Hz
+    harmonic_num: number of harmonic overtones (default 0)
+    sine_amp: amplitude of sine waveform (default 0.1)
+    noise_std: std of Gaussian noise (default 0.003)
+    voiced_threshold: F0 threshold for U/V classification (default 0)
+    flag_for_pulse: this SineGen is used inside PulseGen (default False)
+    Note: when flag_for_pulse is True, the first time step of a voiced
+    segment is always sin(np.pi) or cos(0)
+    """
+
+    def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
+                 sine_amp=0.1, noise_std=0.003,
+                 voiced_threshold=0,
+                 flag_for_pulse=False):
+        super(SineGen, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.harmonic_num = harmonic_num
+        self.dim = self.harmonic_num + 1
+        self.sampling_rate = samp_rate
+        self.voiced_threshold = voiced_threshold
+        self.flag_for_pulse = flag_for_pulse
+        self.upsample_scale = upsample_scale
+
+    def _f02uv(self, f0):
+        # generate uv signal
+        uv = (f0 > self.voiced_threshold).type(torch.float32)
+        return uv
+
+    def _f02sine(self, f0_values):
+        """ f0_values: (batchsize, length, dim)
+            where dim indicates fundamental tone and overtones
+        """
+        # convert to F0 in rad. The integer part n can be ignored
+        # because 2 * np.pi * n doesn't affect phase
+        rad_values = (f0_values / self.sampling_rate) % 1
+
+        # initial phase noise (no noise for fundamental component)
+        rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2],
+                              device=f0_values.device)
+        rand_ini[:, 0] = 0
+        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+
+        # instantaneous phase sine[t] = sin(2*pi \sum_i=1^{t} rad)
+        if not self.flag_for_pulse:
+            # # for normal case
+
+            # # To prevent torch.cumsum numerical overflow,
+            # # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
+            # # Buffer tmp_over_one_idx indicates the time step to add -1.
+            # # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
+            # tmp_over_one = torch.cumsum(rad_values, 1) % 1
+            # tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
+            # cumsum_shift = torch.zeros_like(rad_values)
+            # cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+
+            # phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
+            rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
+                                                         scale_factor=1/self.upsample_scale,
+                                                         mode="linear").transpose(1, 2)
+
+            # tmp_over_one = torch.cumsum(rad_values, 1) % 1
+            # tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
+            # cumsum_shift = torch.zeros_like(rad_values)
+            # cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+
+            phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
+            phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
+                                                    scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
+            sines = torch.sin(phase)
+
+        else:
+            # If necessary, make sure that the first time step of every
+            # voiced segment is sin(pi) or cos(0).
+            # This is used for pulse-train generation.
+
+            # identify the last time step in unvoiced segments
+            uv = self._f02uv(f0_values)
+            uv_1 = torch.roll(uv, shifts=-1, dims=1)
+            uv_1[:, -1, :] = 1
+            u_loc = (uv < 1) * (uv_1 > 0)
+
+            # get the instantaneous phase
+            tmp_cumsum = torch.cumsum(rad_values, dim=1)
+            # different batches need to be processed differently
+            for idx in range(f0_values.shape[0]):
+                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
+                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
+                # stores the accumulation of i.phase within
+                # each voiced segment
+                tmp_cumsum[idx, :, :] = 0
+                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
+
+            # rad_values - tmp_cumsum: remove the accumulation of i.phase
+            # within the previous voiced segment.
+            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
+
+            # get the sines
+            sines = torch.cos(i_phase * 2 * np.pi)
+        return sines
+
+    def forward(self, f0):
+        """ sine_tensor, uv = forward(f0)
+        input F0: tensor(batchsize=1, length, dim=1)
+                  f0 for unvoiced steps should be 0
+        output sine_tensor: tensor(batchsize=1, length, dim)
+        output uv: tensor(batchsize=1, length, 1)
+        """
+        f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
+                             device=f0.device)
+        # fundamental component
+        fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
+
+        # generate sine waveforms
+        sine_waves = self._f02sine(fn) * self.sine_amp
+
+        # generate uv signal
+        # uv = torch.ones(f0.shape)
+        # uv = uv * (f0 > self.voiced_threshold)
+        uv = self._f02uv(f0)
+
+        # noise: for unvoiced should be similar to sine_amp
+        #        std = self.sine_amp/3 -> max value ~ self.sine_amp
+        #        for voiced regions is self.noise_std
+        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+        noise = noise_amp * torch.randn_like(sine_waves)
+
+        # first: set the unvoiced part to 0 by uv
+        # then: additive noise
+        sine_waves = sine_waves * uv + noise
+        return sine_waves, uv, noise
+
+
+class SourceModuleHnNSF(torch.nn.Module):
+    """ SourceModule for hn-nsf
+    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0)
+    sampling_rate: sampling rate in Hz
+    harmonic_num: number of harmonics above F0 (default: 0)
+    sine_amp: amplitude of sine source signal (default: 0.1)
+    add_noise_std: std of additive Gaussian noise (default: 0.003)
+        note that amplitude of noise in unvoiced is decided
+        by sine_amp
+    voiced_threshold: threshold to set U/V given F0 (default: 0)
+    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+    F0_sampled (batchsize, length, 1)
+    Sine_source (batchsize, length, 1)
+    noise_source (batchsize, length, 1)
+    uv (batchsize, length, 1)
+    """
+
+    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0):
+        super(SourceModuleHnNSF, self).__init__()
+
+        self.sine_amp = sine_amp
+        self.noise_std = add_noise_std
+
+        # to produce sine waveforms
+        self.l_sin_gen = SineGen(sampling_rate, upsample_scale, harmonic_num,
+                                 sine_amp, add_noise_std, voiced_threshod)
+
+        # to merge source harmonics into a single excitation
+        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+        self.l_tanh = torch.nn.Tanh()
+
+    def forward(self, x):
+        """
+        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+        F0_sampled (batchsize, length, 1)
+        Sine_source (batchsize, length, 1)
+        noise_source (batchsize, length, 1)
+        """
+        # source for harmonic branch
+        with torch.no_grad():
+            sine_wavs, uv, _ = self.l_sin_gen(x)
+        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+
+        # source for noise branch, in the same shape as uv
+        noise = torch.randn_like(uv) * self.sine_amp / 3
+        return sine_merge, noise, uv
+
+def padDiff(x):
+    return F.pad(F.pad(x, (0,0,-1,1), 'constant', 0) - x, (0,0,0,-1), 'constant', 0)
+
+class Generator(torch.nn.Module):
+    def __init__(self, style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes):
+        super(Generator, self).__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        resblock = AdaINResBlock1
+
+        self.m_source = SourceModuleHnNSF(
+                sampling_rate=24000,
+                upsample_scale=np.prod(upsample_rates),
+                harmonic_num=8, voiced_threshod=10)
+
+        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
+        self.noise_convs = nn.ModuleList()
+        self.ups = nn.ModuleList()
+        self.noise_res = nn.ModuleList()
+
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            c_cur = upsample_initial_channel // (2 ** (i + 1))
+
+            self.ups.append(weight_norm(ConvTranspose1d(upsample_initial_channel//(2**i),
+                                                        upsample_initial_channel//(2**(i+1)),
+                                                        k, u, padding=(u//2 + u%2), output_padding=u%2)))
+
+            if i + 1 < len(upsample_rates):
+                stride_f0 = np.prod(upsample_rates[i + 1:])
+                self.noise_convs.append(Conv1d(
+                    1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2))
+                self.noise_res.append(resblock(c_cur, 7, [1,3,5], style_dim))
+            else:
+                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
+                self.noise_res.append(resblock(c_cur, 11, [1,3,5], style_dim))
+
+        self.resblocks = nn.ModuleList()
+
+        self.alphas = nn.ParameterList()
+        self.alphas.append(nn.Parameter(torch.ones(1, upsample_initial_channel, 1)))
+
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel//(2**(i+1))
+            self.alphas.append(nn.Parameter(torch.ones(1, ch, 1)))
+
+            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+                self.resblocks.append(resblock(ch, k, d, style_dim))
+
+        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+
+    def forward(self, x, s, f0):
+
+        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
+
+        har_source, noi_source, uv = self.m_source(f0)
+        har_source = har_source.transpose(1, 2)
+
+        for i in range(self.num_upsamples):
+            x = x + (1 / self.alphas[i]) * (torch.sin(self.alphas[i] * x) ** 2)
+            x_source = self.noise_convs[i](har_source)
+            x_source = self.noise_res[i](x_source, s)
+
+            x = self.ups[i](x)
+            x = x + x_source
+
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i*self.num_kernels+j](x, s)
+                else:
+                    xs += self.resblocks[i*self.num_kernels+j](x, s)
+            x = xs / self.num_kernels
+        x = x + (1 / self.alphas[i+1]) * (torch.sin(self.alphas[i+1] * x) ** 2)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+
+        return x
+
+    def remove_weight_norm(self):
+        print('Removing weight norm...')
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+        # note: this Generator defines no conv_pre, so only conv_post is un-normed
+        remove_weight_norm(self.conv_post)
+
+
+class AdainResBlk1d(nn.Module):
+    def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
+                 upsample='none', dropout_p=0.0):
+        super().__init__()
+        self.actv = actv
+        self.upsample_type = upsample
+        self.upsample = UpSample1d(upsample)
+        self.learned_sc = dim_in != dim_out
+        self._build_weights(dim_in, dim_out, style_dim)
+        self.dropout = nn.Dropout(dropout_p)
+
+        if upsample == 'none':
+            self.pool = nn.Identity()
+        else:
+            self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
+
+    def _build_weights(self, dim_in, dim_out, style_dim):
+        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
+        self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
+        self.norm1 = AdaIN1d(style_dim, dim_in)
+        self.norm2 = AdaIN1d(style_dim, dim_out)
+        if self.learned_sc:
+            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
+
+    def _shortcut(self, x):
+        x = self.upsample(x)
+        if self.learned_sc:
+            x = self.conv1x1(x)
+        return x
+
+    def _residual(self, x, s):
+        x = self.norm1(x, s)
+        x = self.actv(x)
+        x = self.pool(x)
+        x = self.conv1(self.dropout(x))
+        x = self.norm2(x, s)
+        x = self.actv(x)
+        x = self.conv2(self.dropout(x))
+        return x
+
+    def forward(self, x, s):
+        out = self._residual(x, s)
+        out = (out + self._shortcut(x)) / math.sqrt(2)
+        return out
+
+class UpSample1d(nn.Module):
+    def __init__(self, layer_type):
+        super().__init__()
+        self.layer_type = layer_type
+
+    def forward(self, x):
+        if self.layer_type == 'none':
+            return x
+        else:
+            return F.interpolate(x, scale_factor=2, mode='nearest')
+
+class Decoder(nn.Module):
+    def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80,
+                 resblock_kernel_sizes=[3,7,11],
+                 upsample_rates=[10,5,3,2],
+                 upsample_initial_channel=512,
+                 resblock_dilation_sizes=[[1,3,5], [1,3,5], [1,3,5]],
+                 upsample_kernel_sizes=[20,10,6,4]):
+        super().__init__()
+
+        self.decode = nn.ModuleList()
+
+        self.encode = AdainResBlk1d(dim_in + 2, 1024, style_dim)
+
+        self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
+        self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
+        self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
+        self.decode.append(AdainResBlk1d(1024 + 2 + 64, 512, style_dim, upsample=True))
+
+        self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
+
+        self.N_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
+
+        self.asr_res = nn.Sequential(
+            weight_norm(nn.Conv1d(512, 64, kernel_size=1)),
+        )
+
+        self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes)
+
+    def forward(self, asr, F0_curve, N, s):
+        if self.training:
+            downlist = [0, 3, 7]
+            F0_down = downlist[random.randint(0, 2)]
+            downlist = [0, 3, 7, 15]
+            N_down = downlist[random.randint(0, 3)]
+            if F0_down:
+                F0_curve = nn.functional.conv1d(F0_curve.unsqueeze(1), torch.ones(1, 1, F0_down).to(asr.device), padding=F0_down//2).squeeze(1) / F0_down
+            if N_down:
+                N = nn.functional.conv1d(N.unsqueeze(1), torch.ones(1, 1, N_down).to(asr.device), padding=N_down//2).squeeze(1) / N_down
+
+        F0 = self.F0_conv(F0_curve.unsqueeze(1))
+        N = self.N_conv(N.unsqueeze(1))
+
+        x = torch.cat([asr, F0, N], axis=1)
+        x = self.encode(x, s)
+
+        asr_res = self.asr_res(asr)
+
+        res = True
+        for block in self.decode:
+            if res:
+                x = torch.cat([x, asr_res, F0, N], axis=1)
+            x = block(x, s)
+            if block.upsample_type != "none":
+                res = False
+
+        x = self.generator(x, s, F0_curve)
+        return x
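The `x + (1/a) * sin^2(a * x)` lines marked `# Snake1D` above appear to be the Snake periodic activation (Ziyin et al., 2020), with the per-channel parameter `a` learned (`self.alphas`, `self.alpha1`, `self.alpha2`). A standalone sketch of the function itself (tensor shapes here are illustrative, not from this repo):

```python
import torch

def snake(x, alpha):
    # Snake activation: x + (1/alpha) * sin^2(alpha * x).
    # The periodic term biases the vocoder toward oscillatory waveforms.
    return x + (1.0 / alpha) * torch.sin(alpha * x) ** 2

x = torch.linspace(-3.0, 3.0, 7)
alpha = torch.ones(1)  # hifigan.py learns one alpha per channel
print(snake(x, alpha))
```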
Modules/utils.py
ADDED
@@ -0,0 +1,14 @@
+from torch.nn.utils import weight_norm  # needed by apply_weight_norm below
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+
+
+def apply_weight_norm(m):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        weight_norm(m)
+
+
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size*dilation - dilation)/2)
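get_padding above is the usual "same" padding for a stride-1 dilated convolution, pad = dilation * (kernel_size - 1) / 2; a quick check with the kernel/dilation pairs used in hifigan.py:

```python
from Modules.utils import get_padding

print(get_padding(3, dilation=1))   # 1
print(get_padding(7, dilation=3))   # 9
print(get_padding(11, dilation=5))  # 25
```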
README.md
ADDED
@@ -0,0 +1,88 @@
+---
+license: mit
+language:
+- en
+base_model:
+- yl4579/StyleTTS2-LibriTTS
+pipeline_tag: text-to-speech
+---
+
+# StyleTTS 2 - lite
+
+## Online Demo
+Explore the model on Hugging Face Spaces:
+https://huggingface.co/spaces/dangtr0408/StyleTTS2-lite-space
+
+## Fine-tune
+https://github.com/dangtr0408/StyleTTS2-lite
+
+## Training Details
+
+1. **Base Checkpoint:** Initialized from the official StyleTTS 2 weights pre-trained on LibriTTS.
+2. **Removed Components:** PL-BERT, diffusion, the prosodic encoder, SLM, and spectral normalization.
+3. **Training Data:** LibriTTS corpus.
+4. **Training Schedule:** Trained for 100,000 steps.
+
+## Model Architecture
+
+| Component     | Parameters     |
+| ------------- | -------------- |
+| Decoder       | 54,289,492     |
+| Predictor     | 16,194,612     |
+| Style Encoder | 13,845,440     |
+| Text Encoder  | 5,612,032      |
+| **Total**     | **89,941,576** |
+
+## Prerequisites
+
+- **Python:** Version 3.7 or higher
+- **Git:** To clone the repository
+
+## Installation & Setup
+
+1. Clone the repository:
+
+```bash
+git clone https://huggingface.co/dangtr0408/StyleTTS2-lite
+cd StyleTTS2-lite
+```
+
+2. Install dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+3. On **Linux**, manually install espeak:
+
+```bash
+sudo apt-get install espeak-ng
+```
+
+## Usage Example
+
+See the run.ipynb file.
+
+## Disclaimer
+
+**Before using these pre-trained models, you agree to inform listeners that the speech samples are synthesized, unless you have permission to use the voice you synthesize. That is, before making synthesized voices public, you agree to use only voices whose speakers have granted permission, directly or by license, to have their voice cloned; otherwise, you must publicly announce that these voices are synthesized.**
+
+## References
+
+- [yl4579/StyleTTS2](https://arxiv.org/abs/2306.07691)
+
+- [jik876/hifi-gan](https://github.com/jik876/hifi-gan)
+
+## License
+
+**Code: MIT License**
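A minimal usage sketch to go with the README's pointer to run.ipynb, built from the StyleTTS2 class in inference.py below. The phonemization step via the phonemizer package (backed by espeak-ng) is an assumption here, since run.ipynb itself is not shown in this commit view:

```python
import soundfile as sf
from phonemizer import phonemize  # assumed; the notebook may phonemize differently
from inference import StyleTTS2

model = StyleTTS2(config_path="Models/config.yaml",
                  models_path="Models/inference/model.pth").eval()

# Reference voice and speaking rate
speaker = {"path": "Audio/1_heart.wav", "speed": 1.0}
style = model.get_styles(speaker, denoise=0.3, avg_style=True)

text = "Hello world, this is a quick test."
phonemes = phonemize(text, language="en-us", backend="espeak",
                     preserve_punctuation=True, with_stress=True)

wav = model.generate(phonemes, style, stabilize=True, n_merge=16)
sf.write("output.wav", wav, 24000)  # the model operates at 24 kHz
```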
inference.py
ADDED
@@ -0,0 +1,262 @@
+import re
+import yaml
+from munch import Munch
+import numpy as np
+import librosa
+import noisereduce as nr
+from meldataset import TextCleaner
+import torch
+import torchaudio
+from nltk.tokenize import word_tokenize
+import nltk
+nltk.download('punkt_tab')
+
+from models import ProsodyPredictor, TextEncoder, StyleEncoder
+from Modules.hifigan import Decoder
+
+class Preprocess:
+    def __text_normalize(self, text):
+        punctuation = [",", "、", "،", ";", "(", ".", "。", "…", "!", "–", ":", "?"]
+        map_to = "."
+        punctuation_pattern = re.compile(f"[{''.join(re.escape(p) for p in punctuation)}]")
+        # replace punctuation that acts like a comma or period
+        text = punctuation_pattern.sub(map_to, text)
+        # collapse consecutive whitespace into a single space and strip leading/trailing spaces
+        text = re.sub(r'\s+', ' ', text).strip()
+        return text
+
+    def __merge_fragments(self, texts, n):
+        merged = []
+        i = 0
+        while i < len(texts):
+            fragment = texts[i]
+            j = i + 1
+            while len(fragment.split()) < n and j < len(texts):
+                fragment += ", " + texts[j]
+                j += 1
+            merged.append(fragment)
+            i = j
+        if len(merged[-1].split()) < n and len(merged) > 1:  # handle the last sentence
+            merged[-2] = merged[-2] + ", " + merged[-1]
+            del merged[-1]
+        return merged
+
+    def wave_preprocess(self, wave):
+        to_mel = torchaudio.transforms.MelSpectrogram(n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
+        mean, std = -4, 4
+        wave_tensor = torch.from_numpy(wave).float()
+        mel_tensor = to_mel(wave_tensor)
+        mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
+        return mel_tensor
+
+    def text_preprocess(self, text, n_merge=12):
+        text_norm = self.__text_normalize(text).split(".")  # split into sentences
+        text_norm = [s.strip() for s in text_norm]
+        text_norm = list(filter(lambda x: x != '', text_norm))  # drop empty entries
+        text_norm = self.__merge_fragments(text_norm, n=n_merge)  # merge sentences shorter than n words
+        return text_norm
+
+    def length_to_mask(self, lengths):
+        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+        mask = torch.gt(mask+1, lengths.unsqueeze(1))
+        return mask
+
+# For inference only
+class StyleTTS2(torch.nn.Module):
+    def __init__(self, config_path, models_path):
+        super().__init__()
+        self.register_buffer("get_device", torch.empty(0))
+        self.preprocess = Preprocess()
+
+        config = yaml.safe_load(open(config_path))
+        args = self.__recursive_munch(config['model_params'])
+
+        assert args.decoder.type in ['hifigan'], 'Decoder type unknown'
+
+        self.decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
+                               resblock_kernel_sizes=args.decoder.resblock_kernel_sizes,
+                               upsample_rates=args.decoder.upsample_rates,
+                               upsample_initial_channel=args.decoder.upsample_initial_channel,
+                               resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
+                               upsample_kernel_sizes=args.decoder.upsample_kernel_sizes)
+        self.predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
+        self.text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
+        self.style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)  # acoustic style encoder
+
+        self.__load_models(models_path)
+
+    def __recursive_munch(self, d):
+        if isinstance(d, dict):
+            return Munch((k, self.__recursive_munch(v)) for k, v in d.items())
+        elif isinstance(d, list):
+            return [self.__recursive_munch(v) for v in d]
+        else:
+            return d
+
+    def __replace_outliers_zscore(self, tensor, threshold=3.0, factor=0.95):
+        mean = tensor.mean()
+        std = tensor.std()
+        z = (tensor - mean) / std
+
+        # identify outliers
+        outlier_mask = torch.abs(z) > threshold
+        # compute replacement value, respecting sign
+        sign = torch.sign(tensor - mean)
+        replacement = mean + sign * (threshold * std * factor)
+
+        result = tensor.clone()
+        result[outlier_mask] = replacement[outlier_mask]
+
+        return result
+
+    def __load_models(self, models_path):
+        module_params = []
+        model = {'decoder': self.decoder, 'predictor': self.predictor, 'text_encoder': self.text_encoder, 'style_encoder': self.style_encoder}
+
+        params_whole = torch.load(models_path, map_location='cpu')
+        params = params_whole['net']
+        params = {key: value for key, value in params.items() if key in model.keys()}
+
+        for key in model:
+            try:
+                model[key].load_state_dict(params[key])
+            except Exception:
+                from collections import OrderedDict
+                state_dict = params[key]
+                new_state_dict = OrderedDict()
+                for k, v in state_dict.items():
+                    name = k[7:]  # remove `module.`
+                    new_state_dict[name] = v
+                model[key].load_state_dict(new_state_dict, strict=False)
+
+            total_params = sum(p.numel() for p in model[key].parameters())
+            print(key, ":", total_params)
+            module_params.append(total_params)
+
+        print('\nTotal', ":", sum(module_params))
+
+    def __compute_style(self, path, denoise, split_dur):
+        device = self.get_device.device
+        denoise = min(denoise, 1)
+        if split_dur != 0: split_dur = max(int(split_dur), 1)
+        max_samples = 24000*20  # at most 20 seconds of reference audio
+        print("Computing the style for:", path)
+
+        wave, sr = librosa.load(path, sr=24000)
+        audio, index = librosa.effects.trim(wave, top_db=30)
+        if sr != 24000:
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)
+        if len(audio) > max_samples:
+            audio = audio[:max_samples]
+
+        if denoise > 0.0:
+            audio_denoise = nr.reduce_noise(y=audio, sr=sr, n_fft=2048, win_length=1200, hop_length=300)
+            audio = audio*(1-denoise) + audio_denoise*denoise
+
+        with torch.no_grad():
+            if split_dur > 0 and len(audio)/sr >= 4:  # only effective if audio length is >= 4 s
+                # split the reference audio into multiple parts, compute a style for each, and average them
+                count = 0
+                ref_s = None
+                jump = sr*split_dur
+                total_len = len(audio)
+
+                # need to initialize before the loop
+                mel_tensor = self.preprocess.wave_preprocess(audio[0:jump]).to(device)
+                ref_s = self.style_encoder(mel_tensor.unsqueeze(1))
+                count += 1
+                for i in range(jump, total_len, jump):
+                    if i+jump >= total_len:
+                        left_dur = (total_len-i)/sr
+                        if left_dur >= 1:  # still count the leftover chunk if it is >= 1 s
+                            mel_tensor = self.preprocess.wave_preprocess(audio[i:total_len]).to(device)
+                            ref_s += self.style_encoder(mel_tensor.unsqueeze(1))
+                            count += 1
+                        continue
+                    mel_tensor = self.preprocess.wave_preprocess(audio[i:i+jump]).to(device)
+                    ref_s += self.style_encoder(mel_tensor.unsqueeze(1))
+                    count += 1
+                ref_s /= count
+            else:
+                mel_tensor = self.preprocess.wave_preprocess(audio).to(device)
+                ref_s = self.style_encoder(mel_tensor.unsqueeze(1))
+
+        return ref_s
+
+    def __inference(self, phonem, ref_s, speed=1, prev_d_mean=0, t=0.1):
+        device = self.get_device.device
+        speed = min(max(speed, 0.0001), 2)  # speed range (0, 2]
+
+        phonem = ' '.join(word_tokenize(phonem))
+        tokens = TextCleaner()(phonem)
+        tokens.insert(0, 0)
+        tokens.append(0)
+        tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
+
+        with torch.no_grad():
+            input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
+            text_mask = self.preprocess.length_to_mask(input_lengths).to(device)
+
+            # encode
+            t_en = self.text_encoder(tokens, input_lengths, text_mask)
+            s = ref_s.to(device)
+
+            # compute alignment
+            d = self.predictor.text_encoder(t_en, s, input_lengths, text_mask)
+            x, _ = self.predictor.lstm(d)
+            duration = self.predictor.duration_proj(x)
+            duration = torch.sigmoid(duration).sum(axis=-1)
+
+            if prev_d_mean != 0:  # stabilize speaking speed between splits
+                dur_stats = torch.empty(duration.shape).normal_(mean=prev_d_mean, std=duration.std()).to(device)
+            else:
+                dur_stats = torch.empty(duration.shape).normal_(mean=duration.mean(), std=duration.std()).to(device)
+            duration = duration*(1-t) + dur_stats*t
+            duration[:, 1:-2] = self.__replace_outliers_zscore(duration[:, 1:-2])  # normalize outliers
+
+            duration /= speed
+
+            pred_dur = torch.round(duration.squeeze()).clamp(min=1)
+            pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
+            c_frame = 0
+            for i in range(pred_aln_trg.size(0)):
+                pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
+                c_frame += int(pred_dur[i].data)
+            alignment = pred_aln_trg.unsqueeze(0).to(device)
+
+            # encode prosody
+            en = (d.transpose(-1, -2) @ alignment)
+            F0_pred, N_pred = self.predictor.F0Ntrain(en, s)
+            asr = (t_en @ alignment)
+
+            out = self.decoder(asr, F0_pred, N_pred, s)
+
+        return out.squeeze().cpu().numpy(), duration.mean()
+
+    def get_styles(self, speaker, denoise=0.3, avg_style=True):
+        if avg_style: split_dur = 3
+        else: split_dur = 0
+        ref_s = self.__compute_style(speaker['path'], denoise=denoise, split_dur=split_dur)
+        style = {
+            'style': ref_s,
+            'path': speaker['path'],
+            'speed': speaker['speed'],
+        }
+        return style
+
+    def generate(self, phonem, style, stabilize=True, n_merge=16):
+        if stabilize: smooth_value = 0.2
+        else: smooth_value = 0
+
+        list_wav = []
+        prev_d_mean = 0
+
+        print("Generating Audio...")
+        text_norm = self.preprocess.text_preprocess(phonem, n_merge=n_merge)
+        for sentence in text_norm:
+            wav, prev_d_mean = self.__inference(sentence, style['style'], speed=style['speed'], prev_d_mean=prev_d_mean, t=smooth_value)
+            wav = wav[4000:-4000]  # remove a weird pulse and silent tokens at the boundaries
+            list_wav.append(wav)
+
+        final_wav = np.concatenate(list_wav)
+        final_wav = np.concatenate([np.zeros([4000]), final_wav, np.zeros([4000])], axis=0)  # add padding
+        return final_wav
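To illustrate what Preprocess.text_preprocess above does, a small traced example: punctuation is collapsed to '.', the text is split into sentences, and fragments shorter than n_merge words are merged back with ', ':

```python
from inference import Preprocess

pre = Preprocess()
chunks = pre.text_preprocess(
    "Hi there! This is StyleTTS 2 lite, a smaller model. Enjoy.",
    n_merge=5,
)
print(chunks)
# ['Hi there, This is StyleTTS 2 lite, a smaller model, Enjoy']
```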
meldataset.py
ADDED
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#coding: utf-8
|
2 |
+
import os
|
3 |
+
import os.path as osp
|
4 |
+
import time
|
5 |
+
import random
|
6 |
+
import numpy as np
|
7 |
+
import random
|
8 |
+
import soundfile as sf
|
9 |
+
import librosa
|
10 |
+
|
11 |
+
import torch
|
12 |
+
from torch import nn
|
13 |
+
import torch.nn.functional as F
|
14 |
+
import torchaudio
|
15 |
+
from torch.utils.data import DataLoader
|
16 |
+
|
17 |
+
import logging
|
18 |
+
logger = logging.getLogger(__name__)
|
19 |
+
logger.setLevel(logging.DEBUG)
|
20 |
+
|
21 |
+
import pandas as pd
|
22 |
+
|
23 |
+
##########################################################
|
24 |
+
_pad = "$"
|
25 |
+
_punctuation = ';:,.!?¡¿—…"«»“” '
|
26 |
+
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
|
27 |
+
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
|
28 |
+
_extend = "" #ADD MORE SYMBOLS HERE
|
29 |
+
|
30 |
+
# Export all symbols:
|
31 |
+
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) + list(_extend)
|
32 |
+
|
33 |
+
dicts = {}
|
34 |
+
for i in range(len((symbols))):
|
35 |
+
dicts[symbols[i]] = i
|
36 |
+
|
37 |
+
# Copy this code somewhere else then run with print(len(dicts) + 1) to check total symbols
|
38 |
+
##########################################################
|
39 |
+
|
40 |
+
class TextCleaner:
|
41 |
+
def __init__(self, dummy=None):
|
42 |
+
self.word_index_dictionary = dicts
|
43 |
+
def __call__(self, text):
|
44 |
+
indexes = []
|
45 |
+
for char in text:
|
46 |
+
try:
|
47 |
+
indexes.append(self.word_index_dictionary[char])
|
48 |
+
except KeyError as e:
|
49 |
+
#print(char)
|
50 |
+
continue
|
51 |
+
return indexes
|
52 |
+
|
53 |
+
np.random.seed(1)
|
54 |
+
random.seed(1)
|
55 |
+
SPECT_PARAMS = {
|
56 |
+
"n_fft": 2048,
|
57 |
+
"win_length": 1200,
|
58 |
+
"hop_length": 300
|
59 |
+
}
|
60 |
+
MEL_PARAMS = {
|
61 |
+
"n_mels": 80,
|
62 |
+
}
|
63 |
+
|
64 |
+
to_mel = torchaudio.transforms.MelSpectrogram(
|
65 |
+
n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
|
66 |
+
mean, std = -4, 4
|
67 |
+
|
68 |
+
def preprocess(wave):
|
69 |
+
wave_tensor = torch.from_numpy(wave).float()
|
70 |
+
mel_tensor = to_mel(wave_tensor)
|
71 |
+
mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
|
72 |
+
return mel_tensor
|
73 |
+
|
74 |
+
class FilePathDataset(torch.utils.data.Dataset):
|
75 |
+
def __init__(self,
|
76 |
+
data_list,
|
77 |
+
root_path,
|
78 |
+
sr=24000,
|
79 |
+
data_augmentation=False,
|
80 |
+
validation=False
|
81 |
+
):
|
82 |
+
|
83 |
+
        spect_params = SPECT_PARAMS
        mel_params = MEL_PARAMS

        _data_list = [l.strip().split('|') for l in data_list]
        self.data_list = _data_list  # [data if len(data) == 3 else (*data, 0) for data in _data_list]  # append speaker_id=0 for all
        self.text_cleaner = TextCleaner()
        self.sr = sr

        self.df = pd.DataFrame(self.data_list)

        self.to_melspec = torchaudio.transforms.MelSpectrogram(**MEL_PARAMS)

        self.mean, self.std = -4, 4
        self.data_augmentation = data_augmentation and (not validation)
        self.max_mel_length = 192

        self.root_path = root_path

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        data = self.data_list[idx]
        path = data[0]

        wave, text_tensor = self._load_tensor(data)

        mel_tensor = preprocess(wave).squeeze()

        acoustic_feature = mel_tensor.squeeze()
        length_feature = acoustic_feature.size(1)
        # trim to an even number of frames
        acoustic_feature = acoustic_feature[:, :(length_feature - length_feature % 2)]

        return acoustic_feature, text_tensor, path, wave

    def _load_tensor(self, data):
        wave_path, text = data
        wave, sr = sf.read(osp.join(self.root_path, wave_path))
        if wave.shape[-1] == 2:  # stereo: keep the first channel
            wave = wave[:, 0].squeeze()
        if sr != 24000:
            wave = librosa.resample(wave, orig_sr=sr, target_sr=24000)
            print(wave_path, sr)

        # Add half a second of silence (12000 samples at 24 kHz) on each side.
        wave = np.concatenate([np.zeros([12000]), wave, np.zeros([12000])], axis=0)

        text = self.text_cleaner(text)

        text.insert(0, 0)
        text.append(0)

        text = torch.LongTensor(text)

        return wave, text

    def _load_data(self, data):
        wave, text_tensor = self._load_tensor(data)
        mel_tensor = preprocess(wave).squeeze()

        mel_length = mel_tensor.size(1)
        if mel_length > self.max_mel_length:
            random_start = np.random.randint(0, mel_length - self.max_mel_length)
            mel_tensor = mel_tensor[:, random_start:random_start + self.max_mel_length]

        return mel_tensor


class Collater(object):
    """
    Collates variable-length (mel, text, path, wave) samples into padded batches.

    Args:
        return_wave (bool): stored for compatibility; this collator always
            returns the raw waveforms alongside the padded tensors.
    """

    def __init__(self, return_wave=False):
        self.text_pad_index = 0
        self.min_mel_length = 192
        self.max_mel_length = 192
        self.return_wave = return_wave

    def __call__(self, batch):
        batch_size = len(batch)

        # sort by mel length (longest first)
        lengths = [b[0].shape[1] for b in batch]
        batch_indexes = np.argsort(lengths)[::-1]
        batch = [batch[bid] for bid in batch_indexes]

        nmels = batch[0][0].size(0)
        max_mel_length = max([b[0].shape[1] for b in batch])
        max_text_length = max([b[1].shape[0] for b in batch])

        mels = torch.zeros((batch_size, nmels, max_mel_length)).float()
        texts = torch.zeros((batch_size, max_text_length)).long()

        input_lengths = torch.zeros(batch_size).long()
        output_lengths = torch.zeros(batch_size).long()
        paths = ['' for _ in range(batch_size)]
        waves = [None for _ in range(batch_size)]

        for bid, (mel, text, path, wave) in enumerate(batch):
            mel_size = mel.size(1)
            text_size = text.size(0)
            mels[bid, :, :mel_size] = mel
            texts[bid, :text_size] = text
            input_lengths[bid] = text_size
            output_lengths[bid] = mel_size
            paths[bid] = path
            waves[bid] = wave

        return waves, texts, input_lengths, mels, output_lengths


def build_dataloader(path_list,
                     root_path,
                     validation=False,
                     batch_size=4,
                     num_workers=1,
                     device='cpu',
                     collate_config={},
                     dataset_config={}):

    dataset = FilePathDataset(path_list, root_path, validation=validation, **dataset_config)
    collate_fn = Collater(**collate_config)
    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=(not validation),
                             num_workers=num_workers,
                             drop_last=(not validation),
                             collate_fn=collate_fn,
                             pin_memory=(device != 'cpu'))

    return data_loader
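
# Minimal usage sketch (illustrative; assumes a pipe-delimited "path|text"
# training list and 24 kHz wavs under root_path, which are this dataset's
# conventions -- the list path below is hypothetical):
#
#   with open('Data/train_list.txt') as f:
#       train_list = f.readlines()
#   loader = build_dataloader(train_list, root_path='Data', batch_size=4,
#                             num_workers=2, device='cuda')
#   waves, texts, input_lengths, mels, output_lengths = next(iter(loader))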
models.py
ADDED
@@ -0,0 +1,532 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import weight_norm

from munch import Munch

class LearnedDownSample(nn.Module):
    def __init__(self, layer_type, dim_in):
        super().__init__()
        self.layer_type = layer_type

        if self.layer_type == 'none':
            self.conv = nn.Identity()
        elif self.layer_type == 'timepreserve':
            self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0))
        elif self.layer_type == 'half':
            self.conv = nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1)
        else:
            raise RuntimeError('Got unexpected downsample type %s; expected one of [none, timepreserve, half]' % self.layer_type)

    def forward(self, x):
        return self.conv(x)

class LearnedUpSample(nn.Module):
    def __init__(self, layer_type, dim_in):
        super().__init__()
        self.layer_type = layer_type

        if self.layer_type == 'none':
            self.conv = nn.Identity()
        elif self.layer_type == 'timepreserve':
            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
        elif self.layer_type == 'half':
            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
        else:
            raise RuntimeError('Got unexpected upsample type %s; expected one of [none, timepreserve, half]' % self.layer_type)

    def forward(self, x):
        return self.conv(x)

class DownSample(nn.Module):
    def __init__(self, layer_type):
        super().__init__()
        self.layer_type = layer_type

    def forward(self, x):
        if self.layer_type == 'none':
            return x
        elif self.layer_type == 'timepreserve':
            return F.avg_pool2d(x, (2, 1))
        elif self.layer_type == 'half':
            if x.shape[-1] % 2 != 0:
                x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
            return F.avg_pool2d(x, 2)
        else:
            raise RuntimeError('Got unexpected downsample type %s; expected one of [none, timepreserve, half]' % self.layer_type)


class UpSample(nn.Module):
    def __init__(self, layer_type):
        super().__init__()
        self.layer_type = layer_type

    def forward(self, x):
        if self.layer_type == 'none':
            return x
        elif self.layer_type == 'timepreserve':
            return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
        elif self.layer_type == 'half':
            return F.interpolate(x, scale_factor=2, mode='nearest')
        else:
            raise RuntimeError('Got unexpected upsample type %s; expected one of [none, timepreserve, half]' % self.layer_type)


class ResBlk(nn.Module):
    def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
                 normalize=False, downsample='none'):
        super().__init__()
        self.actv = actv
        self.normalize = normalize
        self.downsample = DownSample(downsample)
        self.downsample_res = LearnedDownSample(downsample, dim_in)
        self.learned_sc = dim_in != dim_out
        self._build_weights(dim_in, dim_out)

    def _build_weights(self, dim_in, dim_out):
        self.conv1 = nn.Conv2d(dim_in, dim_in, 3, 1, 1)
        self.conv2 = nn.Conv2d(dim_in, dim_out, 3, 1, 1)
        if self.normalize:
            self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
            self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
        if self.learned_sc:
            self.conv1x1 = nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False)

    def _shortcut(self, x):
        if self.learned_sc:
            x = self.conv1x1(x)
        if self.downsample:
            x = self.downsample(x)
        return x

    def _residual(self, x):
        if self.normalize:
            x = self.norm1(x)
        x = self.actv(x)
        x = self.conv1(x)
        x = self.downsample_res(x)
        if self.normalize:
            x = self.norm2(x)
        x = self.actv(x)
        x = self.conv2(x)
        return x

    def forward(self, x):
        x = self._shortcut(x) + self._residual(x)
        return x / math.sqrt(2)  # unit variance

class StyleEncoder(nn.Module):
    def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
        super().__init__()
        blocks = []
        blocks += [nn.Conv2d(1, dim_in, 3, 1, 1)]

        repeat_num = 4
        for _ in range(repeat_num):
            dim_out = min(dim_in*2, max_conv_dim)
            blocks += [ResBlk(dim_in, dim_out, downsample='half')]
            dim_in = dim_out

        blocks += [nn.LeakyReLU(0.2)]
        blocks += [nn.Conv2d(dim_out, dim_out, 5, 1, 0)]
        blocks += [nn.AdaptiveAvgPool2d(1)]
        blocks += [nn.LeakyReLU(0.2)]
        self.shared = nn.Sequential(*blocks)

        self.unshared = nn.Linear(dim_out, style_dim)

    def forward(self, x):
        h = self.shared(x)
        h = h.view(h.size(0), -1)
        s = self.unshared(h)

        return s

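# Shape sketch (illustrative; the dimensions below are assumptions, not values
# from the released config): StyleEncoder consumes a mel spectrogram as a 4-D
# tensor and returns one style vector per utterance.
#
#   enc = StyleEncoder(dim_in=48, style_dim=128, max_conv_dim=384)
#   mel = torch.randn(2, 1, 80, 192)   # [B, 1, n_mels, frames]
#   s = enc(mel)                       # -> [2, 128]
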
class LinearNorm(torch.nn.Module):
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(LinearNorm, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        torch.nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)

class ResBlk1d(nn.Module):
    def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
                 normalize=False, downsample='none', dropout_p=0.2):
        super().__init__()
        self.actv = actv
        self.normalize = normalize
        self.downsample_type = downsample
        self.learned_sc = dim_in != dim_out
        self._build_weights(dim_in, dim_out)
        self.dropout_p = dropout_p

        if self.downsample_type == 'none':
            self.pool = nn.Identity()
        else:
            self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1))

    def _build_weights(self, dim_in, dim_out):
        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1))
        self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
        if self.normalize:
            self.norm1 = nn.InstanceNorm1d(dim_in, affine=True)
            self.norm2 = nn.InstanceNorm1d(dim_in, affine=True)
        if self.learned_sc:
            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))

    def downsample(self, x):
        if self.downsample_type == 'none':
            return x
        else:
            if x.shape[-1] % 2 != 0:
                x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
            return F.avg_pool1d(x, 2)

    def _shortcut(self, x):
        if self.learned_sc:
            x = self.conv1x1(x)
        x = self.downsample(x)
        return x

    def _residual(self, x):
        if self.normalize:
            x = self.norm1(x)
        x = self.actv(x)
        x = F.dropout(x, p=self.dropout_p, training=self.training)

        x = self.conv1(x)
        x = self.pool(x)
        if self.normalize:
            x = self.norm2(x)

        x = self.actv(x)
        x = F.dropout(x, p=self.dropout_p, training=self.training)

        x = self.conv2(x)
        return x

    def forward(self, x):
        x = self._shortcut(x) + self._residual(x)
        return x / math.sqrt(2)  # unit variance

class LayerNorm(nn.Module):
    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        x = x.transpose(1, -1)
        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
        return x.transpose(1, -1)

class TextEncoder(nn.Module):
    def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
        super().__init__()
        self.embedding = nn.Embedding(n_symbols, channels)

        padding = (kernel_size - 1) // 2
        self.cnn = nn.ModuleList()
        for _ in range(depth):
            self.cnn.append(nn.Sequential(
                weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
                LayerNorm(channels),
                actv,
                nn.Dropout(0.2),
            ))
        # self.cnn = nn.Sequential(*self.cnn)

        self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True)

    def forward(self, x, input_lengths, m):
        x = self.embedding(x)  # [B, T, emb]
        x = x.transpose(1, 2)  # [B, emb, T]
        m = m.to(input_lengths.device).unsqueeze(1)
        x.masked_fill_(m, 0.0)

        for c in self.cnn:
            x = c(x)
            x.masked_fill_(m, 0.0)

        x = x.transpose(1, 2)  # [B, T, chn]

        input_lengths = input_lengths.cpu().numpy()
        x = nn.utils.rnn.pack_padded_sequence(
            x, input_lengths, batch_first=True, enforce_sorted=False)

        self.lstm.flatten_parameters()
        x, _ = self.lstm(x)
        x, _ = nn.utils.rnn.pad_packed_sequence(
            x, batch_first=True)

        x = x.transpose(-1, -2)
        x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])

        x_pad[:, :, :x.shape[-1]] = x
        x = x_pad.to(x.device)

        x.masked_fill_(m, 0.0)

        return x

    def inference(self, x):
        x = self.embedding(x)
        x = x.transpose(1, 2)
        # self.cnn is an nn.ModuleList, so apply each block in turn
        # (calling it directly would raise a TypeError).
        for c in self.cnn:
            x = c(x)
        x = x.transpose(1, 2)
        self.lstm.flatten_parameters()
        x, _ = self.lstm(x)
        return x

    def length_to_mask(self, lengths):
        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
        mask = torch.gt(mask+1, lengths.unsqueeze(1))
        return mask
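    # Illustrative: length_to_mask marks padded positions with True, e.g.
    #   length_to_mask(torch.tensor([3, 5]))
    #   -> tensor([[False, False, False,  True,  True],
    #              [False, False, False, False, False]])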
class AdaIN1d(nn.Module):
    def __init__(self, style_dim, num_features):
        super().__init__()
        self.norm = nn.InstanceNorm1d(num_features, affine=False)
        self.fc = nn.Linear(style_dim, num_features*2)

    def forward(self, x, s):
        h = self.fc(s)
        h = h.view(h.size(0), h.size(1), 1)
        gamma, beta = torch.chunk(h, chunks=2, dim=1)
        return (1 + gamma) * self.norm(x) + beta
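# Illustrative shape walk-through (dimensions are assumptions): with
# style_dim=128 and num_features=512, s: [B, 128] -> fc -> [B, 1024] ->
# view -> [B, 1024, 1] -> chunk -> gamma, beta: [B, 512, 1], which scale
# and shift the instance-normalized x: [B, 512, T] per channel.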
class UpSample1d(nn.Module):
    def __init__(self, layer_type):
        super().__init__()
        self.layer_type = layer_type

    def forward(self, x):
        if self.layer_type == 'none':
            return x
        else:
            return F.interpolate(x, scale_factor=2, mode='nearest')

class AdainResBlk1d(nn.Module):
    def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
                 upsample='none', dropout_p=0.0):
        super().__init__()
        self.actv = actv
        self.upsample_type = upsample
        self.upsample = UpSample1d(upsample)
        self.learned_sc = dim_in != dim_out
        self._build_weights(dim_in, dim_out, style_dim)
        self.dropout = nn.Dropout(dropout_p)

        if upsample == 'none':
            self.pool = nn.Identity()
        else:
            self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))

    def _build_weights(self, dim_in, dim_out, style_dim):
        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
        self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
        self.norm1 = AdaIN1d(style_dim, dim_in)
        self.norm2 = AdaIN1d(style_dim, dim_out)
        if self.learned_sc:
            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))

    def _shortcut(self, x):
        x = self.upsample(x)
        if self.learned_sc:
            x = self.conv1x1(x)
        return x

    def _residual(self, x, s):
        x = self.norm1(x, s)
        x = self.actv(x)
        x = self.pool(x)
        x = self.conv1(self.dropout(x))
        x = self.norm2(x, s)
        x = self.actv(x)
        x = self.conv2(self.dropout(x))
        return x

    def forward(self, x, s):
        out = self._residual(x, s)
        out = (out + self._shortcut(x)) / math.sqrt(2)
        return out

class AdaLayerNorm(nn.Module):
    def __init__(self, style_dim, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.fc = nn.Linear(style_dim, channels*2)

    def forward(self, x, s):
        x = x.transpose(-1, -2)
        x = x.transpose(1, -1)

        h = self.fc(s)
        h = h.view(h.size(0), h.size(1), 1)
        gamma, beta = torch.chunk(h, chunks=2, dim=1)
        gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1)

        x = F.layer_norm(x, (self.channels,), eps=self.eps)
        x = (1 + gamma) * x + beta
        return x.transpose(1, -1).transpose(-1, -2)

class ProsodyPredictor(nn.Module):

    def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
        super().__init__()

        self.text_encoder = DurationEncoder(sty_dim=style_dim,
                                            d_model=d_hid,
                                            nlayers=nlayers,
                                            dropout=dropout)

        self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
        self.duration_proj = LinearNorm(d_hid, max_dur)

        self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
        self.F0 = nn.ModuleList()
        self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
        self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
        self.F0.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))

        self.N = nn.ModuleList()
        self.N.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
        self.N.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
        self.N.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))

        self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
        self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)

    def forward(self, texts, style, text_lengths, alignment, m):
        d = self.text_encoder(texts, style, text_lengths, m)

        batch_size = d.shape[0]
        text_size = d.shape[1]

        # predict duration
        input_lengths = text_lengths.cpu().numpy()
        x = nn.utils.rnn.pack_padded_sequence(
            d, input_lengths, batch_first=True, enforce_sorted=False)

        m = m.to(text_lengths.device).unsqueeze(1)

        self.lstm.flatten_parameters()
        x, _ = self.lstm(x)
        x, _ = nn.utils.rnn.pad_packed_sequence(
            x, batch_first=True)

        x_pad = torch.zeros([x.shape[0], m.shape[-1], x.shape[-1]])

        x_pad[:, :x.shape[1], :] = x
        x = x_pad.to(x.device)

        duration = self.duration_proj(nn.functional.dropout(x, 0.5, training=self.training))

        en = (d.transpose(-1, -2) @ alignment)

        return duration.squeeze(-1), en
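    # Illustrative shapes for forward (assuming d_hid=512, style_dim=128):
    #   d: [B, T_text, d_hid + style_dim]; alignment: [B, T_text, T_mel].
    #   duration holds [B, T_text, max_dur] logits before the final squeeze;
    #   en = d^T @ alignment: [B, d_hid + style_dim, T_mel], i.e. the text
    #   features expanded to frame rate for F0Ntrain below.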
    def F0Ntrain(self, x, s):
        x, _ = self.shared(x.transpose(-1, -2))

        F0 = x.transpose(-1, -2)
        for block in self.F0:
            F0 = block(F0, s)
        F0 = self.F0_proj(F0)

        N = x.transpose(-1, -2)
        for block in self.N:
            N = block(N, s)
        N = self.N_proj(N)

        return F0.squeeze(1), N.squeeze(1)

    def length_to_mask(self, lengths):
        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
        mask = torch.gt(mask+1, lengths.unsqueeze(1))
        return mask

class DurationEncoder(nn.Module):

    def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
        super().__init__()
        self.lstms = nn.ModuleList()
        for _ in range(nlayers):
            self.lstms.append(nn.LSTM(d_model + sty_dim,
                                      d_model // 2,
                                      num_layers=1,
                                      batch_first=True,
                                      bidirectional=True,
                                      dropout=dropout))
            self.lstms.append(AdaLayerNorm(sty_dim, d_model))

        self.dropout = dropout
        self.d_model = d_model
        self.sty_dim = sty_dim

    def forward(self, x, style, text_lengths, m):
        masks = m.to(text_lengths.device)

        x = x.permute(2, 0, 1)
        s = style.expand(x.shape[0], x.shape[1], -1)
        x = torch.cat([x, s], axis=-1)
        x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)

        x = x.transpose(0, 1)
        input_lengths = text_lengths.cpu().numpy()
        x = x.transpose(-1, -2)

        for block in self.lstms:
            if isinstance(block, AdaLayerNorm):
                x = block(x.transpose(-1, -2), style).transpose(-1, -2)
                x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
                x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
            else:
                x = x.transpose(-1, -2)
                x = nn.utils.rnn.pack_padded_sequence(
                    x, input_lengths, batch_first=True, enforce_sorted=False)
                block.flatten_parameters()
                x, _ = block(x)
                x, _ = nn.utils.rnn.pad_packed_sequence(
                    x, batch_first=True)
                x = F.dropout(x, p=self.dropout, training=self.training)
                x = x.transpose(-1, -2)

                x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])

                x_pad[:, :, :x.shape[-1]] = x
                x = x_pad.to(x.device)

        return x.transpose(-1, -2)

    def inference(self, x, style):
        # NOTE: retained from an earlier revision; it references self.embedding,
        # self.pos_encoder and self.transformer_encoder, none of which are
        # defined in this class, so it is not callable as written.
        x = self.embedding(x.transpose(-1, -2)) * math.sqrt(self.d_model)
        style = style.expand(x.shape[0], x.shape[1], -1)
        x = torch.cat([x, style], axis=-1)
        src = self.pos_encoder(x)
        output = self.transformer_encoder(src).transpose(0, 1)
        return output

    def length_to_mask(self, lengths):
        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
        mask = torch.gt(mask+1, lengths.unsqueeze(1))
        return mask
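# Minimal smoke-test sketch (illustrative; the dimensions here are
# assumptions, not values taken from Models/config.yaml):
#
#   enc = TextEncoder(channels=512, kernel_size=5, depth=3, n_symbols=178)
#   tokens = torch.randint(1, 178, (2, 50))    # [B, T_text] phoneme ids
#   lengths = torch.tensor([50, 40])
#   mask = enc.length_to_mask(lengths)         # True at padded positions
#   feats = enc(tokens, lengths, mask)         # -> [2, 512, 50]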
requirements.txt
ADDED
@@ -0,0 +1,10 @@
torch
torchaudio
numpy
PyYAML
munch
nltk
librosa
noisereduce
phonemizer
espeakng-loader
run.ipynb
ADDED
@@ -0,0 +1,176 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a3ddcc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from inference import StyleTTS2\n",
    "\n",
    "import librosa\n",
    "import IPython.display as ipd\n",
    "import torch.cuda\n",
    "\n",
    "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "092cfb69",
   "metadata": {},
   "source": [
    "### Load G2P"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a152ec13",
   "metadata": {},
   "source": [
    "If you did not use eSpeak for your language, please add your own G2P."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca224f37",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import phonemizer\n",
    "if sys.platform.startswith(\"win\"):\n",
    "    try:\n",
    "        from phonemizer.backend.espeak.wrapper import EspeakWrapper\n",
    "        import espeakng_loader\n",
    "        EspeakWrapper.set_library(espeakng_loader.get_library_path())\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "\n",
    "def get_phoneme(text, lang):\n",
    "    try:\n",
    "        my_phonemizer = phonemizer.backend.EspeakBackend(language=lang, preserve_punctuation=True, with_stress=True, language_switch='remove-flags')\n",
    "        return my_phonemizer.phonemize([text])[0]\n",
    "    except Exception as e:\n",
    "        print(e)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b9cecbe",
   "metadata": {},
   "source": [
    "### Load models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7b9c01d",
   "metadata": {},
   "outputs": [],
   "source": [
    "config_path = \"Models/config.yaml\"\n",
    "models_path = \"Models/inference/model.pth\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b803110e",
   "metadata": {},
   "source": [
    "### Synthesize speech\n",
    "\n",
    "Note: the reference audio has a huge impact on the result. It is best to select a clip around 10 s long that is consistent in both tone and speed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78396f70",
   "metadata": {},
   "outputs": [],
   "source": [
    "speaker = {\n",
    "    \"path\": \"./Audio/1_heart.wav\",  # reference audio path\n",
    "    \"speed\": 1.0,  # speaking speed\n",
    "}\n",
    "\n",
    "max_samples = 24000 * 20  # max 20 seconds of reference audio\n",
    "print(speaker['path'])\n",
    "wave, sr = librosa.load(speaker['path'], sr=24000)\n",
    "audio, index = librosa.effects.trim(wave, top_db=30)\n",
    "if sr != 24000: audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)\n",
    "if len(audio) > max_samples: audio = audio[:max_samples]\n",
    "display(ipd.Audio(audio, rate=24000, normalize=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "395959f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "text = '''\n",
    "Nearly 300 scholars currently working in the United States have applied for positions at Aix-Marseille University in France, which has announced a program to provide a haven for academics affected by the Trump administration's policies.\n",
    "Aix-Marseille launched the \"Safe Place for Science\" initiative earlier this year, offering three-year funded placements for approximately 20 researchers. The program aims to support scholars facing budget cuts and policy changes that have disrupted U.S. academic institutions.\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16194211",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = StyleTTS2(config_path, models_path).eval().to(device)\n",
    "avg_style = True   # BOOL  Split the reference audio and average the styles.\n",
    "stabilize = False  # BOOL  Stabilize the speaking speed.\n",
    "denoise = 0.3      # FLOAT Strength of the denoiser; value range is [0, 1].\n",
    "n_merge = 16       # INT   Avoid short inputs by merging sentences with fewer than n words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "980c6fbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "with torch.no_grad():\n",
    "    phonemes = get_phoneme(text=text, lang=\"en-us\")\n",
    "\n",
    "    styles = model.get_styles(speaker, denoise, avg_style)\n",
    "    r = model.generate(phonemes, styles, stabilize, n_merge)\n",
    "\n",
    "print('Synthesized:')\n",
    "display(ipd.Audio(r, rate=24000, normalize=True))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}