Commit
·
1f001bb
0
Parent(s):
First model version
Browse files- .gitattributes +33 -0
- .gitignore +151 -0
- README.md +51 -0
- checkpoints/FastDiff/config.yaml +149 -0
- checkpoints/FastDiff/model_ckpt_steps_500000.ckpt +3 -0
- checkpoints/ProDiff/config.yaml +205 -0
- checkpoints/ProDiff/model_ckpt_steps_200000.ckpt +3 -0
- checkpoints/ProDiff_Teacher/config.yaml +205 -0
- checkpoints/ProDiff_Teacher/model_ckpt_steps_188000.ckpt +3 -0
- data/binary/LJSpeech/phone_set.json +1 -0
- data/binary/LJSpeech/spk_map.json +1 -0
- data/binary/LJSpeech/train_f0s_mean_std.npy +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### Project ignore
|
| 2 |
+
|
| 3 |
+
/ParallelWaveGAN
|
| 4 |
+
/wavegan_pretrained*
|
| 5 |
+
/pretrained_models
|
| 6 |
+
rsync
|
| 7 |
+
.idea
|
| 8 |
+
.DS_Store
|
| 9 |
+
bak
|
| 10 |
+
tmp
|
| 11 |
+
*.tar.gz
|
| 12 |
+
# mfa and kaldi
|
| 13 |
+
kaldi_align/exp
|
| 14 |
+
mfa
|
| 15 |
+
montreal-forced-aligner
|
| 16 |
+
mos
|
| 17 |
+
nbs
|
| 18 |
+
/configs_usr/*
|
| 19 |
+
!/configs_usr/.gitkeep
|
| 20 |
+
/fast_transformers
|
| 21 |
+
/rnnoise
|
| 22 |
+
/usr/*
|
| 23 |
+
!/usr/.gitkeep
|
| 24 |
+
|
| 25 |
+
# Created by .ignore support plugin (hsz.mobi)
|
| 26 |
+
### Python template
|
| 27 |
+
# Byte-compiled / optimized / DLL files
|
| 28 |
+
__pycache__/
|
| 29 |
+
*.py[cod]
|
| 30 |
+
*$py.class
|
| 31 |
+
|
| 32 |
+
# C extensions
|
| 33 |
+
*.so
|
| 34 |
+
|
| 35 |
+
# Distribution / packaging
|
| 36 |
+
.Python
|
| 37 |
+
build/
|
| 38 |
+
develop-eggs/
|
| 39 |
+
dist/
|
| 40 |
+
downloads/
|
| 41 |
+
eggs/
|
| 42 |
+
.eggs/
|
| 43 |
+
lib/
|
| 44 |
+
lib64/
|
| 45 |
+
parts/
|
| 46 |
+
sdist/
|
| 47 |
+
var/
|
| 48 |
+
wheels/
|
| 49 |
+
pip-wheel-metadata/
|
| 50 |
+
share/python-wheels/
|
| 51 |
+
*.egg-info/
|
| 52 |
+
.installed.cfg
|
| 53 |
+
*.egg
|
| 54 |
+
MANIFEST
|
| 55 |
+
|
| 56 |
+
# PyInstaller
|
| 57 |
+
# Usually these files are written by a python script from a template
|
| 58 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 59 |
+
*.manifest
|
| 60 |
+
*.spec
|
| 61 |
+
|
| 62 |
+
# Installer logs
|
| 63 |
+
pip-log.txt
|
| 64 |
+
pip-delete-this-directory.txt
|
| 65 |
+
|
| 66 |
+
# Unit test / coverage reports
|
| 67 |
+
htmlcov/
|
| 68 |
+
.tox/
|
| 69 |
+
.nox/
|
| 70 |
+
.coverage
|
| 71 |
+
.coverage.*
|
| 72 |
+
.cache
|
| 73 |
+
nosetests.xml
|
| 74 |
+
coverage.xml
|
| 75 |
+
*.cover
|
| 76 |
+
.hypothesis/
|
| 77 |
+
.pytest_cache/
|
| 78 |
+
|
| 79 |
+
# Translations
|
| 80 |
+
*.mo
|
| 81 |
+
*.pot
|
| 82 |
+
|
| 83 |
+
# Django stuff:
|
| 84 |
+
*.log
|
| 85 |
+
local_settings.py
|
| 86 |
+
db.sqlite3
|
| 87 |
+
db.sqlite3-journal
|
| 88 |
+
|
| 89 |
+
# Flask stuff:
|
| 90 |
+
instance/
|
| 91 |
+
.webassets-cache
|
| 92 |
+
|
| 93 |
+
# Scrapy stuff:
|
| 94 |
+
.scrapy
|
| 95 |
+
|
| 96 |
+
# Sphinx documentation
|
| 97 |
+
docs/_build/
|
| 98 |
+
|
| 99 |
+
# PyBuilder
|
| 100 |
+
target/
|
| 101 |
+
|
| 102 |
+
# Jupyter Notebook
|
| 103 |
+
.ipynb_checkpoints
|
| 104 |
+
|
| 105 |
+
# IPython
|
| 106 |
+
profile_default/
|
| 107 |
+
ipython_config.py
|
| 108 |
+
|
| 109 |
+
# pyenv
|
| 110 |
+
.python-version
|
| 111 |
+
|
| 112 |
+
# pipenv
|
| 113 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 114 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 115 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 116 |
+
# install all needed dependencies.
|
| 117 |
+
#Pipfile.lock
|
| 118 |
+
|
| 119 |
+
# celery beat schedule file
|
| 120 |
+
celerybeat-schedule
|
| 121 |
+
|
| 122 |
+
# SageMath parsed files
|
| 123 |
+
*.sage.py
|
| 124 |
+
|
| 125 |
+
# Environments
|
| 126 |
+
.env
|
| 127 |
+
.venv
|
| 128 |
+
env/
|
| 129 |
+
venv/
|
| 130 |
+
ENV/
|
| 131 |
+
env.bak/
|
| 132 |
+
venv.bak/
|
| 133 |
+
|
| 134 |
+
# Spyder project settings
|
| 135 |
+
.spyderproject
|
| 136 |
+
.spyproject
|
| 137 |
+
|
| 138 |
+
# Rope project settings
|
| 139 |
+
.ropeproject
|
| 140 |
+
|
| 141 |
+
# mkdocs documentation
|
| 142 |
+
/site
|
| 143 |
+
|
| 144 |
+
# mypy
|
| 145 |
+
.mypy_cache/
|
| 146 |
+
.dmypy.json
|
| 147 |
+
dmypy.json
|
| 148 |
+
|
| 149 |
+
# Pyre type checker
|
| 150 |
+
.pyre/
|
| 151 |
+
# NOTE: stray `git clean -n` output ("Would remove datasets/remi/test/"); not an ignore pattern
|
README.md
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: other
|
| 3 |
+
tags:
|
| 4 |
+
- text-to-speech
|
| 5 |
+
- neural-vocoder
|
| 6 |
+
inference: false
|
| 7 |
+
extra_gated_prompt: |-
|
| 8 |
+
One more step before getting this model.
|
| 9 |
+
This model is open access and available to all, with a license further specifying rights and usage.
|
| 10 |
+
|
| 11 |
+
Any organization or individual is prohibited from using any technology mentioned in this paper to generate someone's speech without his/her consent, including but not limited to government leaders, political figures, and celebrities. If you do not comply with this item, you could be in violation of copyright laws.
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
By clicking on "Access repository" below, you accept that your *contact information* (email address and username) can be shared with the model authors as well.
|
| 15 |
+
|
| 16 |
+
extra_gated_fields:
|
| 17 |
+
I have read the License and agree with its terms: checkbox
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
# ProDiff and FastDiff Model Card
|
| 21 |
+
|
| 22 |
+
## Key Features
|
| 23 |
+
- **Extremely fast** diffusion text-to-speech synthesis pipeline for potential **industrial deployment**.
|
| 24 |
+
- **Tutorial and code base** for speech diffusion models.
|
| 25 |
+
- More **supported diffusion mechanisms** (e.g., guided diffusion) will be available.
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
## Model Details
|
| 29 |
+
- **Developed by:** Rongjie Huang et al. (see the full author lists under "Cite as" below)
|
| 30 |
+
- **Model type:** Diffusion-based text-to-speech generation model
|
| 31 |
+
- **Language(s):** English
|
| 32 |
+
- **License:**
|
| 33 |
+
- **Model Description:** A conditional diffusion probabilistic model capable of generating high fidelity speech efficiently.
|
| 34 |
+
- **Resources for more information:** [FastDiff GitHub Repository](https://github.com/Rongjiehuang/FastDiff), [FastDiff Paper](https://arxiv.org/abs/2204.09934), [ProDiff GitHub Repository](https://github.com/Rongjiehuang/ProDiff), [ProDiff Paper](https://arxiv.org/abs/2207.06389).
|
| 35 |
+
- **Cite as:**
|
| 36 |
+
|
| 37 |
+
@inproceedings{huang2022prodiff,
|
| 38 |
+
title={ProDiff: Progressive Fast Diffusion Model For High-Quality Text-to-Speech},
|
| 39 |
+
author={Huang, Rongjie and Zhao, Zhou and Liu, Huadai and Liu, Jinglin and Cui, Chenye and Ren, Yi},
|
| 40 |
+
booktitle={Proceedings of the 30th ACM International Conference on Multimedia},
|
| 41 |
+
year={2022}
}
|
| 42 |
+
|
| 43 |
+
@inproceedings{huang2022fastdiff,
|
| 44 |
+
title={FastDiff: A Fast Conditional Diffusion Model for High-Quality Speech Synthesis},
|
| 45 |
+
author={Huang, Rongjie and Lam, Max WY and Wang, Jun and Su, Dan and Yu, Dong and Ren, Yi and Zhao, Zhou},
|
| 46 |
+
booktitle = {Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence, {IJCAI-22}},
|
| 47 |
+
year={2022}
}
|
| 48 |
+
-
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
*This model card was written based on the [DALL-E Mini model card](https://huggingface.co/dalle-mini/dalle-mini).*
|
checkpoints/FastDiff/config.yaml
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
N: ''
|
| 2 |
+
T: 1000
|
| 3 |
+
accumulate_grad_batches: 1
|
| 4 |
+
amp: false
|
| 5 |
+
audio_channels: 1
|
| 6 |
+
audio_num_mel_bins: 80
|
| 7 |
+
audio_sample_rate: 22050
|
| 8 |
+
aux_context_window: 0
|
| 9 |
+
beta_0: 1.0e-06
|
| 10 |
+
beta_T: 0.01
|
| 11 |
+
binarization_args:
|
| 12 |
+
reset_phone_dict: true
|
| 13 |
+
reset_word_dict: true
|
| 14 |
+
shuffle: false
|
| 15 |
+
trim_eos_bos: false
|
| 16 |
+
with_align: false
|
| 17 |
+
with_f0: false
|
| 18 |
+
with_f0cwt: false
|
| 19 |
+
with_linear: false
|
| 20 |
+
with_spk_embed: false
|
| 21 |
+
with_spk_id: true
|
| 22 |
+
with_txt: false
|
| 23 |
+
with_wav: true
|
| 24 |
+
with_word: false
|
| 25 |
+
binarizer_cls: data_gen.tts.vocoder_binarizer.VocoderBinarizer
|
| 26 |
+
binary_data_dir: data/binary/LJSpeech
|
| 27 |
+
check_val_every_n_epoch: 10
|
| 28 |
+
clip_grad_norm: 1
|
| 29 |
+
clip_grad_value: 0
|
| 30 |
+
cond_channels: 80
|
| 31 |
+
debug: false
|
| 32 |
+
dec_ffn_kernel_size: 9
|
| 33 |
+
dec_layers: 4
|
| 34 |
+
dict_dir: ''
|
| 35 |
+
diffusion_step_embed_dim_in: 128
|
| 36 |
+
diffusion_step_embed_dim_mid: 512
|
| 37 |
+
diffusion_step_embed_dim_out: 512
|
| 38 |
+
disc_start_steps: 40000
|
| 39 |
+
discriminator_grad_norm: 1
|
| 40 |
+
dropout: 0.0
|
| 41 |
+
ds_workers: 1
|
| 42 |
+
enc_ffn_kernel_size: 9
|
| 43 |
+
enc_layers: 4
|
| 44 |
+
endless_ds: true
|
| 45 |
+
eval_max_batches: -1
|
| 46 |
+
ffn_act: gelu
|
| 47 |
+
ffn_padding: SAME
|
| 48 |
+
fft_size: 1024
|
| 49 |
+
fmax: 7600
|
| 50 |
+
fmin: 80
|
| 51 |
+
frames_multiple: 1
|
| 52 |
+
gen_dir_name: ''
|
| 53 |
+
generator_grad_norm: 10
|
| 54 |
+
griffin_lim_iters: 60
|
| 55 |
+
hidden_size: 256
|
| 56 |
+
hop_size: 256
|
| 57 |
+
infer: false
|
| 58 |
+
inner_channels: 32
|
| 59 |
+
kpnet_conv_size: 3
|
| 60 |
+
kpnet_hidden_channels: 64
|
| 61 |
+
load_ckpt: ''
|
| 62 |
+
loud_norm: false
|
| 63 |
+
lr: 2e-4
|
| 64 |
+
lvc_kernel_size: 3
|
| 65 |
+
lvc_layers_each_block: 4
|
| 66 |
+
max_epochs: 1000
|
| 67 |
+
max_frames: 1548
|
| 68 |
+
max_input_tokens: 1550
|
| 69 |
+
max_samples: 25600
|
| 70 |
+
max_sentences: 20
|
| 71 |
+
max_tokens: 30000
|
| 72 |
+
max_updates: 1000000
|
| 73 |
+
max_valid_sentences: 1
|
| 74 |
+
max_valid_tokens: 60000
|
| 75 |
+
mel_loss: l1
|
| 76 |
+
mel_vmax: 1.5
|
| 77 |
+
mel_vmin: -6
|
| 78 |
+
mfa_version: 2
|
| 79 |
+
min_frames: 0
|
| 80 |
+
min_level_db: -100
|
| 81 |
+
noise_schedule: ''
|
| 82 |
+
num_ckpt_keep: 3
|
| 83 |
+
num_heads: 2
|
| 84 |
+
num_mels: 80
|
| 85 |
+
num_sanity_val_steps: -1
|
| 86 |
+
num_spk: 400
|
| 87 |
+
num_test_samples: 0
|
| 88 |
+
num_valid_plots: 10
|
| 89 |
+
optimizer_adam_beta1: 0.9
|
| 90 |
+
optimizer_adam_beta2: 0.98
|
| 91 |
+
out_wav_norm: false
|
| 92 |
+
pitch_extractor: parselmouth
|
| 93 |
+
pre_align_args:
|
| 94 |
+
allow_no_txt: false
|
| 95 |
+
denoise: false
|
| 96 |
+
nsample_per_mfa_group: 1000
|
| 97 |
+
sox_resample: false
|
| 98 |
+
sox_to_wav: false
|
| 99 |
+
trim_sil: false
|
| 100 |
+
txt_processor: en
|
| 101 |
+
use_tone: true
|
| 102 |
+
pre_align_cls: egs.datasets.audio.pre_align.PreAlign
|
| 103 |
+
print_nan_grads: false
|
| 104 |
+
processed_data_dir: data/processed/LJSpeech
|
| 105 |
+
profile_infer: false
|
| 106 |
+
raw_data_dir: data/raw/LJSpeech-1.1
|
| 107 |
+
ref_level_db: 20
|
| 108 |
+
rename_tmux: true
|
| 109 |
+
resume_from_checkpoint: 0
|
| 110 |
+
save_best: true
|
| 111 |
+
save_codes: []
|
| 112 |
+
save_f0: false
|
| 113 |
+
save_gt: true
|
| 114 |
+
scheduler: rsqrt
|
| 115 |
+
seed: 1234
|
| 116 |
+
sort_by_len: true
|
| 117 |
+
task_cls: modules.FastDiff.task.FastDiff.FastDiffTask
|
| 118 |
+
tb_log_interval: 100
|
| 119 |
+
test_ids: []
|
| 120 |
+
test_input_dir: ''
|
| 121 |
+
test_mel_dir: ''
|
| 122 |
+
test_num: 100
|
| 123 |
+
test_set_name: test
|
| 124 |
+
train_set_name: train
|
| 125 |
+
train_sets: ''
|
| 126 |
+
upsample_ratios:
|
| 127 |
+
- 8
|
| 128 |
+
- 8
|
| 129 |
+
- 4
|
| 130 |
+
use_pitch_embed: false
|
| 131 |
+
use_spk_embed: false
|
| 132 |
+
use_spk_id: false
|
| 133 |
+
use_split_spk_id: false
|
| 134 |
+
use_wav: true
|
| 135 |
+
use_weight_norm: true
|
| 136 |
+
use_word_input: false
|
| 137 |
+
val_check_interval: 2000
|
| 138 |
+
valid_infer_interval: 10000
|
| 139 |
+
valid_monitor_key: val_loss
|
| 140 |
+
valid_monitor_mode: min
|
| 141 |
+
valid_set_name: valid
|
| 142 |
+
vocoder_denoise_c: 0.0
|
| 143 |
+
warmup_updates: 8000
|
| 144 |
+
weight_decay: 0
|
| 145 |
+
win_length: null
|
| 146 |
+
win_size: 1024
|
| 147 |
+
window: hann
|
| 148 |
+
word_size: 30000
|
| 149 |
+
work_dir: checkpoints/FastDiff
|
checkpoints/FastDiff/model_ckpt_steps_500000.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee7b6022e525c71a6025b41eeeafff9d6186b52cba76b580d6986bc8674902f3
|
| 3 |
+
size 183951271
|
checkpoints/ProDiff/config.yaml
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accumulate_grad_batches: 1
|
| 2 |
+
amp: false
|
| 3 |
+
audio_num_mel_bins: 80
|
| 4 |
+
audio_sample_rate: 22050
|
| 5 |
+
base_config:
|
| 6 |
+
- ./base.yaml
|
| 7 |
+
binarization_args:
|
| 8 |
+
reset_phone_dict: true
|
| 9 |
+
reset_word_dict: true
|
| 10 |
+
shuffle: false
|
| 11 |
+
trim_eos_bos: false
|
| 12 |
+
trim_sil: false
|
| 13 |
+
with_align: true
|
| 14 |
+
with_f0: true
|
| 15 |
+
with_f0cwt: false
|
| 16 |
+
with_linear: false
|
| 17 |
+
with_spk_embed: false
|
| 18 |
+
with_spk_id: true
|
| 19 |
+
with_txt: true
|
| 20 |
+
with_wav: false
|
| 21 |
+
with_word: true
|
| 22 |
+
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
|
| 23 |
+
binary_data_dir: data/binary/LJSpeech
|
| 24 |
+
check_val_every_n_epoch: 10
|
| 25 |
+
clip_grad_norm: 1
|
| 26 |
+
clip_grad_value: 0
|
| 27 |
+
conv_use_pos: false
|
| 28 |
+
cwt_add_f0_loss: false
|
| 29 |
+
cwt_hidden_size: 128
|
| 30 |
+
cwt_layers: 2
|
| 31 |
+
cwt_loss: l1
|
| 32 |
+
cwt_std_scale: 0.8
|
| 33 |
+
debug: false
|
| 34 |
+
dec_dilations:
|
| 35 |
+
- 1
|
| 36 |
+
- 1
|
| 37 |
+
- 1
|
| 38 |
+
- 1
|
| 39 |
+
dec_ffn_kernel_size: 9
|
| 40 |
+
dec_inp_add_noise: false
|
| 41 |
+
dec_kernel_size: 5
|
| 42 |
+
dec_layers: 4
|
| 43 |
+
dec_num_heads: 2
|
| 44 |
+
decoder_rnn_dim: 0
|
| 45 |
+
decoder_type: fft
|
| 46 |
+
dict_dir: ''
|
| 47 |
+
diff_decoder_type: wavenet
|
| 48 |
+
diff_loss_type: l1
|
| 49 |
+
dilation_cycle_length: 1
|
| 50 |
+
dropout: 0.1
|
| 51 |
+
ds_workers: 2
|
| 52 |
+
dur_enc_hidden_stride_kernel:
|
| 53 |
+
- 0,2,3
|
| 54 |
+
- 0,2,3
|
| 55 |
+
- 0,1,3
|
| 56 |
+
dur_loss: mse
|
| 57 |
+
dur_predictor_kernel: 3
|
| 58 |
+
dur_predictor_layers: 2
|
| 59 |
+
enc_dec_norm: ln
|
| 60 |
+
enc_dilations:
|
| 61 |
+
- 1
|
| 62 |
+
- 1
|
| 63 |
+
- 1
|
| 64 |
+
- 1
|
| 65 |
+
enc_ffn_kernel_size: 9
|
| 66 |
+
enc_kernel_size: 5
|
| 67 |
+
enc_layers: 4
|
| 68 |
+
encoder_K: 8
|
| 69 |
+
encoder_type: fft
|
| 70 |
+
endless_ds: true
|
| 71 |
+
ffn_act: gelu
|
| 72 |
+
ffn_hidden_size: 1024
|
| 73 |
+
ffn_padding: SAME
|
| 74 |
+
fft_size: 1024
|
| 75 |
+
fmax: 7600
|
| 76 |
+
fmin: 80
|
| 77 |
+
frames_multiple: 1
|
| 78 |
+
gen_dir_name: ''
|
| 79 |
+
gen_tgt_spk_id: -1
|
| 80 |
+
griffin_lim_iters: 60
|
| 81 |
+
hidden_size: 256
|
| 82 |
+
hop_size: 256
|
| 83 |
+
infer: false
|
| 84 |
+
keep_bins: 80
|
| 85 |
+
lambda_commit: 0.25
|
| 86 |
+
lambda_energy: 0.1
|
| 87 |
+
lambda_f0: 1.0
|
| 88 |
+
lambda_ph_dur: 0.1
|
| 89 |
+
lambda_sent_dur: 1.0
|
| 90 |
+
lambda_uv: 1.0
|
| 91 |
+
lambda_word_dur: 1.0
|
| 92 |
+
layers_in_block: 2
|
| 93 |
+
load_ckpt: ''
|
| 94 |
+
loud_norm: false
|
| 95 |
+
lr: 1.0
|
| 96 |
+
max_beta: 0.06
|
| 97 |
+
max_epochs: 1000
|
| 98 |
+
max_frames: 1548
|
| 99 |
+
max_input_tokens: 1550
|
| 100 |
+
max_sentences: 48
|
| 101 |
+
max_tokens: 32000
|
| 102 |
+
max_updates: 200000
|
| 103 |
+
max_valid_sentences: 1
|
| 104 |
+
max_valid_tokens: 60000
|
| 105 |
+
mel_loss: ssim:0.5|l1:0.5
|
| 106 |
+
mel_vmax: 1.5
|
| 107 |
+
mel_vmin: -6
|
| 108 |
+
min_frames: 0
|
| 109 |
+
min_level_db: -100
|
| 110 |
+
num_ckpt_keep: 3
|
| 111 |
+
num_heads: 2
|
| 112 |
+
num_sanity_val_steps: -1
|
| 113 |
+
num_spk: 1
|
| 114 |
+
num_test_samples: 0
|
| 115 |
+
num_valid_plots: 10
|
| 116 |
+
optimizer_adam_beta1: 0.9
|
| 117 |
+
optimizer_adam_beta2: 0.98
|
| 118 |
+
out_wav_norm: false
|
| 119 |
+
pitch_ar: false
|
| 120 |
+
pitch_embed_type: 0
|
| 121 |
+
pitch_enc_hidden_stride_kernel:
|
| 122 |
+
- 0,2,5
|
| 123 |
+
- 0,2,5
|
| 124 |
+
- 0,2,5
|
| 125 |
+
pitch_extractor: parselmouth
|
| 126 |
+
pitch_loss: l1
|
| 127 |
+
pitch_norm: standard
|
| 128 |
+
pitch_ssim_win: 11
|
| 129 |
+
pitch_type: frame
|
| 130 |
+
pre_align_args:
|
| 131 |
+
allow_no_txt: false
|
| 132 |
+
denoise: false
|
| 133 |
+
sox_resample: false
|
| 134 |
+
sox_to_wav: false
|
| 135 |
+
trim_sil: false
|
| 136 |
+
txt_processor: en
|
| 137 |
+
use_tone: true
|
| 138 |
+
pre_align_cls: ''
|
| 139 |
+
predictor_dropout: 0.5
|
| 140 |
+
predictor_grad: 0.1
|
| 141 |
+
predictor_hidden: -1
|
| 142 |
+
predictor_kernel: 5
|
| 143 |
+
predictor_layers: 2
|
| 144 |
+
pretrain_fs_ckpt: ''
|
| 145 |
+
print_nan_grads: false
|
| 146 |
+
processed_data_dir: data/processed/LJSpeech
|
| 147 |
+
profile_infer: false
|
| 148 |
+
raw_data_dir: data/raw/LJSpeech
|
| 149 |
+
ref_hidden_stride_kernel:
|
| 150 |
+
- 0,3,5
|
| 151 |
+
- 0,3,5
|
| 152 |
+
- 0,2,5
|
| 153 |
+
- 0,2,5
|
| 154 |
+
- 0,2,5
|
| 155 |
+
ref_level_db: 20
|
| 156 |
+
ref_norm_layer: bn
|
| 157 |
+
rename_tmux: true
|
| 158 |
+
residual_channels: 256
|
| 159 |
+
residual_layers: 20
|
| 160 |
+
resume_from_checkpoint: 0
|
| 161 |
+
save_best: true
|
| 162 |
+
save_codes: []
|
| 163 |
+
save_f0: false
|
| 164 |
+
save_gt: true
|
| 165 |
+
schedule_type: vpsde
|
| 166 |
+
scheduler: rsqrt
|
| 167 |
+
seed: 1234
|
| 168 |
+
sil_add_noise: false
|
| 169 |
+
sort_by_len: true
|
| 170 |
+
spec_max: []
|
| 171 |
+
spec_min: []
|
| 172 |
+
task_cls: modules.ProDiff.task.ProDiff_task.ProDiff_Task
|
| 173 |
+
tb_log_interval: 100
|
| 174 |
+
teacher_ckpt: checkpoints/ProDiff_Teacher/model_ckpt_steps_188000.ckpt
|
| 175 |
+
test_ids: []
|
| 176 |
+
test_input_dir: ''
|
| 177 |
+
test_num: 100
|
| 178 |
+
test_set_name: test
|
| 179 |
+
timesteps: 4
|
| 180 |
+
train_set_name: train
|
| 181 |
+
train_sets: ''
|
| 182 |
+
use_cond_disc: true
|
| 183 |
+
use_energy_embed: true
|
| 184 |
+
use_gt_dur: true
|
| 185 |
+
use_gt_f0: true
|
| 186 |
+
use_pitch_embed: true
|
| 187 |
+
use_pos_embed: true
|
| 188 |
+
use_ref_enc: false
|
| 189 |
+
use_spk_embed: false
|
| 190 |
+
use_spk_id: false
|
| 191 |
+
use_split_spk_id: false
|
| 192 |
+
use_uv: true
|
| 193 |
+
use_var_enc: false
|
| 194 |
+
val_check_interval: 2000
|
| 195 |
+
valid_infer_interval: 10000
|
| 196 |
+
valid_monitor_key: val_loss
|
| 197 |
+
valid_monitor_mode: min
|
| 198 |
+
valid_set_name: valid
|
| 199 |
+
var_enc_vq_codes: 64
|
| 200 |
+
vocoder_denoise_c: 0.0
|
| 201 |
+
warmup_updates: 2000
|
| 202 |
+
weight_decay: 0
|
| 203 |
+
win_size: 1024
|
| 204 |
+
word_size: 30000
|
| 205 |
+
work_dir: checkpoints/ProDiff
|
checkpoints/ProDiff/model_ckpt_steps_200000.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8cc8aad355c297b010e2c362341f736b3477744af76e02f6c9965409a7e9113a
|
| 3 |
+
size 349055740
|
checkpoints/ProDiff_Teacher/config.yaml
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accumulate_grad_batches: 1
|
| 2 |
+
amp: false
|
| 3 |
+
audio_num_mel_bins: 80
|
| 4 |
+
audio_sample_rate: 22050
|
| 5 |
+
base_config:
|
| 6 |
+
- ./base.yaml
|
| 7 |
+
binarization_args:
|
| 8 |
+
reset_phone_dict: true
|
| 9 |
+
reset_word_dict: true
|
| 10 |
+
shuffle: false
|
| 11 |
+
trim_eos_bos: false
|
| 12 |
+
trim_sil: false
|
| 13 |
+
with_align: true
|
| 14 |
+
with_f0: true
|
| 15 |
+
with_f0cwt: false
|
| 16 |
+
with_linear: false
|
| 17 |
+
with_spk_embed: false
|
| 18 |
+
with_spk_id: true
|
| 19 |
+
with_txt: true
|
| 20 |
+
with_wav: false
|
| 21 |
+
with_word: true
|
| 22 |
+
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
|
| 23 |
+
binary_data_dir: data/binary/LJSpeech
|
| 24 |
+
check_val_every_n_epoch: 10
|
| 25 |
+
clip_grad_norm: 1
|
| 26 |
+
clip_grad_value: 0
|
| 27 |
+
conv_use_pos: false
|
| 28 |
+
cwt_add_f0_loss: false
|
| 29 |
+
cwt_hidden_size: 128
|
| 30 |
+
cwt_layers: 2
|
| 31 |
+
cwt_loss: l1
|
| 32 |
+
cwt_std_scale: 0.8
|
| 33 |
+
debug: false
|
| 34 |
+
dec_dilations:
|
| 35 |
+
- 1
|
| 36 |
+
- 1
|
| 37 |
+
- 1
|
| 38 |
+
- 1
|
| 39 |
+
dec_ffn_kernel_size: 9
|
| 40 |
+
dec_inp_add_noise: false
|
| 41 |
+
dec_kernel_size: 5
|
| 42 |
+
dec_layers: 4
|
| 43 |
+
dec_num_heads: 2
|
| 44 |
+
decoder_rnn_dim: 0
|
| 45 |
+
decoder_type: fft
|
| 46 |
+
dict_dir: ''
|
| 47 |
+
diff_decoder_type: wavenet
|
| 48 |
+
diff_loss_type: l1
|
| 49 |
+
dilation_cycle_length: 1
|
| 50 |
+
dropout: 0.1
|
| 51 |
+
ds_workers: 2
|
| 52 |
+
dur_enc_hidden_stride_kernel:
|
| 53 |
+
- 0,2,3
|
| 54 |
+
- 0,2,3
|
| 55 |
+
- 0,1,3
|
| 56 |
+
dur_loss: mse
|
| 57 |
+
dur_predictor_kernel: 3
|
| 58 |
+
dur_predictor_layers: 2
|
| 59 |
+
enc_dec_norm: ln
|
| 60 |
+
enc_dilations:
|
| 61 |
+
- 1
|
| 62 |
+
- 1
|
| 63 |
+
- 1
|
| 64 |
+
- 1
|
| 65 |
+
enc_ffn_kernel_size: 9
|
| 66 |
+
enc_kernel_size: 5
|
| 67 |
+
enc_layers: 4
|
| 68 |
+
encoder_K: 8
|
| 69 |
+
encoder_type: fft
|
| 70 |
+
endless_ds: true
|
| 71 |
+
ffn_act: gelu
|
| 72 |
+
ffn_hidden_size: 1024
|
| 73 |
+
ffn_padding: SAME
|
| 74 |
+
fft_size: 1024
|
| 75 |
+
fmax: 7600
|
| 76 |
+
fmin: 80
|
| 77 |
+
frames_multiple: 1
|
| 78 |
+
gen_dir_name: ''
|
| 79 |
+
gen_tgt_spk_id: -1
|
| 80 |
+
griffin_lim_iters: 60
|
| 81 |
+
hidden_size: 256
|
| 82 |
+
hop_size: 256
|
| 83 |
+
infer: false
|
| 84 |
+
keep_bins: 80
|
| 85 |
+
lambda_commit: 0.25
|
| 86 |
+
lambda_energy: 0.1
|
| 87 |
+
lambda_f0: 1.0
|
| 88 |
+
lambda_ph_dur: 0.1
|
| 89 |
+
lambda_sent_dur: 1.0
|
| 90 |
+
lambda_uv: 1.0
|
| 91 |
+
lambda_word_dur: 1.0
|
| 92 |
+
layers_in_block: 2
|
| 93 |
+
load_ckpt: ''
|
| 94 |
+
loud_norm: false
|
| 95 |
+
lr: 1.0
|
| 96 |
+
max_beta: 0.06
|
| 97 |
+
max_epochs: 1000
|
| 98 |
+
max_frames: 1548
|
| 99 |
+
max_input_tokens: 1550
|
| 100 |
+
max_sentences: 48
|
| 101 |
+
max_tokens: 32000
|
| 102 |
+
max_updates: 200000
|
| 103 |
+
max_valid_sentences: 1
|
| 104 |
+
max_valid_tokens: 60000
|
| 105 |
+
mel_loss: ssim:0.5|l1:0.5
|
| 106 |
+
mel_vmax: 1.5
|
| 107 |
+
mel_vmin: -6
|
| 108 |
+
min_frames: 0
|
| 109 |
+
min_level_db: -100
|
| 110 |
+
num_ckpt_keep: 3
|
| 111 |
+
num_heads: 2
|
| 112 |
+
num_sanity_val_steps: -1
|
| 113 |
+
num_spk: 1
|
| 114 |
+
num_test_samples: 20
|
| 115 |
+
num_valid_plots: 10
|
| 116 |
+
optimizer_adam_beta1: 0.9
|
| 117 |
+
optimizer_adam_beta2: 0.98
|
| 118 |
+
out_wav_norm: false
|
| 119 |
+
pitch_ar: false
|
| 120 |
+
pitch_embed_type: 0
|
| 121 |
+
pitch_enc_hidden_stride_kernel:
|
| 122 |
+
- 0,2,5
|
| 123 |
+
- 0,2,5
|
| 124 |
+
- 0,2,5
|
| 125 |
+
pitch_extractor: parselmouth
|
| 126 |
+
pitch_loss: l1
|
| 127 |
+
pitch_norm: standard
|
| 128 |
+
pitch_ssim_win: 11
|
| 129 |
+
pitch_type: frame
|
| 130 |
+
pre_align_args:
|
| 131 |
+
allow_no_txt: false
|
| 132 |
+
denoise: false
|
| 133 |
+
sox_resample: false
|
| 134 |
+
sox_to_wav: false
|
| 135 |
+
trim_sil: false
|
| 136 |
+
txt_processor: en
|
| 137 |
+
use_tone: true
|
| 138 |
+
pre_align_cls: egs.datasets.audio.lj.pre_align.LJPreAlign
|
| 139 |
+
predictor_dropout: 0.5
|
| 140 |
+
predictor_grad: 0.1
|
| 141 |
+
predictor_hidden: -1
|
| 142 |
+
predictor_kernel: 5
|
| 143 |
+
predictor_layers: 2
|
| 144 |
+
pretrain_fs_ckpt: ''
|
| 145 |
+
print_nan_grads: false
|
| 146 |
+
processed_data_dir: data/processed/LJSpeech
|
| 147 |
+
profile_infer: false
|
| 148 |
+
raw_data_dir: data/raw/LJSpeech
|
| 149 |
+
ref_hidden_stride_kernel:
|
| 150 |
+
- 0,3,5
|
| 151 |
+
- 0,3,5
|
| 152 |
+
- 0,2,5
|
| 153 |
+
- 0,2,5
|
| 154 |
+
- 0,2,5
|
| 155 |
+
ref_level_db: 20
|
| 156 |
+
ref_norm_layer: bn
|
| 157 |
+
rename_tmux: true
|
| 158 |
+
residual_channels: 256
|
| 159 |
+
residual_layers: 20
|
| 160 |
+
resume_from_checkpoint: 0
|
| 161 |
+
save_best: true
|
| 162 |
+
save_codes: []
|
| 163 |
+
save_f0: false
|
| 164 |
+
save_gt: true
|
| 165 |
+
schedule_type: vpsde
|
| 166 |
+
scheduler: rsqrt
|
| 167 |
+
seed: 1234
|
| 168 |
+
sil_add_noise: false
|
| 169 |
+
sort_by_len: true
|
| 170 |
+
spec_max: []
|
| 171 |
+
spec_min: []
|
| 172 |
+
task_cls: modules.ProDiff.task.ProDiff_teacher_task.ProDiff_teacher_Task
|
| 173 |
+
tb_log_interval: 100
|
| 174 |
+
test_ids: []
|
| 175 |
+
test_input_dir: ''
|
| 176 |
+
test_num: 100
|
| 177 |
+
test_set_name: test
|
| 178 |
+
timescale: 1
|
| 179 |
+
timesteps: 4
|
| 180 |
+
train_set_name: train
|
| 181 |
+
train_sets: ''
|
| 182 |
+
use_cond_disc: true
|
| 183 |
+
use_energy_embed: true
|
| 184 |
+
use_gt_dur: true
|
| 185 |
+
use_gt_f0: true
|
| 186 |
+
use_pitch_embed: true
|
| 187 |
+
use_pos_embed: true
|
| 188 |
+
use_ref_enc: false
|
| 189 |
+
use_spk_embed: false
|
| 190 |
+
use_spk_id: false
|
| 191 |
+
use_split_spk_id: false
|
| 192 |
+
use_uv: true
|
| 193 |
+
use_var_enc: false
|
| 194 |
+
val_check_interval: 2000
|
| 195 |
+
valid_infer_interval: 10000
|
| 196 |
+
valid_monitor_key: val_loss
|
| 197 |
+
valid_monitor_mode: min
|
| 198 |
+
valid_set_name: valid
|
| 199 |
+
var_enc_vq_codes: 64
|
| 200 |
+
vocoder_denoise_c: 0.0
|
| 201 |
+
warmup_updates: 2000
|
| 202 |
+
weight_decay: 0
|
| 203 |
+
win_size: 1024
|
| 204 |
+
word_size: 30000
|
| 205 |
+
work_dir: checkpoints/ProDiff_Teacher
|
checkpoints/ProDiff_Teacher/model_ckpt_steps_188000.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5d3d02a215431c69dd54c1413b9a02cdc32795e2039ad9be857b12e85c470eea
|
| 3 |
+
size 342252871
|
data/binary/LJSpeech/phone_set.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
|
data/binary/LJSpeech/spk_map.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"SPK1": 0}
|
data/binary/LJSpeech/train_f0s_mean_std.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8790d5a84d77143690ae71a1f1e7fc81359e69ead263dc440366f2164c739efd
|
| 3 |
+
size 144
|