File size: 5,108 Bytes
0b61e28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "5a3ddcc8",
"metadata": {},
"outputs": [],
"source": [
"from inference import StyleTTS2\n",
"\n",
"import librosa\n",
"import IPython.display as ipd\n",
"import torch.cuda\n",
"\n",
"# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.\n",
"if torch.cuda.is_available():\n",
"    device = 'cuda'\n",
"else:\n",
"    device = 'cpu'"
]
},
{
"cell_type": "markdown",
"id": "092cfb69",
"metadata": {},
"source": [
"### Load G2P"
]
},
{
"cell_type": "markdown",
"id": "a152ec13",
"metadata": {},
"source": [
"If you did not use eSpeak for your language, replace `get_phoneme` below with your own G2P (grapheme-to-phoneme) converter."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ca224f37",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import phonemizer\n",
"# Windows only: point phonemizer's EspeakWrapper at the library path reported\n",
"# by espeakng_loader. This is best-effort -- any failure is printed and the\n",
"# notebook carries on.\n",
"if sys.platform.startswith(\"win\"):\n",
"    try:\n",
"        from phonemizer.backend.espeak.wrapper import EspeakWrapper\n",
"        import espeakng_loader\n",
"        EspeakWrapper.set_library(espeakng_loader.get_library_path())\n",
"    except Exception as setup_error:\n",
"        print(setup_error)\n",
"\n",
"def get_phoneme(text, lang):\n",
"    \"\"\"Phonemize `text` with phonemizer's eSpeak backend.\n",
"\n",
"    Args:\n",
"        text: Text to convert.\n",
"        lang: Language code passed to the eSpeak backend, e.g. \"en-us\".\n",
"\n",
"    Returns:\n",
"        The phonemized string. Punctuation and stress marks are preserved and\n",
"        language-switch flags are removed.\n",
"\n",
"    Raises:\n",
"        RuntimeError: If the backend cannot be created or phonemization fails.\n",
"    \"\"\"\n",
"    try:\n",
"        backend = phonemizer.backend.EspeakBackend(\n",
"            language=lang,\n",
"            preserve_punctuation=True,\n",
"            with_stress=True,\n",
"            language_switch='remove-flags',\n",
"        )\n",
"        return backend.phonemize([text])[0]\n",
"    except Exception as e:\n",
"        # Fail loudly here instead of printing and returning None, which would\n",
"        # only surface later as an unrelated error inside the model.\n",
"        raise RuntimeError(f\"Phonemization failed for lang={lang!r}: {e}\") from e"
]
},
{
"cell_type": "markdown",
"id": "7b9cecbe",
"metadata": {},
"source": [
"### Load models"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7b9c01d",
"metadata": {},
"outputs": [],
"source": [
"config_path = \"Models/config.yaml\"\n",
"models_path = \"Models/inference/model.pth\""
]
},
{
"cell_type": "markdown",
"id": "b803110e",
"metadata": {},
"source": [
"### Synthesize speech\n",
"\n",
"Note: The reference audio has a huge impact on the result. It is best to choose a clip about 10 seconds long with a consistent tone and speaking speed."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "78396f70",
"metadata": {},
"outputs": [],
"source": [
"speaker = {\n",
"    \"path\": \"./Audio/1_heart.wav\",  # Ref audio path\n",
"    \"speed\": 1.0,  # Speaking speed\n",
"}\n",
"\n",
"sample_rate = 24000  # rate the reference audio is loaded and played back at\n",
"max_samples = sample_rate * 20  # max 20 seconds ref audio\n",
"print(speaker['path'])\n",
"# librosa.load resamples to `sr` while loading, so the returned rate always\n",
"# equals sample_rate and no separate resample step is needed.\n",
"wave, sr = librosa.load(speaker['path'], sr=sample_rate)\n",
"# Trim leading and trailing audio below the 30 dB threshold.\n",
"audio, index = librosa.effects.trim(wave, top_db=30)\n",
"# Slicing is a no-op for clips that are already short enough.\n",
"audio = audio[:max_samples]\n",
"display(ipd.Audio(audio, rate=sample_rate, normalize=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "395959f1",
"metadata": {},
"outputs": [],
"source": [
"text = '''\n",
"Nearly 300 scholars currently working in the United States have applied for positions at Aix-Marseille University in France, which has announced a program to provide a haven for academics affected by the Trump administration's policies.\n",
"Aix-Marseille launched the \"Safe Place for Science\" initiative earlier this year, offering three-year funded placements for approximately 20 researchers. The program aims to support scholars facing budget cuts and policy changes that have disrupted U.S. academic institutions.\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16194211",
"metadata": {},
"outputs": [],
"source": [
"# Synthesis options used by the synthesis cell.\n",
"avg_style = True   # bool: split the ref audio and average the resulting styles\n",
"stabilize = False  # bool: stabilize the speaking speed\n",
"denoise = 0.3      # float in [0, 1]: strength of the denoiser\n",
"n_merge = 16       # int: merge a sentence with fewer than n words to avoid short sentences\n",
"\n",
"# Build the model from the config and checkpoint paths, switch it to eval\n",
"# mode and move it to the selected device.\n",
"model = StyleTTS2(config_path, models_path).eval().to(device)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "980c6fbb",
"metadata": {},
"outputs": [],
"source": [
"# Inference only: no gradients are needed.\n",
"with torch.no_grad():\n",
"    phonemes = get_phoneme(text=text, lang=\"en-us\")\n",
"    # Guard against a missing or empty phoneme string (e.g. a failed\n",
"    # phonemization) so the problem is reported here, not inside the model.\n",
"    if not phonemes:\n",
"        raise ValueError(\"Phonemization produced no output; check the eSpeak setup and the input text.\")\n",
"\n",
"    styles = model.get_styles(speaker, denoise, avg_style)\n",
"    r = model.generate(phonemes, styles, stabilize, n_merge)\n",
"\n",
"print('Synthesized:')\n",
"display(ipd.Audio(r, rate=24000, normalize=True))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|