File size: 5,108 Bytes
0b61e28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a3ddcc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from inference import StyleTTS2\n",
    "\n",
    "import librosa\n",
    "import IPython.display as ipd\n",
    "import torch.cuda\n",
    "\n",
    "# Run on the GPU when torch reports CUDA as available; otherwise use the CPU.\n",
    "device = 'cpu'\n",
    "if torch.cuda.is_available():\n",
    "    device = 'cuda'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "092cfb69",
   "metadata": {},
   "source": [
    "### Load G2P"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a152ec13",
   "metadata": {},
   "source": [
    "If you did not use eSpeak for your language, replace `get_phoneme` in the next cell with your own G2P (grapheme-to-phoneme) function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca224f37",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import phonemizer\n",
    "# Windows only: hand phonemizer's eSpeak wrapper the library path reported by\n",
    "# espeakng_loader. This is best effort: any failure is printed and the cell continues.\n",
    "if sys.platform.startswith(\"win\"):\n",
    "    try:\n",
    "        from phonemizer.backend.espeak.wrapper import EspeakWrapper\n",
    "        import espeakng_loader\n",
    "\n",
    "        espeak_library = espeakng_loader.get_library_path()\n",
    "        EspeakWrapper.set_library(espeak_library)\n",
    "    except Exception as setup_error:\n",
    "        print(setup_error)\n",
    "\n",
    "def get_phoneme(text, lang):\n",
    "    \"\"\"Convert `text` to a phoneme string with phonemizer's eSpeak backend.\n",
    "\n",
    "    Args:\n",
    "        text: Text to phonemize.\n",
    "        lang: Language code handed to `EspeakBackend` (e.g. \"en-us\").\n",
    "\n",
    "    Returns:\n",
    "        The phonemized form of `text`. The backend is configured to preserve\n",
    "        punctuation, mark stress and remove language-switch flags.\n",
    "\n",
    "    Raises:\n",
    "        RuntimeError: If the backend cannot be created or phonemization fails.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        backend = phonemizer.backend.EspeakBackend(\n",
    "            language=lang,\n",
    "            preserve_punctuation=True,\n",
    "            with_stress=True,\n",
    "            language_switch='remove-flags',\n",
    "        )\n",
    "        # phonemize() takes a list of texts; only one is passed, so take item 0.\n",
    "        return backend.phonemize([text])[0]\n",
    "    except Exception as err:\n",
    "        # Fail here instead of printing and returning None, which would only\n",
    "        # surface later as an unrelated error inside the synthesis call.\n",
    "        raise RuntimeError(f\"Phonemization failed for lang={lang!r}: {err}\") from err"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b9cecbe",
   "metadata": {},
   "source": [
    "### Load models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7b9c01d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relative paths; both are passed to StyleTTS2 when the model is built.\n",
    "models_path = \"Models/inference/model.pth\"\n",
    "config_path = \"Models/config.yaml\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b803110e",
   "metadata": {},
   "source": [
    "### Synthesize speech\n",
    "\n",
    "**Note:** The reference audio has a huge impact on the result. It is best to choose a clip of around 10 seconds that is consistent in both tone and speed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78396f70",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reference speaker settings; this dict is what gets passed to the model later.\n",
    "speaker = {\n",
    "    \"path\": \"./Audio/1_heart.wav\",  # Reference audio path\n",
    "    \"speed\": 1.0,                   # Speaking speed\n",
    "}\n",
    "\n",
    "sample_rate = 24000              # Rate used for loading and playback in this cell\n",
    "max_samples = sample_rate * 20   # Keep at most 20 seconds of reference audio\n",
    "\n",
    "# Preview the reference clip: load, trim below the 30 dB threshold, cap the length.\n",
    "print(speaker['path'])\n",
    "wave, sr = librosa.load(speaker['path'], sr=sample_rate)\n",
    "audio, index = librosa.effects.trim(wave, top_db=30)\n",
    "if sr != sample_rate:\n",
    "    # Defensive only: load() above already resamples to `sample_rate`.\n",
    "    # The rates are passed by keyword because recent librosa releases\n",
    "    # no longer accept them positionally.\n",
    "    audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)\n",
    "audio = audio[:max_samples]\n",
    "display(ipd.Audio(audio, rate=sample_rate, normalize=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "395959f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input text to synthesize; it is phonemized with lang=\"en-us\" in the synthesis cell.\n",
    "text = '''\n",
    "Nearly 300 scholars currently working in the United States have applied for positions at Aix-Marseille University in France, which has announced a program to provide a haven for academics affected by the Trump administration's policies.\n",
    "Aix-Marseille launched the \"Safe Place for Science\" initiative earlier this year, offering three-year funded placements for approximately 20 researchers. The program aims to support scholars facing budget cuts and policy changes that have disrupted U.S. academic institutions.\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16194211",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the model from the config and checkpoint, call eval() on it and move it to `device`.\n",
    "model = StyleTTS2(config_path, models_path)\n",
    "model = model.eval()\n",
    "model = model.to(device)\n",
    "\n",
    "# Synthesis options\n",
    "avg_style = True    # bool:  split the ref audio and calculate the avg styles\n",
    "stabilize = False   # bool:  stabilize speaking speed\n",
    "denoise   = 0.3     # float: strength of the denoiser, value range is [0, 1]\n",
    "n_merge   = 16      # int:   avoid short sentences by merging when a sentence has fewer than n words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "980c6fbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Phonemize the input, extract styles from the reference speaker, then generate audio.\n",
    "with torch.no_grad():\n",
    "    phonemes = get_phoneme(text=text, lang=\"en-us\")\n",
    "    # Stop early when G2P yields nothing (None or an empty string) instead of\n",
    "    # handing that to the model and failing with a less obvious error.\n",
    "    if not phonemes:\n",
    "        raise ValueError(\"Phonemization produced no output; check the G2P cell and the input text.\")\n",
    "\n",
    "    styles = model.get_styles(speaker, denoise, avg_style)\n",
    "    synthesized = model.generate(phonemes, styles, stabilize, n_merge)\n",
    "\n",
    "print('Synthesized:')\n",
    "display(ipd.Audio(synthesized, rate=24000, normalize=True))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}