File size: 5,108 Bytes

0b61e28

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a3ddcc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from inference import StyleTTS2\n",
    "\n",
    "import librosa\n",
    "import IPython.display as ipd\n",
    "import torch.cuda\n",
    "\n",
    "# Prefer the GPU when CUDA is available; otherwise run on the CPU.\n",
    "if torch.cuda.is_available():\n",
    "    device = 'cuda'\n",
    "else:\n",
    "    device = 'cpu'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "092cfb69",
   "metadata": {},
   "source": [
    "### Load G2P"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a152ec13",
   "metadata": {},
   "source": [
    "If you did not use eSpeak to phonemize your language, replace the `get_phoneme` function below with your own grapheme-to-phoneme (G2P) converter."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca224f37",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import phonemizer\n",
    "if sys.platform[:3] == \"win\":\n",
    "    # Windows only: hand phonemizer the eSpeak NG library path reported by\n",
    "    # espeakng_loader. Both imports live inside the try so that a missing\n",
    "    # package is reported instead of aborting the cell.\n",
    "    try:\n",
    "        from phonemizer.backend.espeak.wrapper import EspeakWrapper\n",
    "        import espeakng_loader\n",
    "        EspeakWrapper.set_library(espeakng_loader.get_library_path())\n",
    "    except Exception as err:\n",
    "        # Best effort: print the problem and carry on.\n",
    "        print(err)\n",
    "\n",
    "def get_phoneme(text, lang):\n",
    "    \"\"\"Convert `text` to phonemes with phonemizer's eSpeak backend.\n",
    "\n",
    "    Args:\n",
    "        text: Text to phonemize.\n",
    "        lang: Language code handed to EspeakBackend (e.g. \"en-us\").\n",
    "\n",
    "    Returns:\n",
    "        The phonemized string, or None if any step raised; in that case\n",
    "        the exception is printed instead of propagated.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        backend = phonemizer.backend.EspeakBackend(\n",
    "            language=lang,\n",
    "            preserve_punctuation=True,\n",
    "            with_stress=True,\n",
    "            language_switch='remove-flags',\n",
    "        )\n",
    "        phonemized = backend.phonemize([text])\n",
    "        return phonemized[0]\n",
    "    except Exception as err:\n",
    "        print(err)\n",
    "        return None"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b9cecbe",
   "metadata": {},
   "source": [
    "### Load models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7b9c01d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Paths handed to StyleTTS2(config_path, models_path) when the model is created further down.\n",
    "config_path = \"Models/config.yaml\"\n",
    "models_path = \"Models/inference/model.pth\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b803110e",
   "metadata": {},
   "source": [
    "### Synthesize speech\n",
    "\n",
    "Note: The reference audio has a huge impact on the result. For best results, choose a clip about 10 seconds long with a consistent tone and speaking speed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78396f70",
   "metadata": {},
   "outputs": [],
   "source": [
    "speaker = {\n",
    "    \"path\": \"./Audio/1_heart.wav\",  # Reference audio path\n",
    "    \"speed\": 1.0,                    # Speaking speed\n",
    "}\n",
    "\n",
    "sample_rate = 24000              # Sampling rate (Hz) used for loading and playback in this cell\n",
    "max_samples = sample_rate * 20   # Keep at most 20 seconds of reference audio\n",
    "print(speaker['path'])\n",
    "# librosa.load resamples to the requested `sr`, so the returned `sr` always equals\n",
    "# `sample_rate` and no separate librosa.resample step is needed.\n",
    "wave, sr = librosa.load(speaker['path'], sr=sample_rate)\n",
    "# Trim leading/trailing audio that is more than 30 dB below the reference level.\n",
    "audio, index = librosa.effects.trim(wave, top_db=30)\n",
    "# Slicing is a no-op when the clip is already shorter than max_samples.\n",
    "audio = audio[:max_samples]\n",
    "display(ipd.Audio(audio, rate=sr, normalize=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "395959f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text to synthesize; it is phonemized with get_phoneme() in the synthesis cell.\n",
    "text = '''\n",
    "Nearly 300 scholars currently working in the United States have applied for positions at Aix-Marseille University in France, which has announced a program to provide a haven for academics affected by the Trump administration's policies.\n",
    "Aix-Marseille launched the \"Safe Place for Science\" initiative earlier this year, offering three-year funded placements for approximately 20 researchers. The program aims to support scholars facing budget cuts and policy changes that have disrupted U.S. academic institutions.\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16194211",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the model from the config and checkpoint paths, then call .eval() and move it to `device`.\n",
    "model = StyleTTS2(config_path, models_path)\n",
    "model = model.eval().to(device)\n",
    "\n",
    "# Inference settings passed to model.get_styles() / model.generate() in the synthesis cell.\n",
    "avg_style = True   # bool:  split the reference audio and average the resulting styles\n",
    "stabilize = False  # bool:  stabilize the speaking speed\n",
    "denoise   = 0.3    # float: denoiser strength, in the range [0, 1]\n",
    "n_merge   = 16     # int:   merge a sentence with fewer than n words to avoid short sentences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "980c6fbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "with torch.no_grad():  # inference only, so gradient tracking is disabled\n",
    "    phonemes = get_phoneme(text=text, lang=\"en-us\")\n",
    "    # get_phoneme() prints the error and returns None when phonemization fails;\n",
    "    # stop here with a clear message instead of handing None to the model.\n",
    "    if phonemes is None:\n",
    "        raise RuntimeError(\"Phonemization failed; see the error printed above.\")\n",
    "\n",
    "    styles  = model.get_styles(speaker, denoise, avg_style)\n",
    "    r       = model.generate(phonemes, styles, stabilize, n_merge)\n",
    "\n",
    "print('Synthesized:')\n",
    "display(ipd.Audio(r, rate=24000, normalize=True))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}