Emmylahot12
/

TextToCloneSpeech

speech synthesis

voice generation

Model card Files Files and versions

Emmylahot12 committed on May 8

Commit

745a2c1

·

verified ·

1 Parent(s): 2056127

Create train.py

Files changed (1) hide show

train.py +55 -0

train.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import pandas as pd
+import tensorflow as tf
+import numpy as np
+import librosa
+import os
+# === CONFIG ===
+DATA_PATH = "data/transcriptions.csv"  # CSV read by load_data(); its 'text' and 'file' columns are used
+AUDIO_DIR = "data"  # directory that each 'file' entry is joined onto
+MODEL_PATH = "model/clone_tts_model.h5"  # where train_model() saves the trained model
+SAMPLE_RATE = 22050  # passed as sr= to librosa.load for every clip
+TEXT_MAX_LEN = 100  # Max characters per text; also the model's input width
+# === Load and preprocess dataset ===
+def load_data():
+    data = pd.read_csv(DATA_PATH)
+    texts = data['text'].values
+    audio_arrays = []
+    for file in data['file']:
+        audio_path = os.path.join(AUDIO_DIR, file)
+        y, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
+        audio_arrays.append(y)
+    max_audio_len = max(len(a) for a in audio_arrays)
+    padded_audios = np.array([np.pad(a, (0, max_audio_len - len(a))) for a in audio_arrays])
+    padded_texts = np.array([
+        [ord(c) for c in text.ljust(TEXT_MAX_LEN)[:TEXT_MAX_LEN]] for text in texts
+    ])
+    return padded_texts, padded_audios, max_audio_len
+# === Build and train model ===
+def train_model():
+    print("Loading and preparing data...")
+    X, y, audio_len = load_data()
+    print("Building model...")
+    model = tf.keras.Sequential([
+        tf.keras.layers.Input(shape=(TEXT_MAX_LEN,)),
+        tf.keras.layers.Dense(256, activation='relu'),
+        tf.keras.layers.Dense(audio_len)
+    ])
+    model.compile(optimizer='adam', loss='mse')
+    print("Training...")
+    model.fit(X, y, epochs=10, batch_size=4)
+    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
+    model.save(MODEL_PATH)
+    print(f"Model saved to {MODEL_PATH}")
+if __name__ == "__main__":  # run training only when executed as a script
+    train_model()