Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

.gitattributes +1 -0
Confusion-matrix-of-speaker-dependent-emotions-prediction-on-RAVDESS-corpus-with-8202.png +0 -0
README.md +133 -3
loss and accuracy.png +3 -0
predict.py +87 -0
requirements.txt +5 -0
trained_model.h5 +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+loss[[:space:]]and[[:space:]]accuracy.png filter=lfs diff=lfs merge=lfs -text

Confusion-matrix-of-speaker-dependent-emotions-prediction-on-RAVDESS-corpus-with-8202.png ADDED Viewed

README.md CHANGED Viewed

@@ -1,3 +1,133 @@
----
-license: mit
----

+# Speech Emotion Recognition Model
+This model performs speech emotion recognition, classifying audio into 8 different emotional states.
+## Model Description
+This is a deep learning model trained to recognize emotions from speech audio. The model can classify audio into the following emotions:
+- 😐 Neutral
+- 😌 Calm
+- 😊 Happy
+- 😢 Sad
+- 😠 Angry
+- 😨 Fearful
+- 🤢 Disgust
+- 😲 Surprised
+## Model Architecture
+The model operates on audio features extracted from each clip, including:
+- MFCC (Mel-frequency cepstral coefficients)
+- Chroma features
+- Mel-spectrogram features
+## Usage
+```python
+import librosa
+import numpy as np
+from tensorflow.keras.models import load_model
+# Load the model
+model = load_model('trained_model.h5')
+# Feature extraction helper (mean MFCC, chroma, and mel-spectrogram values)
+def extract_feature(data, sr, mfcc=True, chroma=True, mel=True):
+    result = np.array([])
+    if mfcc:
+        mfccs = np.mean(librosa.feature.mfcc(y=data, sr=sr, n_mfcc=40).T, axis=0)
+        result = np.hstack((result, mfccs))
+    if chroma:
+        stft = np.abs(librosa.stft(data))
+        chroma_feat = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
+        result = np.hstack((result, chroma_feat))
+    if mel:
+        mel_feat = np.mean(librosa.feature.melspectrogram(y=data, sr=sr).T, axis=0)
+        result = np.hstack((result, mel_feat))
+    return result
+# Load audio file
+audio_path = "your_audio_file.wav"
+data, sr = librosa.load(audio_path, sr=22050)
+# Extract features
+feature = extract_feature(data, sr, mfcc=True, chroma=True, mel=True)
+feature = np.expand_dims(feature, axis=0)
+feature = np.expand_dims(feature, axis=2)
+# Make prediction
+prediction = model.predict(feature)
+predicted_class = np.argmax(prediction, axis=1)
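+# Map the predicted class index to an emotion label.
+# NOTE: predict.py decodes indices with a LabelEncoder fitted on the label names
+# (alphabetical order), which differs from this mapping; verify against training.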
+emotions = {
+    0: 'Neutral',
+    1: 'Calm',
+    2: 'Happy',
+    3: 'Sad',
+    4: 'Angry',
+    5: 'Fearful',
+    6: 'Disgust',
+    7: 'Surprised'
+}
+predicted_emotion = emotions[predicted_class[0]]
+print(f"Predicted emotion: {predicted_emotion}")
+```
+## Requirements
+```
+librosa
+tensorflow
+numpy
+scikit-learn
+```
+## Training Data
+The model was trained on the RAVDESS (Ryerson Audio-Visual Database of Emotional Speech and Song) dataset, which contains speech emotion recordings with the following emotion categories:
+- Neutral
+- Calm
+- Happy
+- Sad
+- Angry
+- Fearful
+- Disgust
+- Surprised
+The dataset provides high-quality audio recordings from multiple speakers, allowing the model to learn robust emotion recognition patterns across different voices and speaking styles.
+## Model Performance
+The model has been trained and evaluated with the following performance metrics:
+### Training Progress
+![Loss and Accuracy](loss%20and%20accuracy.png)
+The training curves show the model's learning progress over epochs, demonstrating convergence and good generalization.
+### Confusion Matrix
+![Confusion Matrix](Confusion-matrix-of-speaker-dependent-emotions-prediction-on-RAVDESS-corpus-with-8202.png)
+The confusion matrix shows the model's performance on the RAVDESS dataset, demonstrating how well the model distinguishes between different emotional states.
+## License
+[Specify your license here]
+## Citation
+If you use this model, please cite:
+```
+@misc{speech-emotion-recognition,
+  author = {JagjeevanAK},
+  title = {Speech Emotion Recognition Model},
+  year = {2025},
+  publisher = {Hugging Face},
+  url = {https://huggingface.co/JagjeevanAK/Speech-emotion-detection}
+}
+```

loss and accuracy.png ADDED Viewed

Git LFS Details

SHA256: c17cabeb13be1dc5659541a55e16b8f8c4c5443b904235b259cde1472a467928
Pointer size: 131 Bytes
Size of remote file: 213 kB

predict.py ADDED Viewed

	@@ -0,0 +1,87 @@

+"""
+Speech Emotion Recognition Inference Script
+"""
+import librosa
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.models import load_model
+from sklearn.preprocessing import LabelEncoder
+import argparse
+def extract_feature(data, sr, mfcc=True, chroma=True, mel=True):
+    """
+    Extract features from audio files into numpy array
+    """
+    result = np.array([])
+    if mfcc:
+        mfccs = np.mean(librosa.feature.mfcc(y=data, sr=sr, n_mfcc=40).T, axis=0)
+        result = np.hstack((result, mfccs))
+    if chroma:
+        stft = np.abs(librosa.stft(data))
+        chroma_feat = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
+        result = np.hstack((result, chroma_feat))
+    if mel:
+        mel_feat = np.mean(librosa.feature.melspectrogram(y=data, sr=sr).T, axis=0)
+        result = np.hstack((result, mel_feat))
+    return result
+def predict_emotion(audio_path, model_path='trained_model.h5'):
+    """
+    Predict emotion from audio file
+    """
+    # Load audio
+    data, sr = librosa.load(audio_path, sr=22050)
+    # Extract features
+    feature = extract_feature(data, sr, mfcc=True, chroma=True, mel=True)
+    feature = np.expand_dims(feature, axis=0)
+    feature = np.expand_dims(feature, axis=2)
+    # Load model and predict
+    model = load_model(model_path)
+    prediction = model.predict(feature)
+    predicted_class = np.argmax(prediction, axis=1)
+    # Map to emotion labels
+    emotions = {
+        '01': 'Neutral',
+        '02': 'Calm',
+        '03': 'Happy',
+        '04': 'Sad',
+        '05': 'Angry',
+        '06': 'Fearful',
+        '07': 'Disgust',
+        '08': 'Surprised'
+    }
+    emojis = {
+        'Neutral': '😐',
+        'Calm': '😌',
+        'Happy': '😊',
+        'Sad': '😢',
+        'Angry': '😠',
+        'Fearful': '😨',
+        'Disgust': '🤢',
+        'Surprised': '😲'
+    }
+    label_encoder = LabelEncoder()
+    label_encoder.fit(list(emotions.values()))
+    predicted_emotion = label_encoder.inverse_transform(predicted_class)[0]
+    return predicted_emotion, emojis[predicted_emotion], prediction[0]
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Predict emotion from audio file')
+    parser.add_argument('audio_path', help='Path to audio file')
+    parser.add_argument('--model', default='trained_model.h5', help='Path to model file')
+    args = parser.parse_args()
+    try:
+        emotion, emoji, confidence = predict_emotion(args.audio_path, args.model)
+        print(f"Predicted Emotion: {emotion} {emoji}")
+        print(f"Confidence scores: {confidence}")
+    except Exception as e:
+        print(f"Error: {e}")

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+librosa>=0.8.0
+tensorflow>=2.8.0
+numpy>=1.21.0
+scikit-learn>=1.0.0
+scipy>=1.7.0

trained_model.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:90111a9b8a4107d0b9c247a5deaba1676000d8d1fcecdf8e9fc3d465f1a459e7
+size 4281088