JagjeevanAK committed on
Commit
9401b94
·
verified ·
1 Parent(s): f4ab329

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ loss[[:space:]]and[[:space:]]accuracy.png filter=lfs diff=lfs merge=lfs -text
Confusion-matrix-of-speaker-dependent-emotions-prediction-on-RAVDESS-corpus-with-8202.png ADDED
README.md CHANGED
@@ -1,3 +1,133 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Speech Emotion Recognition Model
2
+
3
+ This model performs speech emotion recognition, classifying audio into 8 different emotional states.
4
+
5
+ ## Model Description
6
+
7
+ This is a deep learning model trained to recognize emotions from speech audio. The model can classify audio into the following emotions:
8
+
9
+ - 😐 Neutral
10
+ - 😌 Calm
11
+ - 😊 Happy
12
+ - 😢 Sad
13
+ - 😠 Angry
14
+ - 😨 Fearful
15
+ - 🤢 Disgust
16
+ - 😲 Surprised
17
+
18
+ ## Model Architecture
19
+
20
+ The model uses audio feature extraction, including:
21
+ - MFCC (Mel-frequency cepstral coefficients)
22
+ - Chroma features
23
+ - Mel-spectrogram features
24
+
25
+ ## Usage
26
+
27
+ ```python
28
+ import librosa
29
+ import numpy as np
30
+ from tensorflow.keras.models import load_model
31
+
32
+ # Load the model
33
+ model = load_model('trained_model.h5')
34
+
35
+ # Load and preprocess audio
36
+ def extract_feature(data, sr, mfcc=True, chroma=True, mel=True):
37
+ result = np.array([])
38
+ if mfcc:
39
+ mfccs = np.mean(librosa.feature.mfcc(y=data, sr=sr, n_mfcc=40).T, axis=0)
40
+ result = np.hstack((result, mfccs))
41
+ if chroma:
42
+ stft = np.abs(librosa.stft(data))
43
+ chroma_feat = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
44
+ result = np.hstack((result, chroma_feat))
45
+ if mel:
46
+ mel_feat = np.mean(librosa.feature.melspectrogram(y=data, sr=sr).T, axis=0)
47
+ result = np.hstack((result, mel_feat))
48
+ return result
49
+
50
+ # Load audio file
51
+ audio_path = "your_audio_file.wav"
52
+ data, sr = librosa.load(audio_path, sr=22050)
53
+
54
+ # Extract features
55
+ feature = extract_feature(data, sr, mfcc=True, chroma=True, mel=True)
56
+ feature = np.expand_dims(feature, axis=0)
57
+ feature = np.expand_dims(feature, axis=2)
58
+
59
+ # Make prediction
60
+ prediction = model.predict(feature)
61
+ predicted_class = np.argmax(prediction, axis=1)
62
+
63
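+ # Map to emotion labels (NOTE: predict.py decodes class indices in alphabetical label order via LabelEncoder; confirm which order matches training)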
64
+ emotions = {
65
+ 0: 'Neutral',
66
+ 1: 'Calm',
67
+ 2: 'Happy',
68
+ 3: 'Sad',
69
+ 4: 'Angry',
70
+ 5: 'Fearful',
71
+ 6: 'Disgust',
72
+ 7: 'Surprised'
73
+ }
74
+
75
+ predicted_emotion = emotions[predicted_class[0]]
76
+ print(f"Predicted emotion: {predicted_emotion}")
77
+ ```
78
+
79
+ ## Requirements
80
+
81
+ ```
82
+ librosa
83
+ tensorflow
84
+ numpy
85
+ scikit-learn
86
+ ```
87
+
88
+ ## Training Data
89
+
90
+ The model was trained on the RAVDESS (Ryerson Audio-Visual Database of Emotional Speech and Song) dataset, which contains speech emotion recordings with the following emotion categories:
91
+
92
+ - Neutral
93
+ - Calm
94
+ - Happy
95
+ - Sad
96
+ - Angry
97
+ - Fearful
98
+ - Disgust
99
+ - Surprised
100
+
101
+ The dataset provides high-quality audio recordings from multiple speakers, allowing the model to learn robust emotion recognition patterns across different voices and speaking styles.
102
+
103
+ ## Model Performance
104
+
105
+ The model has been trained and evaluated with the following performance metrics:
106
+
107
+ ### Training Progress
108
+ ![Loss and Accuracy](loss%20and%20accuracy.png)
109
+
110
+ The training curves show the model's learning progress over epochs, demonstrating convergence and good generalization.
111
+
112
+ ### Confusion Matrix
113
+ ![Confusion Matrix](Confusion-matrix-of-speaker-dependent-emotions-prediction-on-RAVDESS-corpus-with-8202.png)
114
+
115
+ The confusion matrix shows the model's performance on the RAVDESS dataset, demonstrating how well the model distinguishes between different emotional states.
116
+
117
+ ## License
118
+
119
+ This model is released under the MIT License.
120
+
121
+ ## Citation
122
+
123
+ If you use this model, please cite:
124
+
125
+ ```
126
+ @misc{speech-emotion-recognition,
127
+ author = {JagjeevanAK},
128
+ title = {Speech Emotion Recognition Model},
129
+ year = {2025},
130
+ publisher = {Hugging Face},
131
+ url = {https://huggingface.co/JagjeevanAK/Speech-emotion-detection}
132
+ }
133
+ ```
loss and accuracy.png ADDED

Git LFS Details

  • SHA256: c17cabeb13be1dc5659541a55e16b8f8c4c5443b904235b259cde1472a467928
  • Pointer size: 131 Bytes
  • Size of remote file: 213 kB
predict.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Speech Emotion Recognition Inference Script
3
+ """
4
+
5
+ import librosa
6
+ import numpy as np
7
+ import tensorflow as tf
8
+ from tensorflow.keras.models import load_model
9
+ from sklearn.preprocessing import LabelEncoder
10
+ import argparse
11
+
12
def extract_feature(data, sr, mfcc=True, chroma=True, mel=True):
    """Build a 1-D feature vector from an audio signal.

    Each enabled feature is computed frame-by-frame with librosa and then
    averaged over time. The per-feature means are concatenated in the fixed
    order MFCC -> chroma -> mel, so the layout of the returned vector depends
    only on which flags are enabled.

    Args:
        data: Audio samples as an array (e.g. the first value returned by
            ``librosa.load``).
        sr: Sampling rate of ``data``.
        mfcc: Include the time-averaged MFCCs (``n_mfcc=40``).
        chroma: Include the time-averaged chroma features.
        mel: Include the time-averaged mel-spectrogram.

    Returns:
        A 1-D float64 ``numpy.ndarray``. It is empty when every feature flag
        is False.

    Raises:
        ValueError: If at least one feature is requested but ``data``
            contains no samples.
    """
    # Fail early with a clear message instead of letting librosa raise an
    # opaque error from deep inside its framing/padding code.
    if (mfcc or chroma or mel) and np.size(data) == 0:
        raise ValueError("Cannot extract features from empty audio data")

    # Seeding with an empty float64 array keeps the result dtype (float64) and
    # the "all flags off -> empty array" behaviour of the original
    # incremental np.hstack calls.
    parts = [np.array([])]

    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=data, sr=sr, n_mfcc=40).T, axis=0)
        parts.append(mfccs)

    if chroma:
        # NOTE(review): librosa documents ``S`` as a power spectrogram, while
        # this passes the magnitude. Left unchanged on purpose: altering it
        # would change the features fed to the already-trained model.
        # Confirm against the training code before touching it.
        stft = np.abs(librosa.stft(data))
        chroma_feat = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
        parts.append(chroma_feat)

    if mel:
        mel_feat = np.mean(librosa.feature.melspectrogram(y=data, sr=sr).T, axis=0)
        parts.append(mel_feat)

    return np.hstack(parts)
28
+
29
def predict_emotion(audio_path, model_path='trained_model.h5'):
    """Predict the emotion expressed in an audio file.

    The audio is loaded at 22050 Hz, converted to a feature vector with
    ``extract_feature`` (MFCC + chroma + mel), reshaped to
    ``(1, n_features, 1)`` and passed to the Keras model stored at
    ``model_path``.

    Args:
        audio_path: Path to the audio file to classify.
        model_path: Path to the saved Keras model.

    Returns:
        A tuple ``(emotion, emoji, scores)`` where ``emotion`` is the
        predicted label, ``emoji`` is its emoji, and ``scores`` is the model's
        output row for this sample (one score per class).

    Raises:
        ValueError: If the model does not output exactly one score per known
            emotion label.
    """
    # Load audio
    data, sr = librosa.load(audio_path, sr=22050)

    # Extract features and add the batch (axis 0) and channel (axis 2) axes.
    feature = extract_feature(data, sr, mfcc=True, chroma=True, mel=True)
    feature = np.expand_dims(feature, axis=0)
    feature = np.expand_dims(feature, axis=2)

    # Load model and predict
    model = load_model(model_path)
    prediction = model.predict(feature)

    # Emotion code -> label. Only the label names are used below.
    emotions = {
        '01': 'Neutral',
        '02': 'Calm',
        '03': 'Happy',
        '04': 'Sad',
        '05': 'Angry',
        '06': 'Fearful',
        '07': 'Disgust',
        '08': 'Surprised',
    }

    emojis = {
        'Neutral': '😐',
        'Calm': '😌',
        'Happy': '😊',
        'Sad': '😢',
        'Angry': '😠',
        'Fearful': '😨',
        'Disgust': '🤢',
        'Surprised': '😲',
    }

    # The previous implementation fitted a scikit-learn LabelEncoder on the
    # label names and called inverse_transform. LabelEncoder keeps its classes
    # sorted, so class index i meant "i-th label in alphabetical order", not
    # the order of the dict above. Sorting explicitly yields the same mapping
    # without fitting an encoder on every call.
    # NOTE(review): this assumes the training labels were encoded the same way
    # (alphabetically); the README usage example maps indices in dict order
    # (Neutral=0 ... Surprised=7) instead. Confirm against the training code.
    class_names = sorted(emotions.values())

    scores = prediction[0]
    if len(scores) != len(class_names):
        raise ValueError(
            f"Model returned {len(scores)} class scores, "
            f"expected {len(class_names)}"
        )

    predicted_emotion = class_names[int(np.argmax(scores))]

    return predicted_emotion, emojis[predicted_emotion], scores
74
+
75
def main(argv=None):
    """Run the command-line interface and return a process exit status.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        0 when a prediction was printed, 1 when prediction failed.
    """
    # Imported locally so this block does not depend on a module-level import.
    import sys

    parser = argparse.ArgumentParser(description='Predict emotion from audio file')
    parser.add_argument('audio_path', help='Path to audio file')
    parser.add_argument('--model', default='trained_model.h5', help='Path to model file')

    args = parser.parse_args(argv)

    try:
        emotion, emoji, confidence = predict_emotion(args.audio_path, args.model)
        print(f"Predicted Emotion: {emotion} {emoji}")
        print(f"Confidence scores: {confidence}")
    except Exception as e:
        # Top-level boundary: report the failure on stderr and signal it via
        # the exit status so shells and pipelines can detect the error
        # (previously the script printed to stdout and still exited with 0).
        print(f"Error: {e}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ librosa>=0.8.0
2
+ tensorflow>=2.8.0
3
+ numpy>=1.21.0
4
+ scikit-learn>=1.0.0
5
+ scipy>=1.7.0
trained_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90111a9b8a4107d0b9c247a5deaba1676000d8d1fcecdf8e9fc3d465f1a459e7
3
+ size 4281088