File size: 2,746 Bytes
9401b94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Speech Emotion Recognition Inference Script
"""

import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.preprocessing import LabelEncoder
import argparse

def extract_feature(data, sr, mfcc=True, chroma=True, mel=True):
    """
    Build a 1-D feature vector from an audio signal.

    Parameters
    ----------
    data : np.ndarray
        Mono audio time series.
    sr : int
        Sample rate of ``data``.
    mfcc, chroma, mel : bool
        Toggle inclusion of the corresponding feature family.

    Returns
    -------
    np.ndarray
        Concatenation of the per-frame means of each enabled feature
        family (40 MFCCs, 12 chroma bins, 128 mel bands by default).
    """
    parts = []
    if mfcc:
        # 40 MFCC coefficients, averaged across time frames
        coeffs = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=40)
        parts.append(np.mean(coeffs.T, axis=0))
    if chroma:
        # Chroma is computed from the magnitude spectrogram
        magnitude = np.abs(librosa.stft(data))
        chroma_frames = librosa.feature.chroma_stft(S=magnitude, sr=sr)
        parts.append(np.mean(chroma_frames.T, axis=0))
    if mel:
        mel_frames = librosa.feature.melspectrogram(y=data, sr=sr)
        parts.append(np.mean(mel_frames.T, axis=0))
    if not parts:
        # All families disabled: same empty float array the hstack form produced
        return np.array([])
    return np.concatenate(parts)

def predict_emotion(audio_path, model_path='trained_model.h5'):
    """
    Predict the emotion expressed in an audio file.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to classify.
    model_path : str
        Path to the trained Keras model (HDF5), default 'trained_model.h5'.

    Returns
    -------
    tuple
        (emotion name, matching emoji, per-class confidence scores).
    """
    # Load audio at the sample rate the features were trained with
    data, sr = librosa.load(audio_path, sr=22050)

    # Extract features and shape as (batch=1, n_features, channels=1)
    feature = extract_feature(data, sr, mfcc=True, chroma=True, mel=True)
    feature = feature.reshape(1, -1, 1)

    # Load model and predict
    model = load_model(model_path)
    prediction = model.predict(feature)
    predicted_class = int(np.argmax(prediction, axis=1)[0])

    # RAVDESS-style emotion labels (original code keyed these '01'..'08',
    # but the keys were never used — only the values matter here)
    emotion_names = ['Neutral', 'Calm', 'Happy', 'Sad',
                     'Angry', 'Fearful', 'Disgust', 'Surprised']

    emojis = {
        'Neutral': '😐',
        'Calm': '😌',
        'Happy': '😊',
        'Sad': '😢',
        'Angry': '😠',
        'Fearful': '😨',
        'Disgust': '🤢',
        'Surprised': '😲'
    }

    # The model's class indices follow sklearn's LabelEncoder, which assigns
    # indices in sorted (alphabetical) label order. Sorting here reproduces
    # that mapping exactly without refitting an encoder on every call.
    class_labels = sorted(emotion_names)
    predicted_emotion = class_labels[predicted_class]

    return predicted_emotion, emojis[predicted_emotion], prediction[0]

if __name__ == "__main__":
    # Command-line entry point: classify one audio file and print the result.
    arg_parser = argparse.ArgumentParser(
        description='Predict emotion from audio file')
    arg_parser.add_argument('audio_path', help='Path to audio file')
    arg_parser.add_argument('--model', default='trained_model.h5',
                            help='Path to model file')
    cli_args = arg_parser.parse_args()

    try:
        emotion, emoji, confidence = predict_emotion(cli_args.audio_path,
                                                     cli_args.model)
    except Exception as e:
        # Surface any failure (bad path, unreadable audio, missing model)
        print(f"Error: {e}")
    else:
        print(f"Predicted Emotion: {emotion} {emoji}")
        print(f"Confidence scores: {confidence}")