"""Streamlit app that ranks the emotions detected in an uploaded WAV speech clip."""

import librosa
import streamlit as st
import torch
from transformers import HubertForSequenceClassification, Wav2Vec2FeatureExtractor
from transformers import pipeline

# Hugging Face checkpoint used for the model, feature extractor and pipeline.
MODEL_ID = "superb/hubert-large-superb-er"

# Rate (Hz) every upload is resampled to before it is handed to the classifier.
TARGET_SAMPLE_RATE = 16_000


@st.cache_resource
def load_emotion_classifier(model_id=MODEL_ID):
    """Load the model, its feature extractor and the classification pipeline.

    Streamlit re-executes the whole script on every widget interaction, so
    the loading is wrapped in ``st.cache_resource`` to avoid re-reading the
    checkpoint on each rerun. The pipeline is built from the objects loaded
    here, so the weights are only loaded once instead of a second time by
    ``pipeline(model="<id>")``.

    Args:
        model_id: Hugging Face model identifier to load.

    Returns:
        A ``(model, feature_extractor, classifier)`` tuple.
    """
    loaded_model = HubertForSequenceClassification.from_pretrained(model_id)
    loaded_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
    audio_classifier = pipeline(
        "audio-classification",
        model=loaded_model,
        feature_extractor=loaded_extractor,
    )
    return loaded_model, loaded_extractor, audio_classifier


# Title of the app
st.title("Emotion Recognition from Speech")

# Upload audio file
uploaded_file = st.file_uploader("Choose an audio file...", type=["wav"])

# Load the model, feature extractor and pipeline (cached across reruns)
model, feature_extractor, classifier = load_emotion_classifier()

if uploaded_file is not None:
    # Display audio player
    st.audio(uploaded_file, format="audio/wav")

    # Rewind before decoding: the upload is a file-like stream and anything
    # that read it earlier leaves the position past the start.
    uploaded_file.seek(0)
    speech, _ = librosa.load(uploaded_file, sr=TARGET_SAMPLE_RATE, mono=True)

    if speech.size == 0:
        st.error("The uploaded file contains no audio samples.")
    else:
        # The pipeline accepts a filename, raw bytes, a numpy array or a dict,
        # not a file-like object, so the decoded waveform is passed instead of
        # the upload itself. A bare array is taken to be at the feature
        # extractor's sampling rate already, hence the resample above.
        results = classifier(speech, top_k=5)

        st.write("Top 5 Predicted Emotions:")
        for result in results:
            st.write(f"{result['label']}: {result['score']:.4f}")