Spaces:
Sleeping
Sleeping
File size: 2,135 Bytes
8080438 7ded071 0f212cf 7ded071 0f212cf 7ded071 0f212cf 7ded071 0f212cf 7ded071 0f212cf 7ded071 0f212cf 7ded071 0f212cf 7ded071 0f212cf 7ded071 0f212cf e1edbee 7ded071 e1edbee 0f212cf e1edbee 7ded071 e1edbee 0f212cf e1edbee 7ded071 0f212cf 7ded071 55719c9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import gradio as gr
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, Wav2Vec2Processor, Wav2Vec2FeatureExtractor
from src.models import Wav2Vec2ForSpeechClassification
import librosa
import IPython.display as ipd
import numpy as np
import pandas as pd
import os
# Identifier handed to every from_pretrained() call below, so the config,
# feature extractor and model weights all come from the same checkpoint.
model_name_or_path = "andromeda01111/Malayalam_SA"
# Model configuration; its id2label mapping is used later to name each class.
config = AutoConfig.from_pretrained(model_name_or_path)
# Turns raw waveforms into the tensors the model consumes.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
# Sample rate taken from the feature extractor; module-level value available
# to the prediction code as the rate audio should be resampled to.
sampling_rate = feature_extractor.sampling_rate
# Classification model loaded once at import time and reused for every request.
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path)
def speech_file_to_array_fn(path, sampling_rate):
    """Load an audio file and return it as a mono waveform at ``sampling_rate``.

    Args:
        path: Location of the audio file; passed straight to ``torchaudio.load``.
        sampling_rate: Target sample rate in Hz for the returned waveform.

    Returns:
        A 1-D NumPy array containing the resampled samples.
    """
    speech_array, _sampling_rate = torchaudio.load(path)
    # torchaudio.load returns (channels, frames). Average the channels so a
    # stereo recording becomes one waveform instead of staying 2-D after the
    # squeeze below.
    if speech_array.dim() > 1 and speech_array.size(0) > 1:
        speech_array = speech_array.mean(dim=0, keepdim=True)
    # Name both frequencies explicitly: with a single positional argument only
    # orig_freq is set and the caller's target rate is silently ignored in
    # favour of the library default.
    resampler = torchaudio.transforms.Resample(
        orig_freq=_sampling_rate,
        new_freq=sampling_rate,
    )
    # Drop only the channel axis so even a one-sample clip stays 1-D.
    speech = resampler(speech_array).squeeze(0).numpy()
    return speech
def predict(path, sampling_rate=None):
    """Classify the emotion expressed in an audio file.

    Args:
        path: Path of the audio file to analyse.
        sampling_rate: Sample rate in Hz to resample the audio to before
            feature extraction. When omitted, the feature extractor's own
            ``sampling_rate`` is used, so callers that only have a file path
            (such as a Gradio callback) can call ``predict(path)``.

    Returns:
        A list with one dict per class, in class-index order. Each dict has
        an ``"Emotion"`` key (label from ``config.id2label``) and a
        ``"Score"`` key (softmax probability formatted as a percentage string).
    """
    if sampling_rate is None:
        sampling_rate = feature_extractor.sampling_rate

    speech = speech_file_to_array_fn(path, sampling_rate)
    features = feature_extractor(
        speech,
        sampling_rate=sampling_rate,
        return_tensors="pt",
        padding=True,
    )
    input_values = features.input_values
    # Some feature-extractor configs do not emit an attention mask; attribute
    # access would raise in that case, so fall back to None.
    # NOTE(review): assumes the model's forward accepts attention_mask=None,
    # as the stock Wav2Vec2 models do -- confirm against src.models.
    attention_mask = features.get("attention_mask")

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    # Softmax over the class axis, then take the first (only) item of the batch.
    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    output_emotion = [
        {"Emotion": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"}
        for i, score in enumerate(scores)
    ]
    return output_emotion
# Wrapper function for Gradio
def gradio_predict(audio):
    """Run emotion prediction for the Gradio UI.

    Args:
        audio: File path of the uploaded audio, as supplied by the
            ``gr.Audio(type="filepath")`` input; ``None`` when the user
            submits without providing audio.

    Returns:
        A list of ``"<emotion>: <score>"`` strings, one per entry returned by
        ``predict`` and therefore one per output textbox.

    Raises:
        gr.Error: If no audio was provided.
    """
    if audio is None:
        # Show a readable message in the UI instead of a loader traceback.
        raise gr.Error("Please upload an audio file first.")
    # predict() needs the target sample rate; use the module-level value taken
    # from the feature extractor.
    predictions = predict(audio, sampling_rate)
    return [f"{pred['Emotion']}: {pred['Score']}" for pred in predictions]


# Gradio interface
# One read-only textbox per class, in class-index order so the boxes line up
# with the order of the strings returned by gradio_predict.
emotions = [config.id2label[i] for i in range(len(config.id2label))]
outputs = [gr.Textbox(label=emotion, interactive=False) for emotion in emotions]
interface = gr.Interface(
    # Use the wrapper: it takes the single audio input Gradio passes and
    # returns one string per textbox, whereas predict() returns dicts.
    fn=gradio_predict,
    inputs=gr.Audio(label="Upload Audio", type="filepath"),
    outputs=outputs,
    title="Emotion Recognition",
    description="Upload an audio file to predict emotions and their corresponding percentages.",
)

# Launch the app
interface.launch()