import gradio as gr
import torch
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, Wav2Vec2FeatureExtractor
from src.models import Wav2Vec2ForSpeechClassification

# Load the fine-tuned Malayalam sentiment/emotion model and its feature extractor.
model_name_or_path = "andromeda01111/Malayalam_SA"
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
sampling_rate = feature_extractor.sampling_rate
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path)
model.eval()  # inference only; disables dropout


def speech_file_to_array_fn(path, sampling_rate):
    speech_array, _sampling_rate = torchaudio.load(path)
    # Downmix multi-channel audio to mono so the result is a 1-D waveform.
    speech_array = speech_array.mean(dim=0)
    # Resample from the file's native rate to the rate the model expects.
    resampler = torchaudio.transforms.Resample(orig_freq=_sampling_rate, new_freq=sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech


def predict(path, sampling_rate):
    speech = speech_file_to_array_fn(path, sampling_rate)
    features = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    input_values = features.input_values
    attention_mask = features.attention_mask

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    # Convert logits to per-class probabilities and report them as percentages.
    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    output_emotion = [{"Emotion": config.id2label[i], "Score": f"{score * 100:.1f}%"} for i, score in enumerate(scores)]

    return output_emotion


# Wrapper function for Gradio: one "Emotion: Score" string per output textbox.
def gradio_predict(audio):
    predictions = predict(audio, sampling_rate)
    return [f"{pred['Emotion']}: {pred['Score']}" for pred in predictions]


# Gradio interface
emotions = [config.id2label[i] for i in range(len(config.id2label))]
outputs = [gr.Textbox(label=emotion, interactive=False) for emotion in emotions]

interface = gr.Interface(
    fn=gradio_predict,
    inputs=gr.Audio(label="Upload Audio", type="filepath"),
    outputs=outputs,
    title="Emotion Recognition",
    description="Upload an audio file to predict emotions and their corresponding percentages.",
)
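
# A quick way to sanity-check the model without the UI (hypothetical file name;
# any audio file readable by torchaudio should work):
#   print(predict("sample_clip.wav", sampling_rate))
# This returns one {"Emotion": ..., "Score": ...} dict per label in config.id2label.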

# Launch the app
interface.launch()