andromeda01111 committed (verified)
Commit 0f212cf · 1 Parent(s): c6d010d

Update app.py

Files changed (1)
  1. app.py  +25 -40
app.py CHANGED
@@ -1,10 +1,16 @@
import gradio as gr
import torch
+ import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, Wav2Vec2Processor, Wav2Vec2FeatureExtractor
from src.models import Wav2Vec2ForSpeechClassification
+
+ import librosa
+ import IPython.display as ipd
import numpy as np
+ import pandas as pd
+ import os

model_name_or_path = "andromeda01111/Malayalam_SA"
config = AutoConfig.from_pretrained(model_name_or_path)
@@ -12,67 +18,46 @@ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
sampling_rate = feature_extractor.sampling_rate
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path)

- # def speech_file_to_array_fn(path, sampling_rate):
- #     speech_array, _sampling_rate = torchaudio.load(path)
- #     resampler = torchaudio.transforms.Resample(_sampling_rate, sampling_rate)
- #     speech = resampler(speech_array).squeeze().numpy()
- #     return speech
-
- def speech_file_to_array_fn(audio_path):
-     if audio_path is None:
-         return None  # Handle cases where no file is provided
-
-     try:
-         # Check if the input is a file path (upload) or direct audio data (recording)
-         if isinstance(audio_path, str):
-             speech_array, _sampling_rate = torchaudio.load(audio_path)
-         else:
-             # If it's recorded audio, Gradio provides it as a NumPy array
-             speech_array = torch.tensor(audio_path)
-             _sampling_rate = sampling_rate  # Use default sampling rate
-
-         # Resample to match model requirements
-         resampler = torchaudio.transforms.Resample(orig_freq=_sampling_rate, new_freq=sampling_rate)
-         speech = resampler(speech_array).squeeze().numpy()
-         return speech
-
-     except Exception as e:
-         print(f"Error processing audio: {e}")
-         return None
-
-
- def predict(audio_path):
-     speech = speech_file_to_array_fn(audio_path, sampling_rate)
+
+ def speech_file_to_array_fn(path, sampling_rate):
+     speech_array, _sampling_rate = torchaudio.load(path)
+     resampler = torchaudio.transforms.Resample(_sampling_rate)
+     speech = resampler(speech_array).squeeze().numpy()
+     return speech
+
+
+ def predict(path, sampling_rate):
+     speech = speech_file_to_array_fn(path, sampling_rate)
    features = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
-
+
    input_values = features.input_values
    attention_mask = features.attention_mask
-
+
    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits
-
+
    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
-     output_emotion = {config.id2label[i]: f"{round(score * 100, 3):.1f}%" for i, score in enumerate(scores)}
-
+     output_emotion = [{"Emotion": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in enumerate(scores)]
+
    return output_emotion

+
# Wrapper function for Gradio
def gradio_predict(audio):
    predictions = predict(audio)
    return [f"{pred['Emotion']}: {pred['Score']}" for pred in predictions]

+
# Gradio interface
emotions = [config.id2label[i] for i in range(len(config.id2label))]
outputs = [gr.Textbox(label=emotion, interactive=False) for emotion in emotions]

- # Gradio Interface with Audio Recording (max duration: 10 seconds)
interface = gr.Interface(
    fn=predict,
-     inputs=gr.Audio(type="filepath", label="Record or Upload Audio"),
+     inputs=gr.Audio(label="Upload Audio", type="filepath"),
    outputs=outputs,
    title="Emotion Recognition",
-     description="Record or upload an audio file (max 10 sec) to predict emotions and their corresponding percentages.",
-     live=False,
+     description="Upload an audio file to predict emotions and their corresponding percentages.",
)

# Launch the app
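
For reference on the updated helpers: torchaudio.transforms.Resample(_sampling_rate) sets only orig_freq, so new_freq falls back to its default of 16000 Hz, which matches the 16 kHz sampling_rate of the Wav2Vec2 feature extractor loaded here, and gr.Audio(type="filepath") hands fn a single file path. A minimal sketch of a one-argument entry point with the target rate written out explicitly; it assumes the module-level feature_extractor, model, config, and sampling_rate defined above, and the helper names load_speech and predict_from_path are illustrative, not from app.py:

    import torch
    import torch.nn.functional as F
    import torchaudio


    def load_speech(path, target_rate=16000):
        # Load the clip and resample it to the model's expected rate.
        speech_array, orig_rate = torchaudio.load(path)
        resampler = torchaudio.transforms.Resample(orig_freq=orig_rate, new_freq=target_rate)
        return resampler(speech_array).squeeze().numpy()


    def predict_from_path(path):
        # One-argument wrapper suited to gr.Interface(fn=..., inputs=gr.Audio(type="filepath")).
        speech = load_speech(path, target_rate=sampling_rate)
        features = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = model(features.input_values, attention_mask=features.attention_mask).logits
        scores = F.softmax(logits, dim=1)[0].tolist()
        # One formatted string per emotion, matching the list of Textbox outputs.
        return [f"{config.id2label[i]}: {score * 100:.1f}%" for i, score in enumerate(scores)]

A wrapper like predict_from_path could be passed as fn so that each returned string lines up with one of the per-emotion Textbox outputs.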