Kabatubare committed on
Commit
1364a7f
·
verified ·
1 Parent(s): 9ff14b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -6
app.py CHANGED
@@ -1,23 +1,25 @@
1
  import gradio as gr
2
  from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
3
  import torch
4
- from torch.nn.functional import softmax
5
  import librosa
6
- import os
7
 
8
  local_model_path = "./"
9
  extractor = AutoFeatureExtractor.from_pretrained(local_model_path)
10
  model = AutoModelForAudioClassification.from_pretrained(local_model_path)
11
 
12
  def preprocess_audio(audio_file_path, target_sample_rate=16000):
 
13
  waveform, _ = librosa.load(audio_file_path, sr=target_sample_rate, mono=True)
 
 
14
  return waveform, target_sample_rate
15
 
16
  def predict_voice(audio_file_path):
17
  try:
18
- # In Hugging Face Spaces, uploaded files are temporarily stored in a way that's accessible
19
- # to the app, so there's no need for a strict path check here.
20
  waveform, sample_rate = preprocess_audio(audio_file_path)
 
 
21
  inputs = extractor(waveform, return_tensors="pt", sampling_rate=sample_rate)
22
 
23
  with torch.no_grad():
@@ -26,11 +28,12 @@ def predict_voice(audio_file_path):
26
  logits = outputs.logits
27
  predicted_index = logits.argmax()
28
  label = model.config.id2label[predicted_index.item()]
29
- confidence = softmax(logits, dim=1).max().item() * 100
30
 
31
  result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
32
  except Exception as e:
33
- result = f"An error occurred during processing: {str(e)}"
 
34
 
35
  return result
36
 
@@ -43,3 +46,4 @@ iface = gr.Interface(
43
  )
44
 
45
  iface.launch()
 
 
1
  import gradio as gr
2
  from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
3
  import torch
 
4
  import librosa
5
+ import numpy as np
6
 
7
  local_model_path = "./"
8
  extractor = AutoFeatureExtractor.from_pretrained(local_model_path)
9
  model = AutoModelForAudioClassification.from_pretrained(local_model_path)
10
 
11
  def preprocess_audio(audio_file_path, target_sample_rate=16000):
12
+ # Load the audio file, ensuring mono conversion
13
  waveform, _ = librosa.load(audio_file_path, sr=target_sample_rate, mono=True)
14
+ # Normalizing waveform to be between -1 and 1
15
+ waveform = librosa.util.normalize(waveform)
16
  return waveform, target_sample_rate
17
 
18
  def predict_voice(audio_file_path):
19
  try:
 
 
20
  waveform, sample_rate = preprocess_audio(audio_file_path)
21
+ # Ensure waveform is a float32 array
22
+ waveform = waveform.astype(np.float32)
23
  inputs = extractor(waveform, return_tensors="pt", sampling_rate=sample_rate)
24
 
25
  with torch.no_grad():
 
28
  logits = outputs.logits
29
  predicted_index = logits.argmax()
30
  label = model.config.id2label[predicted_index.item()]
31
+ confidence = torch.softmax(logits, dim=1).max().item() * 100
32
 
33
  result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
34
  except Exception as e:
35
+ # Improved error handling for debugging
36
+ result = f"Error during processing: {e}"
37
 
38
  return result
39
 
 
46
  )
47
 
48
  iface.launch()
49
+