Update app.py
app.py CHANGED
@@ -1,23 +1,25 @@
 import gradio as gr
 from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
 import torch
-from torch.nn.functional import softmax
 import librosa
-import
+import numpy as np
 
 local_model_path = "./"
 extractor = AutoFeatureExtractor.from_pretrained(local_model_path)
 model = AutoModelForAudioClassification.from_pretrained(local_model_path)
 
 def preprocess_audio(audio_file_path, target_sample_rate=16000):
+    # Load the audio file, ensuring mono conversion
     waveform, _ = librosa.load(audio_file_path, sr=target_sample_rate, mono=True)
+    # Normalizing waveform to be between -1 and 1
+    waveform = librosa.util.normalize(waveform)
     return waveform, target_sample_rate
 
 def predict_voice(audio_file_path):
     try:
-        # In Hugging Face Spaces, uploaded files are temporarily stored in a way that's accessible
-        # to the app, so there's no need for a strict path check here.
         waveform, sample_rate = preprocess_audio(audio_file_path)
+        # Ensure waveform is a float32 array
+        waveform = waveform.astype(np.float32)
         inputs = extractor(waveform, return_tensors="pt", sampling_rate=sample_rate)
 
         with torch.no_grad():
@@ -26,11 +28,12 @@ def predict_voice(audio_file_path):
         logits = outputs.logits
         predicted_index = logits.argmax()
         label = model.config.id2label[predicted_index.item()]
-        confidence = softmax(logits, dim=1).max().item() * 100
+        confidence = torch.softmax(logits, dim=1).max().item() * 100
 
         result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
     except Exception as e:
-
+        # Improved error handling for debugging
+        result = f"Error during processing: {e}"
 
     return result
 
@@ -43,3 +46,4 @@ iface = gr.Interface(
 )
 
 iface.launch()
+
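For reference, the confidence computation this commit switches to (dropping the removed `from torch.nn.functional import softmax` in favor of `torch.softmax`) can be checked in isolation. This is a minimal sketch, not part of the commit: the logits below are made-up values for a hypothetical two-class classifier, used only to illustrate the arithmetic.

import torch

# Made-up logits for one clip over two hypothetical classes (e.g. real vs. fake).
logits = torch.tensor([[2.0, 0.5]])

predicted_index = logits.argmax()  # index of the highest logit
# Softmax turns logits into probabilities; the max is the top-class probability.
confidence = torch.softmax(logits, dim=1).max().item() * 100

print(predicted_index.item(), f"{confidence:.2f}%")  # -> 0 81.76%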