Update app.py
app.py CHANGED
@@ -3,51 +3,29 @@ import librosa
 import numpy as np
 import torch
 import logging
-from transformers import
+from transformers import Wav2Vec2ForSequenceClassification
 
 logging.basicConfig(level=logging.INFO)
 
 model_path = "./"
-model =
+model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path)
 
-def
-    y, sr = librosa.load(audio_path, sr=
-
-
-
-    mfcc = librosa.feature.mfcc(y=y_time_stretched, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
-    chroma = librosa.feature.chroma_stft(y=y_time_stretched, sr=sr, n_fft=n_fft, hop_length=hop_length)
-    mel = librosa.feature.melspectrogram(y=y_time_stretched, sr=sr, n_fft=n_fft, hop_length=hop_length)
-    contrast = librosa.feature.spectral_contrast(y=y_time_stretched, sr=sr, n_fft=n_fft, hop_length=hop_length)
-    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y_time_stretched), sr=sr)
-
-    features = np.concatenate((mfcc, chroma, mel, contrast, tonnetz), axis=0)
-    features_normalized = (features - np.mean(features, axis=1, keepdims=True)) / np.std(features, axis=1, keepdims=True)
-
-    if features_normalized.shape[1] > target_length:
-        features_normalized = features_normalized[:, :target_length]
-    else:
-        padding = target_length - features_normalized.shape[1]
-        features_normalized = np.pad(features_normalized, ((0, 0), (0, padding)), 'constant')
-
-    features_tensor = torch.tensor(features_normalized).float().unsqueeze(0)  # Add batch dimension
-    return features_tensor
+def preprocess_audio(audio_path, target_sr=16000):
+    y, sr = librosa.load(audio_path, sr=target_sr)
+    y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
+    return y
 
 def predict_voice(audio_file_path):
     try:
-
-
-
-        features_tensor = features_tensor.squeeze()  # Remove unnecessary dimensions
-        if features_tensor.shape[-1] < model.config.num_labels:  # Ensure sufficient length for model input
-            padding_size = model.config.num_labels - features_tensor.shape[-1]
-            features_tensor = torch.nn.functional.pad(features_tensor, (0, padding_size), "constant", 0)
+        audio_data = preprocess_audio(audio_file_path)
+        inputs = model.processor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)
+
         with torch.no_grad():
-            outputs = model(
+            outputs = model(**inputs)
 
         logits = outputs.logits
-        predicted_index = logits.argmax()
-        label = model.config.id2label[predicted_index
+        predicted_index = logits.argmax(dim=1).item()
+        label = model.config.id2label[predicted_index]
         confidence = torch.softmax(logits, dim=1).max().item() * 100
 
         result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
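Note on the updated predict_voice: Wav2Vec2ForSequenceClassification.from_pretrained returns only the model, so the new model.processor(...) call will raise an AttributeError at runtime; the feature extractor has to be loaded separately. Below is a minimal sketch of how the inference path could be wired instead, assuming the Space root also contains a preprocessor_config.json for the feature extractor; the use of Wav2Vec2FeatureExtractor here is an assumption, not part of the commit. Also note that librosa.load(path, sr=16000) already resamples, so the extra librosa.resample step in preprocess_audio is redundant.

# Sketch only: load the feature extractor next to the model instead of
# expecting a (nonexistent) `model.processor` attribute.
import librosa
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification

model_path = "./"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path)
# Assumes preprocessor_config.json is present in the same directory.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_path)

def predict_voice(audio_file_path):
    # librosa.load resamples to 16 kHz directly; no separate resample call needed.
    audio_data, _ = librosa.load(audio_file_path, sr=16000)
    inputs = feature_extractor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_index = logits.argmax(dim=-1).item()
    label = model.config.id2label[predicted_index]
    confidence = torch.softmax(logits, dim=-1).max().item() * 100
    return f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."

If no preprocessor_config.json was uploaded with the fine-tuned weights, loading the feature extractor from the original base checkpoint on the Hub is a common fallback.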