Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -5,63 +5,47 @@ import torch
 import logging
 import soundfile as sf
 from transformers import AutoModelForAudioClassification
-from librosa.effects import pitch_shift, time_stretch
 
-# Enhanced logging for detailed debugging and information
 logging.basicConfig(level=logging.INFO)
 
-# Load your model (Ensure the model path is correct)
 model_path = "./"
 model = AutoModelForAudioClassification.from_pretrained(model_path)
 
-def augment_and_extract_features(audio_path, sr=16000, n_mfcc=40,
-    """
-    Load, augment, and extract comprehensive features from an audio file.
-    Features: MFCC, Chroma, Mel Spectrogram, Spectral Contrast, Tonnetz.
-    Augmentations: Pitch Shift and Time Stretch for nuanced variability.
-    """
-    # Load audio
+def augment_and_extract_features(audio_path, sr=16000, n_mfcc=40, n_fft=2048, hop_length=512):
     y, sr = librosa.load(audio_path, sr=sr)
+    y_augmented = librosa.effects.pitch_shift(y, sr=sr, n_steps=4)
+    y_augmented = librosa.effects.time_stretch(y_augmented, rate=1.2)
 
-    chroma = librosa.feature.chroma_stft(y=y_stretched, sr=sr)
-    mel = librosa.feature.melspectrogram(y=y_stretched, sr=sr)
-    contrast = librosa.feature.spectral_contrast(y=y_stretched, sr=sr)
-    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y_stretched), sr=sr)
+    mfcc = librosa.feature.mfcc(y=y_augmented, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
+    chroma = librosa.feature.chroma_stft(y=y_augmented, sr=sr, n_fft=n_fft, hop_length=hop_length)
+    mel = librosa.feature.melspectrogram(y=y_augmented, sr=sr, n_fft=n_fft, hop_length=hop_length)
+    contrast = librosa.feature.spectral_contrast(y=y_augmented, sr=sr, n_fft=n_fft, hop_length=hop_length)
+    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y_augmented), sr=sr)
 
-    # Aggregate features
-    features = np.concatenate([mfcc, chroma, mel, contrast, tonnetz], axis=0)
-    features_normalized = StandardScaler().fit_transform(features.T).T  # Normalize
-    features_tensor = torch.tensor(features_normalized).float().unsqueeze(0)  # Tensor conversion
+    features = np.concatenate((mfcc, chroma, mel, contrast, tonnetz), axis=0)
+    features = (features - np.mean(features, axis=1, keepdims=True)) / np.std(features, axis=1, keepdims=True)
+
+    features_tensor = torch.tensor(features).float().unsqueeze(0)
     return features_tensor
 
 def predict_voice(audio_file_path):
-    """
-    Make a voice authenticity prediction based on augmented and extracted audio features.
-    """
     try:
         features_tensor = augment_and_extract_features(audio_file_path)
         with torch.no_grad():
             outputs = model(features_tensor)
+
         logits = outputs.logits
         predicted_index = logits.argmax()
         label = model.config.id2label[predicted_index.item()]
         confidence = torch.softmax(logits, dim=1).max().item() * 100
+
+        result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
+        logging.info("Prediction successful.")
     except Exception as e:
-        logging.error(
+        result = f"Error during processing: {e}"
+        logging.error(result)
+    return result
 
-# Gradio interface for easy and interactive model testing
 iface = gr.Interface(
     fn=predict_voice,
     inputs=gr.Audio(label="Upload Audio File", type="filepath"),
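
One detail of this commit that is easy to miss: the old StandardScaler normalisation is replaced by a plain NumPy expression, and the two compute the same thing, a per-feature z-score taken across frames with the population standard deviation. The sketch below is not part of the commit; it only checks that equivalence on a random stand-in matrix. The (193, 100) shape is an assumption based on librosa defaults (40 MFCC + 12 chroma + 128 mel + 7 contrast + 6 tonnetz rows); the real frame count depends on the clip length.

# Minimal equivalence check (not part of the commit). The 193 x 100 matrix is a
# hypothetical stand-in for the stacked feature block built in
# augment_and_extract_features.
import numpy as np
from sklearn.preprocessing import StandardScaler

features = np.random.rand(193, 100)  # (n_features, n_frames), assumed shape

# Old code: standardise each feature across frames via StandardScaler
old_norm = StandardScaler().fit_transform(features.T).T

# New code: the same per-row z-score written directly in NumPy
new_norm = (features - np.mean(features, axis=1, keepdims=True)) / np.std(features, axis=1, keepdims=True)

print(np.allclose(old_norm, new_norm))  # expected: True for rows with non-zero variance

The one corner case where the two differ is a constant feature row: StandardScaler special-cases a zero standard deviation and leaves the row at zero, while the NumPy expression divides by zero and yields NaN or inf for that row.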