Update
app.py CHANGED
@@ -3,63 +3,65 @@ import librosa
 import numpy as np
 import torch
 import logging
-from transformers import AutoModelForAudioClassification
 import soundfile as sf
+from transformers import AutoModelForAudioClassification
+from librosa.effects import pitch_shift, time_stretch
+from sklearn.preprocessing import StandardScaler  # needed for the normalization step below
 
-#
+# Enhanced logging for detailed debugging and information
 logging.basicConfig(level=logging.INFO)
 
-# Load the model
+# Load your model (ensure the model path is correct)
 model_path = "./"
 model = AutoModelForAudioClassification.from_pretrained(model_path)
 
-def augment_and_extract_features(audio_path,
+def augment_and_extract_features(audio_path, sr=16000, n_mfcc=40, pitch_shift_steps=4, stretch_rate=1.2):
+    """
+    Load, augment, and extract comprehensive features from an audio file.
+    Features: MFCC, chroma, mel spectrogram, spectral contrast, tonnetz.
+    Augmentations: pitch shift and time stretch for nuanced variability.
+    """
+    # Load audio
     y, sr = librosa.load(audio_path, sr=sr)
-    # Normalize features
-    features = (features - np.mean(features, axis=1, keepdims=True)) / np.std(features, axis=1, keepdims=True)
-
-    # Convert to tensor
-    features_tensor = torch.tensor(features).float().unsqueeze(0)  # Add batch dimension
+
+    # Audio augmentation
+    y_pitched = pitch_shift(y, sr=sr, n_steps=pitch_shift_steps)  # apply pitch shift
+    y_stretched = time_stretch(y_pitched, rate=stretch_rate)  # apply time stretch
+
+    # Feature extraction
+    mfcc = librosa.feature.mfcc(y=y_stretched, sr=sr, n_mfcc=n_mfcc)
+    chroma = librosa.feature.chroma_stft(y=y_stretched, sr=sr)
+    mel = librosa.feature.melspectrogram(y=y_stretched, sr=sr)
+    contrast = librosa.feature.spectral_contrast(y=y_stretched, sr=sr)
+    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y_stretched), sr=sr)
+
+    # Aggregate features, normalize, and convert to a batched tensor
+    features = np.concatenate([mfcc, chroma, mel, contrast, tonnetz], axis=0)
+    features_normalized = StandardScaler().fit_transform(features.T).T  # normalize per feature row
+    features_tensor = torch.tensor(features_normalized).float().unsqueeze(0)  # add batch dimension
+
     return features_tensor
 
 def predict_voice(audio_file_path):
+    """
+    Make a voice authenticity prediction based on augmented and extracted audio features.
+    """
     try:
         features_tensor = augment_and_extract_features(audio_file_path)
-
         with torch.no_grad():
             outputs = model(features_tensor)
-
         logits = outputs.logits
         predicted_index = logits.argmax()
         label = model.config.id2label[predicted_index.item()]
         confidence = torch.softmax(logits, dim=1).max().item() * 100
-        result
+        result = f"Prediction: {label} with {confidence:.2f}% confidence."
+        logging.info(result)
+        return result
     except Exception as e:
-        logging.error(
-        return result
+        error_msg = f"Error during processing: {e}"
+        logging.error(error_msg)
+        return error_msg
 
+# Gradio interface for easy and interactive model testing
 iface = gr.Interface(
     fn=predict_voice,
     inputs=gr.Audio(label="Upload Audio File", type="filepath"),
@@ -69,3 +71,4 @@ iface = gr.Interface(
 )
 
 iface.launch()
+
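
When debugging the Space's runtime error, a model-free sanity check of the new feature pipeline helps isolate failures in extraction from failures in the model. The sketch below is an assumption-laden reproduction of the steps in `augment_and_extract_features` (synthetic noise stands in for a real recording; the 2-second duration is arbitrary); the row counts in the comments follow librosa's defaults, so the stacked matrix has 40 + 12 + 128 + 7 + 6 = 193 feature rows, which the classifier's input layer must accept.

# Sketch: model-free check of the feature stack; the synthetic signal is a placeholder.
import numpy as np
import librosa
from librosa.effects import pitch_shift, time_stretch
from sklearn.preprocessing import StandardScaler

sr = 16000
y = np.random.randn(sr * 2).astype(np.float32)  # 2 s of noise standing in for a real recording

# Same augmentations as app.py
y_aug = time_stretch(pitch_shift(y, sr=sr, n_steps=4), rate=1.2)

# Same feature stack as app.py; row counts follow librosa defaults
mfcc = librosa.feature.mfcc(y=y_aug, sr=sr, n_mfcc=40)                       # (40, T)
chroma = librosa.feature.chroma_stft(y=y_aug, sr=sr)                         # (12, T)
mel = librosa.feature.melspectrogram(y=y_aug, sr=sr)                         # (128, T)
contrast = librosa.feature.spectral_contrast(y=y_aug, sr=sr)                 # (7, T)
tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y_aug), sr=sr)  # (6, T)

features = np.concatenate([mfcc, chroma, mel, contrast, tonnetz], axis=0)
features = StandardScaler().fit_transform(features.T).T  # normalize per feature row

print(features.shape)  # (193, T): all extractors share the default hop length, so T matches

All five extractors share librosa's default hop length of 512 samples, which is why the frame counts line up and `np.concatenate` along axis 0 succeeds.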
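
Once the Space runs, the `predict_voice` endpoint can also be exercised programmatically. A minimal sketch with `gradio_client` (assuming a recent client version; the Space ID `user/space-name` and `sample.wav` are placeholders):

# Sketch: querying the running Space; Space ID and audio file are placeholders.
from gradio_client import Client, handle_file

client = Client("user/space-name")  # placeholder Space ID
result = client.predict(
    handle_file("sample.wav"),      # placeholder local audio file
    api_name="/predict",            # default endpoint name for a single gr.Interface
)
print(result)  # e.g. "Prediction: <label> with NN.NN% confidence."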