Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -2,58 +2,51 @@ import gradio as gr
 import librosa
 import numpy as np
 import torch
-import torch.nn.functional as F
 import logging
 from transformers import AutoModelForAudioClassification
+import soundfile as sf

 # Configure logging for debugging and information
 logging.basicConfig(level=logging.INFO)

-#
-
-model = AutoModelForAudioClassification.from_pretrained(
+# Load the model
+model_path = "./"
+model = AutoModelForAudioClassification.from_pretrained(model_path)

-def
-    ...
-    mel_tensor = mel_tensor.unsqueeze(0)  # Add batch dimension for compatibility with model
-    return mel_tensor
+def augment_and_extract_features(audio_path, output_path=None, sr=16000, n_mfcc=40, n_fft=2048, hop_length=512):
+    # Load and augment the audio file
+    y, sr = librosa.load(audio_path, sr=sr)
+    y_augmented = librosa.effects.pitch_shift(y, sr=sr, n_steps=4)  # Pitch shifting
+    y_augmented = librosa.effects.time_stretch(y_augmented, rate=1.2)  # Time stretching
+
+    # Save the augmented audio if an output path is provided
+    if output_path is not None:
+        sf.write(output_path, y_augmented, sr)
+
+    # Extract features
+    mfcc = librosa.feature.mfcc(y=y_augmented, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
+    chroma = librosa.feature.chroma_stft(y=y_augmented, sr=sr, n_fft=n_fft, hop_length=hop_length)
+    mel = librosa.feature.melspectrogram(y=y_augmented, sr=sr, n_fft=n_fft, hop_length=hop_length)
+    contrast = librosa.feature.spectral_contrast(y=y_augmented, sr=sr, n_fft=n_fft, hop_length=hop_length)
+    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y_augmented), sr=sr)
+
+    # Combine all features
+    features = np.concatenate((mfcc, chroma, mel, contrast, tonnetz), axis=0)
+
+    # Normalize features
+    features = (features - np.mean(features, axis=1, keepdims=True)) / np.std(features, axis=1, keepdims=True)
+
+    # Convert to tensor
+    features_tensor = torch.tensor(features).float().unsqueeze(0)  # Add batch dimension
+    return features_tensor

 def predict_voice(audio_file_path):
-    """
-    Predicts the audio class using a pre-trained model and custom feature extraction.
-    Args:
-        audio_file_path: Path to the audio file for prediction.
-    Returns:
-        A string containing the predicted class and confidence level.
-    """
     try:
-        ...
+        features_tensor = augment_and_extract_features(audio_file_path)
+
         with torch.no_grad():
-            outputs = model(
+            outputs = model(features_tensor)
+
         logits = outputs.logits
         predicted_index = logits.argmax()
         label = model.config.id2label[predicted_index.item()]
@@ -67,7 +60,6 @@ def predict_voice(audio_file_path):

     return result

-# Setting up the Gradio interface
 iface = gr.Interface(
     fn=predict_voice,
     inputs=gr.Audio(label="Upload Audio File", type="filepath"),
@@ -76,5 +68,4 @@ iface = gr.Interface(
     description="Detects whether a voice is real or AI-generated. Upload an audio file to see the results."
 )

-# Launching the interface
 iface.launch()
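A quick way to sanity-check the new extractor before wiring it to the model (a sketch; "sample.wav" is a placeholder, not a file in this repo): with the defaults above, the stacked features are 40 MFCC + 12 chroma + 128 mel + 7 spectral-contrast + 6 tonnetz = 193 rows, so the tensor should come out as (1, 193, n_frames).

    # Sketch: confirm the shape of the feature tensor from the new extractor.
    # "sample.wav" is a placeholder path; any mono audio file works.
    features = augment_and_extract_features("sample.wav")
    print(features.shape)  # expected: torch.Size([1, 193, n_frames])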
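The hunk elides the lines where `result` is assembled from `label` (old lines 60-66 are not shown). If the goal is the "confidence level" the removed docstring promised, one common construction, offered here purely as an assumption about the elided code, is to softmax the logits:

    # Assumed sketch (these lines are not part of the diff): derive a
    # confidence score from the logits and format the result string.
    probabilities = torch.softmax(logits, dim=-1)
    confidence = probabilities.max().item() * 100
    result = f"Prediction: {label} ({confidence:.1f}% confidence)"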
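Given the Space's "Runtime error" status, one thing worth verifying is that the checkpoint loaded from `model_path` actually accepts a stacked feature matrix: waveform checkpoints such as Wav2Vec2 expect raw samples of shape (batch, samples) rather than (1, 193, frames). A minimal check, using only names already in the diff:

    # Sketch: inspect what kind of input the loaded checkpoint expects.
    print(model.config.model_type)  # e.g. "wav2vec2" checkpoints take raw waveforms
    print(model.config.id2label)    # the label map used by predict_voice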
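It can also help to exercise `predict_voice` directly before `iface.launch()`, so any failure surfaces in the console instead of the Gradio UI (again, "sample.wav" is a placeholder):

    # Sketch: run one prediction headlessly before launching the interface.
    print(predict_voice("sample.wav"))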