import numpy as np
import torch
import librosa
import gradio as gr
from transformers import AutoModelForAudioClassification
import logging
logging.basicConfig(level=logging.INFO)
model_path = "./"
model = AutoModelForAudioClassification.from_pretrained(model_path)
def preprocess_audio(audio_path, sr=22050):
    audio, sr = librosa.load(audio_path, sr=sr)
    audio, _ = librosa.effects.trim(audio)
    return audio, sr
def extract_patches(S_DB, patch_size=16, patch_overlap=6):
    stride = patch_size - patch_overlap
    num_patches_time = (S_DB.shape[1] - patch_overlap) // stride
    num_patches_freq = (S_DB.shape[0] - patch_overlap) // stride
    patches = []
    for i in range(0, num_patches_freq * stride, stride):
        for j in range(0, num_patches_time * stride, stride):
            patch = S_DB[i:i+patch_size, j:j+patch_size]
            if patch.shape == (patch_size, patch_size):
                patches.append(patch.reshape(-1))
    return np.stack(patches) if patches else np.empty((0, patch_size*patch_size))
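
# Example of the resulting patch count (assumed numbers, for illustration only): a ~10 s clip at
# sr=22050 with hop_length=512 yields a spectrogram of roughly 128 x 431 bins, so with
# patch_size=16 and patch_overlap=6 the stride is 10 and extract_patches returns about
# ((128 - 6) // 10) * ((431 - 6) // 10) = 12 * 42 = 504 patches of 256 values each.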
def extract_features(audio, sr):
    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, hop_length=512, n_fft=2048)
    S_DB = librosa.power_to_db(S, ref=np.max)
    patches = extract_patches(S_DB)
    # Each patch is flattened to a 256-dim vector (16*16). In practice these vectors should be
    # projected to the model's embedding size (e.g. 768) by a learned linear layer; see the
    # illustrative PatchProjector sketch after this function.
    patches_tensor = torch.tensor(patches).float()
    if patches_tensor.nelement() == 0:  # Handle the case of no patches (e.g. very short audio)
        patch_embeddings_tensor = torch.empty(0, 768)
    else:
        patch_embeddings_tensor = patches_tensor  # Placeholder; replace with the actual learned projection
    return patch_embeddings_tensor.unsqueeze(0)  # Add a batch dimension for the model
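
# Illustrative sketch only: one way to realize the learned projection mentioned in
# extract_features. The class name PatchProjector and the 768-dim embedding size are
# assumptions and this layer is not wired into the pipeline above; a real implementation
# would train it together with the classifier.
import torch.nn as nn

class PatchProjector(nn.Module):
    """Hypothetical helper: projects flattened 16x16 patches (256 dims) to 768-dim embeddings."""

    def __init__(self, patch_dim=16 * 16, embed_dim=768):
        super().__init__()
        self.proj = nn.Linear(patch_dim, embed_dim)

    def forward(self, patches):
        # patches: (num_patches, patch_dim) -> (num_patches, embed_dim)
        return self.proj(patches)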
def predict_voice(audio_file_path):
    try:
        audio, sr = preprocess_audio(audio_file_path)
        features = extract_features(audio, sr)
        # If necessary, adjust the feature tensor to the model's expected input shape,
        # e.g. by reshaping or padding (a sketch of such a helper follows this function):
        # features = adjust_features_shape(features, expected_shape)
        with torch.no_grad():
            outputs = model(features)
        logits = outputs.logits
        predicted_index = logits.argmax()
        label = model.config.id2label[predicted_index.item()]
        confidence = torch.softmax(logits, dim=1).max().item() * 100
        result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
        logging.info("Prediction successful.")
    except Exception as e:
        result = f"Error during processing: {e}"
        logging.error(result)
    return result
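
# Illustrative sketch only: a possible implementation of the `adjust_features_shape` helper
# referenced (commented out) in predict_voice. The pad-or-truncate strategy and the meaning
# of `expected_shape` as (batch, num_patches, embed_dim) are assumptions; the actual model
# may expect a different layout.
def adjust_features_shape(features, expected_shape):
    batch, num_patches, dim = features.shape
    target_patches = expected_shape[1]
    if num_patches < target_patches:
        # Zero-pad along the patch dimension.
        pad = torch.zeros(batch, target_patches - num_patches, dim)
        features = torch.cat([features, pad], dim=1)
    else:
        # Truncate extra patches.
        features = features[:, :target_patches, :]
    return features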
iface = gr.Interface(
    fn=predict_voice,
    inputs=gr.Audio(label="Upload Audio File", type="filepath"),
    outputs=gr.Text(label="Prediction"),
    title="Voice Authenticity Detection",
    description="This system uses advanced audio processing to detect whether a voice is real or AI-generated. Upload an audio file to see the results."
)
iface.launch()