import numpy as np
import torch
import librosa
import gradio as gr
from transformers import AutoModelForAudioClassification
import logging

logging.basicConfig(level=logging.INFO)

model_path = "./"
model = AutoModelForAudioClassification.from_pretrained(model_path)
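# Note: the current directory is assumed to contain the saved model weights and config,
# including the id2label mapping used in predict_voice below.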

def preprocess_audio(audio_path, sr=22050):
    # Load the recording at a fixed sample rate and trim leading/trailing silence.
    audio, sr = librosa.load(audio_path, sr=sr)
    audio, _ = librosa.effects.trim(audio)
    return audio, sr

def extract_patches(S_DB, patch_size=16, patch_overlap=6):
    # Slide a patch_size x patch_size window over the spectrogram with the given overlap
    # and flatten each patch into a vector.
    stride = patch_size - patch_overlap
    num_patches_freq = max(0, (S_DB.shape[0] - patch_size) // stride + 1)
    num_patches_time = max(0, (S_DB.shape[1] - patch_size) // stride + 1)

    patches = []
    for i in range(num_patches_freq):
        for j in range(num_patches_time):
            patch = S_DB[i*stride:i*stride + patch_size, j*stride:j*stride + patch_size]
            patches.append(patch.reshape(-1))
    return np.stack(patches) if patches else np.empty((0, patch_size*patch_size))

def extract_features(audio, sr):
    # Convert the waveform to a dB-scaled mel spectrogram, then cut it into flattened patches.
    S = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128, hop_length=512, n_fft=2048)
    S_DB = librosa.power_to_db(S, ref=np.max)
    patches = extract_patches(S_DB)
    
    # Each patch is a flattened vector of length 256 (16*16). A trained model would map these
    # patches to its embedding size (e.g. 768) with a learned linear projection; no such
    # projection is applied here, so the raw patch vectors are passed through as a placeholder.
    patches_tensor = torch.tensor(patches).float()
    if patches_tensor.nelement() == 0:  # Handle the case of no patches (e.g. very short audio)
        patch_embeddings_tensor = torch.empty(0, 768)
    else:
        patch_embeddings_tensor = patches_tensor  # Placeholder: replace with the actual learned projection
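        # A minimal sketch of such a projection (illustrative only: the layer below is
        # untrained, not part of the saved model, and 768 is an assumed embedding size):
        #   projection = torch.nn.Linear(256, 768)
        #   patch_embeddings_tensor = projection(patches_tensor)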
    
    return patch_embeddings_tensor.unsqueeze(0)  # Add batch dimension for compatibility with model

def predict_voice(audio_file_path):
    # End-to-end inference: load and trim the audio, extract patch features, classify, and report.
    try:
        audio, sr = preprocess_audio(audio_file_path)
        features = extract_features(audio, sr)
        
        # If necessary, adjust the feature tensor here to match the model's expected input
        # shape, for example by reshaping or padding:
        # features = adjust_features_shape(features, expected_shape)
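        # A minimal sketch under the assumption that the model expects a fixed number of
        # patches (max_patches below is illustrative, not read from the model config):
        #   import torch.nn.functional as F
        #   max_patches = 512
        #   pad = max_patches - features.shape[1]
        #   features = F.pad(features, (0, 0, 0, pad)) if pad > 0 else features[:, :max_patches]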
        
        with torch.no_grad():
            outputs = model(features)
            logits = outputs.logits
            predicted_index = logits.argmax()
            label = model.config.id2label[predicted_index.item()]
            confidence = torch.softmax(logits, dim=1).max().item() * 100
        
        result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
        logging.info("Prediction successful.")
    except Exception as e:
        result = f"Error during processing: {e}"
        logging.error(result)

    return result

iface = gr.Interface(
    fn=predict_voice,
    inputs=gr.Audio(label="Upload Audio File", type="filepath"),
    outputs=gr.Text(label="Prediction"),
    title="Voice Authenticity Detection",
    description="This system uses advanced audio processing to detect whether a voice is real or AI-generated. Upload an audio file to see the results."
)

iface.launch()
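# Note: calling iface.launch(share=True) instead would additionally create a temporary
# public link when running locally; the call above serves the app on the default local address.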