import gradio as gr
import librosa
import numpy as np
import torch
import logging
from transformers import AutoModelForAudioClassification

logging.basicConfig(level=logging.INFO)

# Load the fine-tuned audio classification model from the repository root
model_path = "./"
model = AutoModelForAudioClassification.from_pretrained(model_path)
model.eval()  # inference mode: disables dropout and other training-only behaviour

def augment_and_extract_features(audio_path, sr=16000, n_mfcc=40, n_fft=2048, hop_length=512, target_length=512):
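    """Load an audio file, apply pitch-shift and time-stretch augmentation,
    extract MFCC, chroma, mel, spectral-contrast and tonnetz features,
    normalize them, and return a fixed-length tensor of shape
    (1, n_features, target_length) ready for the classifier."""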
    y, sr = librosa.load(audio_path, sr=sr)
    y_pitch_shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=4)
    y_time_stretched = librosa.effects.time_stretch(y_pitch_shifted, rate=1.2)

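    # Frame-level spectral features; each matrix has shape (n_bins, n_frames)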
    mfcc = librosa.feature.mfcc(y=y_time_stretched, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    chroma = librosa.feature.chroma_stft(y=y_time_stretched, sr=sr, n_fft=n_fft, hop_length=hop_length)
    mel = librosa.feature.melspectrogram(y=y_time_stretched, sr=sr, n_fft=n_fft, hop_length=hop_length)
    contrast = librosa.feature.spectral_contrast(y=y_time_stretched, sr=sr, n_fft=n_fft, hop_length=hop_length)
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y_time_stretched), sr=sr)

    features = np.concatenate((mfcc, chroma, mel, contrast, tonnetz), axis=0)
    # Per-row z-score normalization; the small epsilon guards against division by zero for constant rows
    features_normalized = (features - np.mean(features, axis=1, keepdims=True)) / (np.std(features, axis=1, keepdims=True) + 1e-8)

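    # Truncate or zero-pad along the time axis so every clip yields the same input width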
    if features_normalized.shape[1] > target_length:
        features_normalized = features_normalized[:, :target_length]
    else:
        padding = target_length - features_normalized.shape[1]
        features_normalized = np.pad(features_normalized, ((0, 0), (0, padding)), 'constant')

    features_tensor = torch.tensor(features_normalized).float().unsqueeze(0)  # Add batch dimension
    return features_tensor

def predict_voice(audio_file_path):
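    """Classify a single uploaded audio file and return a human-readable
    result string; any processing error is caught, logged, and reported."""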
    try:
        features_tensor = augment_and_extract_features(audio_file_path)
        with torch.no_grad():
            outputs = model(features_tensor)

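        # Pick the highest-scoring class and report its softmax probability as a confidence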
        logits = outputs.logits
        predicted_index = logits.argmax()
        label = model.config.id2label[predicted_index.item()]
        confidence = torch.softmax(logits, dim=1).max().item() * 100

        result = f"The voice is classified as '{label}' with a confidence of {confidence:.2f}%."
        logging.info("Prediction successful.")
    except Exception as e:
        result = f"Error during processing: {e}"
        logging.error(result)
    return result

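# Gradio interface: audio file upload in, classification text out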
iface = gr.Interface(
    fn=predict_voice,
    inputs=gr.Audio(label="Upload Audio File", type="filepath"),
    outputs=gr.Textbox(label="Prediction"),
    title="Voice Authenticity Detection",
    description="Detects whether a voice is real or AI-generated. Upload an audio file to see the results."
)

iface.launch()