Mrkomiljon committed on
Commit
d331afd
·
verified ·
1 Parent(s): a7a281c

import torch
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import torchaudio

# Load the classifier and its matching processor from the same pretrained
# checkpoint identifier so the two always stay in sync.
_CHECKPOINT = "Mrkomiljon/voiceGUARD"
model = Wav2Vec2ForSequenceClassification.from_pretrained(_CHECKPOINT)
processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)

# Inference function
def classify_audio(audio_path, target_sample_rate=16000, max_duration_seconds=10):
    """Classify a single audio file and return the predicted class index.

    The audio is loaded, downmixed to mono, resampled to
    ``target_sample_rate`` if needed, and truncated or zero-padded to exactly
    ``max_duration_seconds`` before being passed to the module-level
    ``processor`` and ``model``.

    Args:
        audio_path: Path of the audio file, passed to ``torchaudio.load``.
        target_sample_rate: Sample rate (Hz) the waveform is converted to
            before inference.
        max_duration_seconds: Length of the fixed analysis window in seconds.
            Longer clips are truncated; shorter clips are zero-padded on the
            right.

    Returns:
        int: Index of the highest-scoring class in the model's logits.
    """
    # Load the waveform as a (channels, frames) tensor plus its native rate.
    waveform, sample_rate = torchaudio.load(audio_path)

    # Downmix multi-channel audio to mono. Without this, a stereo file stays
    # 2-D after squeezing, is handled as a batch of two clips, and `.item()`
    # below fails on the resulting two-element prediction.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sample_rate
        )
        waveform = resampler(waveform)

    # Force a fixed-length window: truncate long clips, zero-pad short ones.
    max_length = int(target_sample_rate * max_duration_seconds)
    num_frames = waveform.size(1)
    if num_frames > max_length:
        waveform = waveform[:, :max_length]
    elif num_frames < max_length:
        waveform = torch.nn.functional.pad(waveform, (0, max_length - num_frames))

    # Drop only the channel axis so the processor receives a 1-D signal.
    inputs = processor(
        waveform.squeeze(0).numpy(),
        sampling_rate=target_sample_rate,
        return_tensors="pt",
    )
    # Move every input tensor to the device the model lives on.
    inputs = {key: val.to(model.device) for key, val in inputs.items()}

    # Gradients are not needed for inference.
    with torch.no_grad():
        logits = model(**inputs).logits

    return torch.argmax(logits, dim=-1).item()

# Example usage. Guarded so that importing this module to reuse
# classify_audio() does not run inference on the placeholder path below.
if __name__ == "__main__":
    audio_path = "path_to_audio_file.wav"
    label = classify_audio(audio_path)
    print(f"Predicted Label: {label}")

Files changed (1) hide show
  1. README.md +43 -0
README.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ datasets:
4
+ - fixie-ai/librispeech_asr
5
+ language:
6
+ - en
7
+ base_model:
8
+ - facebook/wav2vec2-base
9
+ pipeline_tag: audio-classification
10
+ ---
11
+ # Voice Detection AI - Real vs AI Audio Classifier
12
+
13
+ ### **Model Overview**
14
+ This model is a fine-tuned Wav2Vec2-based audio classifier capable of distinguishing between **real human voices** and **AI-generated voices**. It has been trained on a dataset containing samples from various TTS models and real human audio recordings.
15
+
16
+ ---
17
+
18
+ ### **Model Details**
19
+ - **Architecture:** Wav2Vec2ForSequenceClassification
20
+ - **Fine-tuned on:** Custom dataset with real and AI-generated audio
21
+ - **Classes:**
22
+ 1. Real Human Voice
23
+ 2. AI-generated voice (e.g., MelGAN, DiffWave)
24
+ - **Input Requirements:**
25
+ - Audio format: `.wav`, `.mp3`, etc.
26
+ - Sample rate: 16kHz
27
+ - Max duration: 10 seconds (longer clips are truncated; shorter ones are zero-padded)
28
+
29
+ ---
30
+
31
+ ### **Performance**
32
+ - **Validation Accuracy:** 99.8%
33
+ - **Robustness:** Correctly classifies audio produced by multiple AI voice-generation models.
34
+ - **Limitations:** Struggles with certain unseen AI-generation models (e.g., ElevenLabs).
35
+
36
+ ---
37
+
38
+ ### **How to Use**
39
+
40
+ #### **1. Install Dependencies**
41
+ Make sure you have `transformers`, `torch`, and `torchaudio` installed:
42
+ ```bash
43
+ pip install transformers torch torchaudio