### **Inference**

```python
import torch
import torchaudio
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor

# Load model and processor
model = Wav2Vec2ForSequenceClassification.from_pretrained("Mrkomiljon/voiceGUARD")
processor = Wav2Vec2Processor.from_pretrained("Mrkomiljon/voiceGUARD")

# Inference function
def classify_audio(audio_path, target_sample_rate=16000):
    # Load and resample audio to the expected sample rate
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resampler(waveform)

    # Ensure audio is exactly 10 seconds long (truncate or pad)
    max_length = target_sample_rate * 10
    if waveform.size(1) > max_length:
        waveform = waveform[:, :max_length]
    elif waveform.size(1) < max_length:
        waveform = torch.nn.functional.pad(waveform, (0, max_length - waveform.size(1)))

    # Process input
    inputs = processor(waveform.squeeze().numpy(), sampling_rate=target_sample_rate, return_tensors="pt")
    inputs = {key: val.to(model.device) for key, val in inputs.items()}

    # Perform inference
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_label = torch.argmax(logits, dim=-1).item()
    return predicted_label

# Example usage
audio_path = "path_to_audio_file.wav"
label = classify_audio(audio_path)
print(f"Predicted Label: {label}")
```
---
license: mit
datasets:
- fixie-ai/librispeech_asr
language:
- en
base_model:
- facebook/wav2vec2-base
pipeline_tag: voice-activity-detection
---

# Voice Detection AI - Real vs AI Audio Classifier

### **Model Overview**
This model is a fine-tuned Wav2Vec2-based audio classifier capable of distinguishing between **real human voices** and **AI-generated voices**. It has been trained on a dataset containing samples from various TTS models and real human audio recordings.

---

### **Model Details**
- **Architecture:** Wav2Vec2ForSequenceClassification
- **Fine-tuned on:** Custom dataset with real and AI-generated audio
- **Classes:**
  1. Real Human Voice
  2. AI-generated (e.g., MelGAN, DiffWave, etc.)
- **Input Requirements** (see the preprocessing sketch below):
  - Audio format: `.wav`, `.mp3`, etc.
  - Sample rate: 16 kHz
  - Max duration: 10 seconds (longer clips are truncated, shorter ones are padded)
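
Recordings that do not already match these requirements (e.g., stereo or 44.1 kHz files) need to be downmixed and resampled before classification. Below is a minimal preprocessing sketch with `torchaudio`; the `prepare_clip` helper name is hypothetical and simply mirrors the mono, 16 kHz, 10-second convention described above:

```python
import torch
import torchaudio

def prepare_clip(audio_path, target_sample_rate=16000, max_seconds=10):
    """Hypothetical helper: load a clip, downmix to mono, resample to 16 kHz,
    and pad/truncate it to a fixed 10-second window."""
    waveform, sample_rate = torchaudio.load(audio_path)    # shape: (channels, samples)
    if waveform.size(0) > 1:                               # downmix multi-channel audio to mono
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != target_sample_rate:                  # resample to the expected 16 kHz
        waveform = torchaudio.transforms.Resample(sample_rate, target_sample_rate)(waveform)
    max_length = target_sample_rate * max_seconds          # enforce the fixed-length window
    if waveform.size(1) > max_length:
        waveform = waveform[:, :max_length]
    else:
        waveform = torch.nn.functional.pad(waveform, (0, max_length - waveform.size(1)))
    return waveform
```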

---

### **Performance**
- **Validation Accuracy:** 99.8%
- **Robustness:** Successfully classifies audio from multiple AI-generation models.
- **Limitations:** Struggles with certain unseen AI-generation models (e.g., ElevenLabs).

---

### **How to Use**

#### **1. Install Dependencies**
Make sure you have `transformers`, `torch`, and `torchaudio` installed:
```bash
pip install transformers torch torchaudio