File size: 6,668 Bytes
287c11d 4085e10 287c11d 539ffe8 287c11d c429c87 287c11d c429c87 287c11d 5f634fc 287c11d 4ab68a5 287c11d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
---
license: apache-2.0
---
## This repository contains the model checkpoints related to the paper: *[Less is More for Synthetic Speech Detection in the Wild](https://arxiv.org/abs/2502.05674)*
The dataset can be downloaded from [here](https://huggingface.co/datasets/ash56/ShiftySpeech/tree/main).
Model architecture: [SSL-AASIST](https://arxiv.org/pdf/2202.12233)
**Note: The model is trained on audio samples generated with the HiFiGAN vocoder, using LJSpeech as the source dataset. Both real and spoof samples are taken from the [WaveFake](https://arxiv.org/abs/2111.02813) dataset.**
## ⚙️ Usage
#### Install libraries
```bash
conda create -n ssl-aasist python=3.10.14
conda activate ssl-aasist
pip install pip==23
pip install omegaconf==2.0.6 pyarrow==19.0
```
*Note: the pip version must be older than 24.1.*
```bash
pip install torch datasets transformers librosa numpy scikit-learn huggingface_hub
```
```bash
pip install git+https://github.com/facebookresearch/fairseq.git@920a548ca770fb1a951f7f4289b4d3a0c1bc226f
```
#### Load Model and Dataset
```python
from transformers import AutoConfig, AutoModel
import torch
import librosa
from datasets import load_dataset
import numpy as np
from torch import Tensor
from sklearn.metrics import roc_auc_score
# Hub repositories holding the checkpoint and the evaluation data.
model_repo = "ash56/ssl-aasist"
dataset_repo = "ash56/ShiftySpeech"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# trust_remote_code=True lets transformers run the model code hosted in the
# repository; force_download=True fetches the files again instead of reusing
# a cached copy.
config = AutoConfig.from_pretrained(model_repo, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_repo,
    config=config,
    trust_remote_code=True,
    force_download=True,
)
model = model.to(device)
# Switch to evaluation mode before scoring.
model.eval()

# Load the ShiftySpeech archives: one spoofed set and its real counterpart.
spoof_files = {"data": "Vocoders/apnet2/apnet2_aishell_flac.tar.gz"}
real_files = {"data": "real_data_flac/real_data_aishell_flac.tar.gz"}
spoof_data = load_dataset(dataset_repo, data_files=spoof_files)["data"]
real_data = load_dataset(dataset_repo, data_files=real_files)["data"]
```
#### Inference
For batch inference, first define a helper that crops or repeat-pads each waveform to a fixed length:
```python
def pad(x, max_len=64600):
    """Return ``x`` cropped or repeat-padded to exactly ``max_len`` samples.

    Args:
        x: 1-D waveform (a NumPy array or anything ``np.asarray`` accepts).
        max_len: Target number of samples. The default of 64600 is roughly
            four seconds of audio at 16 kHz.

    Returns:
        A 1-D array of length ``max_len``: the first ``max_len`` samples when
        ``x`` is long enough, otherwise ``x`` repeated end-to-end and trimmed.

    Raises:
        ValueError: If ``x`` is empty and padding is required, because there
            is nothing to repeat.
    """
    x = np.asarray(x)
    x_len = x.shape[0]
    if x_len >= max_len:
        return x[:max_len]
    if x_len == 0:
        raise ValueError("cannot pad an empty signal")
    # Repeat the clip often enough to cover max_len, then cut the overshoot.
    num_repeats = max_len // x_len + 1
    return np.tile(x, num_repeats)[:max_len]
```
Get CM scores:
```python
output_file = "apnet2-aishell_scores.txt"


def _write_scores(dataset, label, score_file):
    """Score every sample in ``dataset`` and write one line per utterance.

    Each line has the form ``<key> <label> <score>``.

    Args:
        dataset: Iterable of samples exposing ``"__key__"`` and a ``"flac"``
            entry with ``"array"`` and ``"sampling_rate"``.
        label: Ground-truth tag written next to the score
            (``"spoof"`` or ``"bonafide"``).
        score_file: Open, writable text file.
    """
    for sample in dataset:
        fname = sample["__key__"]
        audio = sample["flac"]["array"]
        sampling_rate = sample["flac"]["sampling_rate"]
        # Bring every clip to 16 kHz before it is fed to the model.
        if sampling_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
        # Fixed-length input: crop or repeat-pad to 64600 samples.
        audio_padded = pad(audio, 64600)
        x_inp = Tensor(audio_padded).unsqueeze(0).to(device)
        with torch.no_grad():
            batch_out = model(x_inp)
        # Column 1 of the output is used as the CM score
        # (presumably the bona fide class -- confirm against the model code).
        batch_score = batch_out[:, 1].cpu().numpy().ravel()[0]
        score_file.write(f"{fname} {label} {batch_score}\n")


# NOTE: the file is opened in append mode, so re-running this cell adds rows
# to an existing score file; delete it first for a clean evaluation.
with open(output_file, "a") as f:
    # get scores of spoof audios
    _write_scores(spoof_data, "spoof", f)
    # get scores of real audios
    _write_scores(real_data, "bonafide", f)
print(f"Scores saved in {output_file}")
```
#### Compute EER
```python
# helper functions to calculate EER
def compute_eer(target_scores, nontarget_scores):
    """Return the equal error rate (EER) together with its operating point.

    Args:
        target_scores: Scores of the target trials.
        nontarget_scores: Scores of the non-target trials.

    Returns:
        Tuple ``(eer, threshold, frr, far)``: the EER as a fraction, the
        threshold at which it is reached, and the full false-rejection and
        false-acceptance curves from ``compute_det_curve``.
    """
    frr, far, thresholds = compute_det_curve(target_scores, nontarget_scores)
    # On a discrete curve the two error rates rarely coincide exactly, so pick
    # the threshold where they are closest and average the two rates there.
    abs_diffs = np.abs(frr - far)
    min_index = np.argmin(abs_diffs)
    eer = np.mean((frr[min_index], far[min_index]))
    return eer, thresholds[min_index], frr, far
def compute_det_curve(target_scores, nontarget_scores):
    """Compute false-rejection and false-acceptance rates at every threshold.

    Args:
        target_scores: 1-D scores of the target trials.
        nontarget_scores: 1-D scores of the non-target trials.

    Returns:
        Tuple ``(frr, far, thresholds)`` of arrays, each one element longer
        than the total number of trials. The first entry is the point below
        every score (``frr = 0``, ``far = 1``); the remaining entries follow
        the scores in ascending order.

    Raises:
        ValueError: If either score set is empty, since the corresponding
            rate would be a division by zero.
    """
    target_scores = np.asarray(target_scores)
    nontarget_scores = np.asarray(nontarget_scores)
    if target_scores.size == 0 or nontarget_scores.size == 0:
        raise ValueError("target_scores and nontarget_scores must be non-empty")

    n_scores = target_scores.size + nontarget_scores.size
    all_scores = np.concatenate((target_scores, nontarget_scores))
    labels = np.concatenate(
        (np.ones(target_scores.size), np.zeros(nontarget_scores.size)))

    # Sort labels based on scores (mergesort is stable, so ties keep order)
    indices = np.argsort(all_scores, kind='mergesort')
    labels = labels[indices]

    # Running count of targets at or below each threshold, and of
    # non-targets strictly above it.
    tar_trial_sums = np.cumsum(labels)
    nontarget_trial_sums = nontarget_scores.size - \
        (np.arange(1, n_scores + 1) - tar_trial_sums)

    # false rejection rates
    frr = np.concatenate(
        (np.atleast_1d(0), tar_trial_sums / target_scores.size))
    # false acceptance rates
    far = np.concatenate(
        (np.atleast_1d(1), nontarget_trial_sums / nontarget_scores.size))
    # Thresholds are the sorted scores, preceded by one just below the minimum
    thresholds = np.concatenate(
        (np.atleast_1d(all_scores[indices[0]] - 0.001), all_scores[indices]))
    return frr, far, thresholds
# get EER
def calculate_EER(cm_scores_file,
                  output_file,
                  printout=True):
    """Compute the countermeasure (CM) equal error rate from a score file.

    Args:
        cm_scores_file: Path to a whitespace-separated text file with one
            ``<utterance-id> <label> <score>`` row per trial, where the label
            is ``bonafide`` or ``spoof``.
        output_file: Path the EER report is written to when ``printout`` is
            true. An existing file is overwritten.
        printout: Whether to write the report file.

    Returns:
        The EER in percent.

    Raises:
        ValueError: If the rows have fewer than three columns, or if the file
            contains no ``bonafide`` or no ``spoof`` trials.
    """
    # Load CM scores; atleast_2d keeps a one-row file from collapsing to 1-D.
    cm_data = np.atleast_2d(np.genfromtxt(cm_scores_file, dtype=str))
    if cm_data.shape[1] < 3:
        raise ValueError(
            "expected '<utterance-id> <label> <score>' rows in "
            f"{cm_scores_file}")
    cm_keys = cm_data[:, 1]
    cm_scores = cm_data[:, 2].astype(float)

    # Extract bona fide (real human) and spoof scores from the CM scores
    bona_cm = cm_scores[cm_keys == 'bonafide']
    spoof_cm = cm_scores[cm_keys == 'spoof']
    if bona_cm.size == 0 or spoof_cm.size == 0:
        raise ValueError(
            "both 'bonafide' and 'spoof' trials are required to compute EER")

    # Bona fide trials are passed as the target class.
    eer_cm = compute_eer(bona_cm, spoof_cm)[0]
    eer_percent = eer_cm * 100

    if printout:
        with open(output_file, "w") as f_res:
            f_res.write('\nCM SYSTEM\n')
            f_res.write('\tEER\t\t= {:8.9f} % '
                        '(Equal error rate for countermeasure)\n'.format(
                            eer_percent))
    return eer_percent
# Read the CM score file produced by the inference step and write the EER
# report next to it.
eval_eer = calculate_EER(
    cm_scores_file=output_file,
    output_file="apnet2_aishell_eer.txt",
)
```
If you find the dataset or this resource helpful for your research, please cite our work:
```bibtex
@misc{garg2025syntheticspeechdetectionwild,
title={Less is More for Synthetic Speech Detection in the Wild},
author={Ashi Garg and Zexin Cai and Henry Li Xinyuan and Leibny Paola García-Perera and Kevin Duh and Sanjeev Khudanpur and Matthew Wiesner and Nicholas Andrews},
year={2025},
eprint={2502.05674},
archivePrefix={arXiv},
primaryClass={eess.AS},
url={https://arxiv.org/abs/2502.05674},
}
```
|