---
license: apache-2.0
---

## This repository contains the model checkpoints for the paper *[Less is More for Synthetic Speech Detection in the Wild](https://arxiv.org/abs/2502.05674)*

The dataset can be downloaded from [here](https://huggingface.co/datasets/ash56/ShiftySpeech/tree/main).

## ⚙️ Usage

#### Install libraries

```bash
pip install datasets transformers librosa numpy scikit-learn
```

#### Load Model and Dataset

```python
from transformers import AutoConfig, AutoModel
import torch
import librosa
from datasets import load_dataset
import numpy as np

config = AutoConfig.from_pretrained("ash56/ssl-aasist", trust_remote_code=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained("ash56/ssl-aasist", config=config, trust_remote_code=True, force_download=True).to(device)
model.eval()

# Load the ShiftySpeech dataset (one vocoder/source-data pair as an example)
spoof_data = load_dataset("ash56/ShiftySpeech", data_files={"data": "Vocoders/apnet2/apnet2_aishell_flac.tar.gz"})["data"]
real_data = load_dataset("ash56/ShiftySpeech", data_files={"data": "real_data_flac/real_data_aishell_flac.tar.gz"})["data"]
```
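
Each `load_dataset` call above downloads and unpacks a full tarball. If disk space or startup time is a concern, the `datasets` library can usually stream such archives instead; a minimal sketch, assuming the tarballs in this repo stream cleanly (`streaming=True` is the only change from above):

```python
# Optional: stream samples one at a time instead of downloading the archive up front.
spoof_stream = load_dataset(
    "ash56/ShiftySpeech",
    data_files={"data": "Vocoders/apnet2/apnet2_aishell_flac.tar.gz"},
    streaming=True,  # returns an IterableDataset
)["data"]

# Samples arrive with the same fields, so the inference loop below works unchanged.
first = next(iter(spoof_stream))
print(first["__key__"], first["flac"]["sampling_rate"])
```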

#### Inference

For batch inference, first fix every clip to a constant length of 64,600 samples (about 4 seconds at 16 kHz):

```python
def pad(x, max_len=64600):
    """Truncate long clips to max_len; tile short clips until they reach it."""
    x_len = x.shape[0]
    if x_len >= max_len:
        return x[:max_len]
    # need to pad: repeat the clip enough times, then cut to max_len
    num_repeats = int(max_len / x_len) + 1
    padded_x = np.tile(x, (1, num_repeats))[:, :max_len][0]
    return padded_x
```
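
A quick check of the padding behavior on made-up arrays (not dataset samples): short inputs are tiled, long ones truncated, and the output length is always 64,600.

```python
short = np.arange(1000)    # shorter than max_len
long = np.zeros(100000)    # longer than max_len
assert pad(short).shape == (64600,)
assert pad(long).shape == (64600,)
assert (pad(short)[:1000] == short).all()  # tiling starts with the original clip
```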

Then score both splits, writing one `<utterance> <label> <score>` line per clip:

```python
output_file = "apnet2-aishell_scores.txt"

# append mode: remove the file between runs to avoid duplicate scores
with open(output_file, "a") as f:
    # get scores of spoof audios
    for sample in spoof_data:
        fname = sample["__key__"]
        audio = sample["flac"]["array"]
        sampling_rate = sample["flac"]["sampling_rate"]
        if sampling_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
        audio_padded = pad(audio, 64600)
        x_inp = torch.Tensor(audio_padded).unsqueeze(0).to(device)
        with torch.no_grad():
            batch_out = model(x_inp)
            batch_score = batch_out[:, 1].cpu().numpy().ravel()[0]
        f.write(f"{fname} spoof {batch_score}\n")

    # get scores of real audios
    for sample in real_data:
        fname = sample["__key__"]
        audio = sample["flac"]["array"]
        sampling_rate = sample["flac"]["sampling_rate"]
        if sampling_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
        audio_padded = pad(audio, 64600)
        x_inp = torch.Tensor(audio_padded).unsqueeze(0).to(device)
        with torch.no_grad():
            batch_out = model(x_inp)
            batch_score = batch_out[:, 1].cpu().numpy().ravel()[0]
        f.write(f"{fname} bonafide {batch_score}\n")

print(f"Scores saved in {output_file}")
```
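
The two loops differ only in the dataset and the label string. A hypothetical helper (`score_split` is not part of the released code) that folds them together:

```python
def score_split(dataset, label, f):
    """Score one split and write '<utterance> <label> <score>' lines to f."""
    for sample in dataset:
        audio = sample["flac"]["array"]
        sr = sample["flac"]["sampling_rate"]
        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        x_inp = torch.Tensor(pad(audio)).unsqueeze(0).to(device)
        with torch.no_grad():
            score = model(x_inp)[:, 1].cpu().numpy().ravel()[0]
        f.write(f"{sample['__key__']} {label} {score}\n")

with open(output_file, "a") as f:
    score_split(spoof_data, "spoof", f)
    score_split(real_data, "bonafide", f)
```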

#### Compute EER

```python
from sklearn.metrics import roc_auc_score

# helper functions to calculate EER
def compute_eer(target_scores, nontarget_scores):
    """Returns equal error rate (EER) and the corresponding threshold."""
    frr, far, thresholds = compute_det_curve(target_scores, nontarget_scores)
    abs_diffs = np.abs(frr - far)
    min_index = np.argmin(abs_diffs)
    eer = np.mean((frr[min_index], far[min_index]))
    return eer, thresholds[min_index], frr, far

def compute_det_curve(target_scores, nontarget_scores):
    n_scores = target_scores.size + nontarget_scores.size
    all_scores = np.concatenate((target_scores, nontarget_scores))
    labels = np.concatenate(
        (np.ones(target_scores.size), np.zeros(nontarget_scores.size)))

    # Sort labels based on scores
    indices = np.argsort(all_scores, kind='mergesort')
    labels = labels[indices]

    # Compute false rejection and false acceptance rates
    tar_trial_sums = np.cumsum(labels)
    nontarget_trial_sums = nontarget_scores.size - \
        (np.arange(1, n_scores + 1) - tar_trial_sums)

    # false rejection rates
    frr = np.concatenate(
        (np.atleast_1d(0), tar_trial_sums / target_scores.size))
    far = np.concatenate((np.atleast_1d(1), nontarget_trial_sums /
                          nontarget_scores.size))  # false acceptance rates
    # Thresholds are the sorted scores
    thresholds = np.concatenate(
        (np.atleast_1d(all_scores[indices[0]] - 0.001), all_scores[indices]))

    return frr, far, thresholds

# get EER
def calculate_EER(cm_scores_file, output_file, printout=True):
    # Load CM scores: one "<utterance> <label> <score>" line per clip
    cm_data = np.genfromtxt(cm_scores_file, dtype=str)
    cm_utt_id = cm_data[:, 0]
    cm_keys = cm_data[:, 1]
    cm_scores = cm_data[:, 2].astype(float)
    # Extract bona fide (real human) and spoof scores from the CM scores
    bona_cm = cm_scores[cm_keys == 'bonafide']
    spoof_cm = cm_scores[cm_keys == 'spoof']
    all_scores = np.concatenate([bona_cm, spoof_cm])
    all_true_labels = np.concatenate([np.ones_like(bona_cm), np.zeros_like(spoof_cm)])

    # partial AUC up to 5% false positive rate, computed for reference
    auc = roc_auc_score(all_true_labels, all_scores, max_fpr=0.05)
    eer_cm, eer_threshold, frr, far = compute_eer(bona_cm, spoof_cm)

    if printout:
        with open(output_file, "w") as f_res:
            f_res.write('\nCM SYSTEM\n')
            f_res.write('\tEER\t\t= {:8.9f} % '
                        '(Equal error rate for countermeasure)\n'.format(
                            eer_cm * 100))
    return eer_cm

eval_eer = calculate_EER(
    cm_scores_file=output_file, output_file="apnet2_aishell_eer.txt")
```
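
A quick sanity check of `compute_eer` on made-up scores (illustrative values only): perfectly separated scores give an EER of 0, and uninformative scores give an EER near 0.5.

```python
# bona fide scores all above spoof scores -> EER = 0
eer, _, _, _ = compute_eer(np.array([0.9, 0.8, 0.7]), np.array([0.1, 0.2, 0.3]))
print(eer)  # 0.0

# overlapping random scores -> EER close to 0.5
rng = np.random.default_rng(0)
eer, _, _, _ = compute_eer(rng.normal(size=1000), rng.normal(size=1000))
print(round(eer, 2))  # ~0.5
```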

If you find the dataset or this resource helpful for your research, please cite our work:

```bibtex
@misc{garg2025syntheticspeechdetectionwild,
      title={Less is More for Synthetic Speech Detection in the Wild},
      author={Ashi Garg and Zexin Cai and Henry Li Xinyuan and Leibny Paola García-Perera and Kevin Duh and Sanjeev Khudanpur and Matthew Wiesner and Nicholas Andrews},
      year={2025},
      eprint={2502.05674},
      archivePrefix={arXiv},
      primaryClass={eess.AS},
      url={https://arxiv.org/abs/2502.05674},
}
```