---
license: apache-2.0
---

## This repository contains the model checkpoints for the paper *[Less is More for Synthetic Speech Detection in the Wild](https://arxiv.org/abs/2502.05674)*

The dataset can be downloaded from [here](https://huggingface.co/datasets/ash56/ShiftySpeech/tree/main).
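If you prefer to fetch the archives directly rather than going through `datasets`, a minimal sketch using `huggingface_hub` (the `allow_patterns` filter below is illustrative; adjust it to the files you need):

```python
# Sketch: download only part of the ShiftySpeech dataset snapshot.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="ash56/ShiftySpeech",
    repo_type="dataset",
    # Illustrative filter: just the archives used in the examples below.
    allow_patterns=["Vocoders/apnet2/*", "real_data_flac/*"],
)
print(local_dir)  # path to the downloaded snapshot
```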
## ⚙️ Usage

#### Install libraries
```bash
pip install datasets transformers librosa numpy scikit-learn
```

#### Load Model and Dataset
```python
from transformers import AutoConfig, AutoModel
import torch
import librosa
from datasets import load_dataset
import numpy as np

config = AutoConfig.from_pretrained("ash56/ssl-aasist", trust_remote_code=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained("ash56/ssl-aasist", config=config, trust_remote_code=True, force_download=True).to(device)

# Load the ShiftySpeech dataset
spoof_data = load_dataset("ash56/ShiftySpeech", data_files={"data": "Vocoders/apnet2/apnet2_aishell_flac.tar.gz"})["data"]
real_data = load_dataset("ash56/ShiftySpeech", data_files={"data": "real_data_flac/real_data_aishell_flac.tar.gz"})["data"]
model.eval()
```
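The `data_files` entries above download and extract full `.tar.gz` archives. If disk space is a concern, `datasets` also supports lazy iteration via `streaming=True`; a minimal sketch (same repo and files, untested here):

```python
# Sketch: stream samples instead of materializing the whole archive first.
from datasets import load_dataset

spoof_stream = load_dataset(
    "ash56/ShiftySpeech",
    data_files={"data": "Vocoders/apnet2/apnet2_aishell_flac.tar.gz"},
    streaming=True,
)["data"]

for sample in spoof_stream:
    print(sample["__key__"])  # samples arrive one at a time
    break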
#### Inference
For batch inference, first define a helper that crops or pads each waveform to a fixed length of 64,600 samples (about 4 seconds at 16 kHz):
```python
def pad(x, max_len=64600):
    x_len = x.shape[0]
    if x_len >= max_len:
        return x[:max_len]
    # shorter than max_len: tile the waveform until it fills max_len samples
    num_repeats = int(max_len / x_len) + 1
    padded_x = np.tile(x, (1, num_repeats))[:, :max_len][0]
    return padded_x
```
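For example, a short array is tiled out to the requested length (illustrative values):

```python
# Illustrative check of the padding behavior.
print(pad(np.arange(3), max_len=8))
# -> [0 1 2 0 1 2 0 1]  (the input repeated, then cropped to max_len)
```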

```python
output_file = "apnet2-aishell_scores.txt"

with open(output_file, "a") as f:
    # get scores of spoof audios
    for sample in spoof_data:
        fname = sample["__key__"]
        audio = sample["flac"]["array"]
        sampling_rate = sample["flac"]["sampling_rate"]
        if sampling_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
        audio_padded = pad(audio, 64600)
        x_inp = torch.Tensor(audio_padded).unsqueeze(0).to(device)
        with torch.no_grad():
            batch_out = model(x_inp)
        batch_score = batch_out[:, 1].cpu().numpy().ravel()[0]
        f.write(f"{fname} spoof {batch_score}\n")

    # get scores of real (bona fide) audios
    for sample in real_data:
        fname = sample["__key__"]
        audio = sample["flac"]["array"]
        sampling_rate = sample["flac"]["sampling_rate"]
        if sampling_rate != 16000:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
        audio_padded = pad(audio, 64600)
        x_inp = torch.Tensor(audio_padded).unsqueeze(0).to(device)
        with torch.no_grad():
            batch_out = model(x_inp)
        batch_score = batch_out[:, 1].cpu().numpy().ravel()[0]
        f.write(f"{fname} bonafide {batch_score}\n")

print(f"Scores saved in {output_file}")
```
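Each line of the score file has the form `<utterance_id> <label> <score>`: the label is the ground truth for the sample, and the score is the second column of the model output. In AASIST-style countermeasures the convention is that higher scores indicate bona fide speech, which is consistent with `compute_eer` below treating the bona fide scores as the target class.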

#### Compute EER

```python
from sklearn.metrics import roc_auc_score

# helper functions to calculate EER
def compute_eer(target_scores, nontarget_scores):
    """Returns the equal error rate (EER), the corresponding threshold,
    and the full FRR/FAR curves."""
    frr, far, thresholds = compute_det_curve(target_scores, nontarget_scores)
    abs_diffs = np.abs(frr - far)
    min_index = np.argmin(abs_diffs)
    eer = np.mean((frr[min_index], far[min_index]))
    return eer, thresholds[min_index], frr, far

def compute_det_curve(target_scores, nontarget_scores):
    n_scores = target_scores.size + nontarget_scores.size
    all_scores = np.concatenate((target_scores, nontarget_scores))
    labels = np.concatenate(
        (np.ones(target_scores.size), np.zeros(nontarget_scores.size)))

    # Sort labels based on scores
    indices = np.argsort(all_scores, kind='mergesort')
    labels = labels[indices]

    # Compute false rejection and false acceptance rates
    tar_trial_sums = np.cumsum(labels)
    nontarget_trial_sums = nontarget_scores.size - \
        (np.arange(1, n_scores + 1) - tar_trial_sums)

    # false rejection rates
    frr = np.concatenate(
        (np.atleast_1d(0), tar_trial_sums / target_scores.size))
    # false acceptance rates
    far = np.concatenate(
        (np.atleast_1d(1), nontarget_trial_sums / nontarget_scores.size))
    # Thresholds are the sorted scores
    thresholds = np.concatenate(
        (np.atleast_1d(all_scores[indices[0]] - 0.001), all_scores[indices]))

    return frr, far, thresholds

# get EER
def calculate_EER(cm_scores_file, output_file, printout=True):
    # Load CM scores
    cm_data = np.genfromtxt(cm_scores_file, dtype=str)
    cm_utt_id = cm_data[:, 0]
    cm_keys = cm_data[:, 1]
    cm_scores = cm_data[:, 2].astype(float)

    # Extract bona fide (real human) and spoof scores from the CM scores
    bona_cm = cm_scores[cm_keys == 'bonafide']
    spoof_cm = cm_scores[cm_keys == 'spoof']
    all_scores = np.concatenate([bona_cm, spoof_cm])
    all_true_labels = np.concatenate([np.ones_like(bona_cm), np.zeros_like(spoof_cm)])

    # partial AUC up to 5% FPR (computed for reference, not used below)
    auc = roc_auc_score(all_true_labels, all_scores, max_fpr=0.05)
    eer_cm, eer_threshold, frr, far = compute_eer(bona_cm, spoof_cm)

    if printout:
        with open(output_file, "w") as f_res:
            f_res.write('\nCM SYSTEM\n')
            f_res.write('\tEER\t\t= {:8.9f} % '
                        '(Equal error rate for countermeasure)\n'.format(
                            eer_cm * 100))

    return eer_cm * 100

eval_eer = calculate_EER(
    cm_scores_file=output_file, output_file="apnet2_aishell_eer.txt")
```
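The EER is the operating point where the false acceptance rate equals the false rejection rate; lower is better. As a quick sanity check of the helpers (synthetic scores, purely illustrative): well-separated score distributions should give a small EER, while indistinguishable ones give roughly 50%.

```python
# Illustrative sanity check with synthetic scores.
rng = np.random.default_rng(0)
bona = rng.normal(loc=2.0, scale=1.0, size=1000)    # higher scores for bona fide
spoof = rng.normal(loc=-2.0, scale=1.0, size=1000)  # lower scores for spoof
eer, threshold, _, _ = compute_eer(bona, spoof)
print(f"EER = {eer * 100:.2f}%")  # around 2-3% for these well-separated scores
```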
If you find the dataset or this resource helpful for your research, please cite our work:
```bibtex
@misc{garg2025syntheticspeechdetectionwild,
      title={Less is More for Synthetic Speech Detection in the Wild},
      author={Ashi Garg and Zexin Cai and Henry Li Xinyuan and Leibny Paola García-Perera and Kevin Duh and Sanjeev Khudanpur and Matthew Wiesner and Nicholas Andrews},
      year={2025},
      eprint={2502.05674},
      archivePrefix={arXiv},
      primaryClass={eess.AS},
      url={https://arxiv.org/abs/2502.05674},
}
```