Adding filtering option
eval.py CHANGED
@@ -126,7 +161,12 @@ def main(args):
 def main(args):
     # load dataset
     dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
-
+    if args.filter:
+        attribute, value = list(map(str.strip, args.filter.split(":")))
+        dataset = dataset.filter(
+            lambda x: x[attribute] == value,
+            desc=f"Filtering on {args.filter}",
+        )
     # for testing: only process the first two examples as a test
     # dataset = dataset.select(range(10))
 
@@ -197,6 +237,9 @@ if __name__ == "__main__":
     parser.add_argument(
         "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
     )
+    parser.add_argument(
+        "--filter", type=str, default="", help="Simple filter on attributes. *E.g.* `region_of_youth:Troms` would only keep those samples for which the condition is met"
+    )
     parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
     parser.add_argument(
         "--text_column", type=str, default="text", help="Column name containing the transcription."
@@ -222,3 +265,4 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     main(args)
+
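
The block added to main() parses the --filter value as an attribute:value pair and keeps only the matching rows. Below is a minimal sketch of that behaviour on a toy in-memory datasets.Dataset; the data and the filter_arg variable are made up for illustration, and the region_of_youth column is just the example from the --filter help text.

from datasets import Dataset

# Toy stand-in for the real dataset loaded by eval.py.
dataset = Dataset.from_dict({
    "text": ["hei", "god dag", "takk"],
    "region_of_youth": ["Troms", "Oslo", "Troms"],
})

filter_arg = "region_of_youth:Troms"  # what would be passed as --filter
attribute, value = list(map(str.strip, filter_arg.split(":")))
dataset = dataset.filter(
    lambda x: x[attribute] == value,   # keep rows whose attribute equals the target value
    desc=f"Filtering on {filter_arg}",
)

print(dataset["text"])  # ['hei', 'takk']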
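
On the command-line side, --filter defaults to the empty string, so the option is opt-in: when the flag is omitted, `if args.filter:` is falsy and the dataset is left untouched. A small sketch of that argparse behaviour, using only the standard library and the argument exactly as the diff adds it:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--filter", type=str, default="", help="Simple filter on attributes. *E.g.* `region_of_youth:Troms`"
)

# Flag given: the value splits into an attribute name and a target value.
args = parser.parse_args(["--filter", "region_of_youth:Troms"])
attribute, value = list(map(str.strip, args.filter.split(":")))
print(attribute, value)  # region_of_youth Troms

# Flag omitted: the default empty string is falsy, so no filtering happens.
args = parser.parse_args([])
print(bool(args.filter))  # False

The str.strip calls tolerate stray whitespace around the pair (e.g. "region_of_youth : Troms"), but a value containing a second ":" would split into three parts and fail to unpack, so the flag is intended for simple attribute:value conditions only.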