versae committed on
Commit
1567496
1 Parent(s): f9bf2c9

Adding filtering option

Browse files
Files changed (1) hide show
  1. eval.py +45 -1
eval.py CHANGED
@@ -1,3 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  #!/usr/bin/env python3
2
  import argparse
3
  import re
@@ -126,7 +161,12 @@ def normalize_text(original_text: str, dataset: str) -> str:
126
  def main(args):
127
  # load dataset
128
  dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
129
-
 
 
 
 
 
130
  # for testing: only process the first two examples as a test
131
  # dataset = dataset.select(range(10))
132
 
@@ -197,6 +237,9 @@ if __name__ == "__main__":
197
  parser.add_argument(
198
  "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
199
  )
 
 
 
200
  parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
201
  parser.add_argument(
202
  "--text_column", type=str, default="text", help="Column name containing the transcription."
@@ -222,3 +265,4 @@ if __name__ == "__main__":
222
  args = parser.parse_args()
223
 
224
  main(args)
 
 
1
+
2
+ Hugging Face's logo Hugging Face
3
+
4
+ Models
5
+ Datasets
6
+ Spaces
7
+ Docs
8
+ Pricing
9
+
10
+ NbAiLab
11
+ /
12
+ wav2vec2-1b-npsc-nst
13
+ Automatic Speech Recognition
14
+ PyTorch
15
+ Transformers
16
+ wav2vec2
17
+ generated_from_trainer
18
+ License: apache-2.0
19
+ Model card
20
+ Files and versions
21
+ Community
22
+ Settings
23
+ wav2vec2-1b-npsc-nst
24
+ / eval.py
25
+ versae's picture
26
+ versae
27
+ Update eval.py
28
+ f9bf2c9
29
+ less than a minute ago
30
+ raw
31
+ history
32
+ blame
33
+ edit
34
+ delete
35
+ 8.9 kB
36
  #!/usr/bin/env python3
37
  import argparse
38
  import re
 
161
  def main(args):
162
  # load dataset
163
  dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
164
+ if args.filter:
165
+ attribute, value = list(map(str.strip, args.filter.split(":")))
166
+ dataset = dataset.filter(
167
+ lambda x: x[attribute] == value,
168
+ desc=f"Filtering on {args.filter}",
169
+ )
170
  # for testing: only process the first two examples as a test
171
  # dataset = dataset.select(range(10))
172
 
 
237
  parser.add_argument(
238
  "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
239
  )
240
+ parser.add_argument(
241
+ "--filter", type=str, default="", help="Simple filter on attributes. *E.g.* `region_of_youth:Troms` would only keep those samples for which the condition is met"
242
+ )
243
  parser.add_argument("--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`")
244
  parser.add_argument(
245
  "--text_column", type=str, default="text", help="Column name containing the transcription."
 
265
  args = parser.parse_args()
266
 
267
  main(args)
268
+