Spaces:
Sleeping
Sleeping
Abid
committed on
Commit
•
ff0d701
1
Parent(s):
1c78367
eval edit
Browse files
eval.py
CHANGED
@@ -61,8 +61,6 @@ def normalize_text(text: str) -> str:
|
|
61 |
text = re.sub("['ّ]", '', text)
|
62 |
text = re.sub("['ٔ]", '', text)
|
63 |
text = re.sub("['ٰ]", '', text)
|
64 |
-
# batch["sentence"] = re.sub("[ء]", '', batch["sentence"])
|
65 |
-
# batch["sentence"] = re.sub("[آ]", 'ا', batch["sentence"])
|
66 |
text = re.sub("[ۂ]", 'ہ', text)
|
67 |
text = re.sub("[ي]", "ی",text)
|
68 |
text = re.sub("[ؤ]", "و", text)
|
@@ -74,15 +72,20 @@ def normalize_text(text: str) -> str:
|
|
74 |
# note that order is important here!
|
75 |
token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
|
76 |
|
|
|
77 |
for t in token_sequences_to_ignore:
|
78 |
text = " ".join(text.split(t))
|
79 |
|
80 |
return text
|
81 |
|
|
|
|
|
|
|
82 |
|
83 |
def main(args):
|
84 |
# load dataset
|
85 |
dataset = load_dataset(args.dataset, args.config,delimiter="\t",split=args.split, use_auth_token=True)
|
|
|
86 |
|
87 |
# for testing: only process the first ten examples as a test
|
88 |
# dataset = dataset.select(range(10))
|
@@ -92,7 +95,8 @@ def main(args):
|
|
92 |
sampling_rate = feature_extractor.sampling_rate
|
93 |
|
94 |
# resample audio
|
95 |
-
dataset = dataset.cast_column("
|
|
|
96 |
|
97 |
# load eval pipeline
|
98 |
if args.device is None:
|
@@ -102,7 +106,7 @@ def main(args):
|
|
102 |
# map function to decode audio
|
103 |
def map_to_pred(batch):
|
104 |
prediction = asr(
|
105 |
-
batch["
|
106 |
)
|
107 |
|
108 |
batch["prediction"] = prediction["text"]
|
|
|
61 |
text = re.sub("['ّ]", '', text)
|
62 |
text = re.sub("['ٔ]", '', text)
|
63 |
text = re.sub("['ٰ]", '', text)
|
|
|
|
|
64 |
text = re.sub("[ۂ]", 'ہ', text)
|
65 |
text = re.sub("[ي]", "ی",text)
|
66 |
text = re.sub("[ؤ]", "و", text)
|
|
|
72 |
# note that order is important here!
|
73 |
token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
|
74 |
|
75 |
+
|
76 |
for t in token_sequences_to_ignore:
|
77 |
text = " ".join(text.split(t))
|
78 |
|
79 |
return text
|
80 |
|
81 |
+
def path_adjust(batch, *, clips_dir="Data/ur/clips/"):
    """Prefix the ``"path"`` entry of *batch* with the clips directory.

    The value stored under ``batch["path"]`` is converted with ``str`` and
    the directory prefix is prepended to it. The mapping is modified in
    place and also returned.

    Args:
        batch: Mutable mapping that has a ``"path"`` key.
            NOTE(review): presumably a single dataset example handed over by
            a per-example mapping call -- confirm against the call site.
        clips_dir: Directory prefix to prepend. Keyword-only so that an extra
            positional argument from a caller cannot be mistaken for it.
            A missing trailing ``"/"`` is added; an empty string leaves the
            path without any prefix.

    Returns:
        The same *batch* object, with ``batch["path"]`` rewritten.

    Raises:
        KeyError: If *batch* has no ``"path"`` key.
    """
    prefix = clips_dir
    # Plain concatenation needs a separator between directory and file name;
    # an empty prefix is left alone so the path is not turned into "/name".
    if prefix and not prefix.endswith("/"):
        prefix += "/"
    batch["path"] = prefix + str(batch["path"])
    return batch
|
84 |
|
85 |
def main(args):
|
86 |
# load dataset
|
87 |
dataset = load_dataset(args.dataset, args.config,delimiter="\t",split=args.split, use_auth_token=True)
|
88 |
+
|
89 |
|
90 |
# for testing: only process the first ten examples as a test
|
91 |
# dataset = dataset.select(range(10))
|
|
|
95 |
sampling_rate = feature_extractor.sampling_rate
|
96 |
|
97 |
# resample audio
|
98 |
+
dataset = dataset.cast_column("path", path_adjust())
|
99 |
+
dataset = dataset.cast_column("path", Audio(sampling_rate=sampling_rate))
|
100 |
|
101 |
# load eval pipeline
|
102 |
if args.device is None:
|
|
|
106 |
# map function to decode audio
|
107 |
def map_to_pred(batch):
|
108 |
prediction = asr(
|
109 |
+
batch["path"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
|
110 |
)
|
111 |
|
112 |
batch["prediction"] = prediction["text"]
|