chrisjay committed on
Commit
9469e8b
·
1 Parent(s): 509522b

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +7 -6
README.md CHANGED
@@ -1,4 +1,5 @@
1
  language: fon
 
2
  datasets:
3
  - [Fon Dataset](https://github.com/laleye/pyFongbe/tree/master/data)
4
  metrics:
@@ -130,7 +131,7 @@ for root, dirs, files in os.walk(test_path):
130
  test_dataset= load_dataset("json", data_files=[os.path.join(root,i) for i in files],split="train")
131
 
132
  #Remove unnecessary chars
133
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'
134
  def remove_special_characters(batch):
135
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
136
  return batch
@@ -146,15 +147,15 @@ model = Wav2Vec2ForCTC.from_pretrained("chrisjay/wav2vec2-large-xlsr-53-fon")
146
  # Preprocessing the datasets.
147
  # We need to read the audio files as arrays
148
  def speech_file_to_array_fn(batch):
149
- speech_array, sampling_rate = torchaudio.load(batch["path"])
150
- batch["speech"]=speech_array.squeeze().numpy()
151
- return batch
152
 
153
  test_dataset = test_dataset.map(speech_file_to_array_fn)
154
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
155
 
156
  with torch.no_grad():
157
- logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
158
 
159
  predicted_ids = torch.argmax(logits, dim=-1)
160
 
@@ -177,7 +178,7 @@ import re
177
  for root, dirs, files in os.walk(test_path):
178
  test_dataset = load_dataset("json", data_files=[os.path.join(root,i) for i in files],split="train")
179
 
180
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'
181
  def remove_special_characters(batch):
182
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
183
  return batch
 
1
  language: fon
2
+
3
  datasets:
4
  - [Fon Dataset](https://github.com/laleye/pyFongbe/tree/master/data)
5
  metrics:
 
131
  test_dataset= load_dataset("json", data_files=[os.path.join(root,i) for i in files],split="train")
132
 
133
  #Remove unnecessary chars
134
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�]'
135
  def remove_special_characters(batch):
136
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
137
  return batch
 
147
  # Preprocessing the datasets.
148
  # We need to read the audio files as arrays
149
  def speech_file_to_array_fn(batch):
150
+ \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
151
+ \tbatch["speech"]=speech_array.squeeze().numpy()
152
+ \treturn batch
153
 
154
  test_dataset = test_dataset.map(speech_file_to_array_fn)
155
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
156
 
157
  with torch.no_grad():
158
+ \tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
159
 
160
  predicted_ids = torch.argmax(logits, dim=-1)
161
 
 
178
  for root, dirs, files in os.walk(test_path):
179
  test_dataset = load_dataset("json", data_files=[os.path.join(root,i) for i in files],split="train")
180
 
181
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�]'
182
  def remove_special_characters(batch):
183
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
184
  return batch