chrisjay committed on
Commit f6c0eb1 · 1 Parent(s): 761edb9

Update README.md

Files changed (1):
  1. README.md +13 -11
README.md CHANGED
@@ -66,20 +66,22 @@ with open('./FonAudio/pyFongbe-master/data/test.csv', newline='',encoding='UTF-8
 
 
 #Get valid indices
-random.seed(42) #this seed was used specifically to compare with Okwugbe model
+random.seed(42) #this seed was used specifically to compare
+# with Okwugbe model (https://arxiv.org/abs/2103.07762)
 
 
-v = 1500 #200 samples for valid. Change as you want
+v = 1500
 test_list = [i for i in range(len(t_data))]
 valid_indices = random.choices(test_list, k=v)
 
 test_data = [t_data[i] for i in range(len(t_data)) if i not in valid_indices]
 valid_data = [t_data[i] for i in range(len(t_data)) if i in valid_indices]
 
-#Length of validation_dataset -> 1107
-#Length of test_dataset -> 1061
+#Final length of validation_dataset -> 1107
+#Final length of test_dataset -> 1061
 
-#Please note, the final validation size is smaller than the expected (1500) because we used random.choices which could contain duplicates.
+#Please note, the final validation size is smaller than the
+#expected (1500) because we used random.choices which could contain duplicates.
 
 #Create JSON files
 def create_json_file(d):
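The added note is the key point of this hunk: random.choices draws the k indices with replacement, so 1500 draws produce fewer than 1500 unique indices, which is why the validation split ends up at 1107 items and the test split at 1061. Below is a minimal sketch of that effect; the total of 2168 rows is only inferred from those two reported sizes, not read from the CSV.

```python
import random

# Assumption: 1107 + 1061 = 2168 rows in t_data (inferred from the reported split sizes).
n_rows = 1107 + 1061
v = 1500

random.seed(42)
test_list = [i for i in range(n_rows)]
valid_indices = random.choices(test_list, k=v)  # sampled WITH replacement -> duplicates likely

print(len(valid_indices))       # 1500 draws
print(len(set(valid_indices)))  # noticeably fewer than 1500 unique indices

# random.sample would guarantee exactly v distinct indices instead:
print(len(set(random.sample(test_list, k=v))))  # 1500
```

As a side note, converting valid_indices to a set before the two list comprehensions would also make the membership checks much faster.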
@@ -135,7 +137,7 @@ for root, dirs, files in os.walk(test_path):
 test_dataset= load_dataset("json", data_files=[os.path.join(root,i) for i in files],split="train")
 
 #Remove unnecessary chars
-chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�]'
+chars_to_ignore_regex = '[\\\\,\\\\?\\\\.\\\\!\\\\-\\\\;\\\\:\\\\"\\\\“\\\\%\\\\‘\\\\”\\\\�]'
 def remove_special_characters(batch):
 batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
 return batch
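For reference, the function this hunk touches only strips the listed punctuation and lowercases each transcript; a small self-contained sketch, with an invented sample sentence and an illustrative character class in the spirit of the card's chars_to_ignore_regex:

```python
import re

# Illustrative character class; the card defines its own chars_to_ignore_regex.
chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�]'

def remove_special_characters(batch):
    # Drop the listed characters, lowercase, and keep a trailing space as a separator.
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    return batch

sample = {"sentence": "Kú àbò, é kpé wè!"}  # invented, Fon-like example
print(remove_special_characters(sample)["sentence"])  # -> "kú àbò é kpé wè "
```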
@@ -151,15 +153,15 @@ model = Wav2Vec2ForCTC.from_pretrained("chrisjay/wav2vec2-large-xlsr-53-fon")
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
-\tbatch["speech"]=speech_array.squeeze().numpy()
-\treturn batch
+\\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
+\\tbatch["speech"]=speech_array.squeeze().numpy()
+\\treturn batch
 
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
 
 with torch.no_grad():
-\tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+\\tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
 
 predicted_ids = torch.argmax(logits, dim=-1)
 
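The \t markers in the changed lines presumably stand in for the function-body indentation in the rendered card; written out as plain Python, and assuming the clips are already sampled at 16 kHz (matching the sampling_rate=16_000 passed to the processor), the preprocessing step is:

```python
import torchaudio

def speech_file_to_array_fn(batch):
    # Load the clip and store the raw waveform as a NumPy array.
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = speech_array.squeeze().numpy()
    return batch

# If the source audio were not 16 kHz, resampling would be needed first, e.g.:
# speech_array = torchaudio.transforms.Resample(sampling_rate, 16_000)(speech_array)
```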
@@ -182,7 +184,7 @@ import re
 for root, dirs, files in os.walk(test_path):
 test_dataset = load_dataset("json", data_files=[os.path.join(root,i) for i in files],split="train")
 
-chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�]'
+chars_to_ignore_regex = '[\\\\,\\\\?\\\\.\\\\!\\\\-\\\\;\\\\:\\\\"\\\\“\\\\%\\\\‘\\\\”\\\\�]'
 def remove_special_characters(batch):
 batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
 return batch
 
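Putting the hunks together, the card's evaluation flow is roughly the sketch below. The test_path value and the WER computation via load_metric follow the common XLSR evaluation template and are assumptions here; neither appears in this diff.

```python
import os
import re

import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

test_path = "./test_jsons"  # assumption: wherever create_json_file wrote the JSON files

processor = Wav2Vec2Processor.from_pretrained("chrisjay/wav2vec2-large-xlsr-53-fon")
model = Wav2Vec2ForCTC.from_pretrained("chrisjay/wav2vec2-large-xlsr-53-fon")
wer = load_metric("wer")

for root, dirs, files in os.walk(test_path):
    test_dataset = load_dataset("json", data_files=[os.path.join(root, i) for i in files], split="train")

chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�]'

def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    return batch

def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = speech_array.squeeze().numpy()
    return batch

test_dataset = test_dataset.map(remove_special_characters)
test_dataset = test_dataset.map(speech_file_to_array_fn)

def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
```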