carlosdanielhernandezmena committed on
Commit 6f22a10
1 Parent(s): e782413

Updating the example code

Files changed (1)
  1. README.md +29 -30
README.md CHANGED
@@ -137,57 +137,56 @@ The fine-tuning process was perform during November (2022) in the servers of the
 import torch
 from transformers import Wav2Vec2Processor
 from transformers import Wav2Vec2ForCTC
+
 #Load the processor and model.
 MODEL_NAME="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-spanish-ep5-944h"
 processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
 model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
+
 #Load the dataset
 from datasets import load_dataset, load_metric, Audio
 ds=load_dataset("ciempiess/ciempiess_test", split="test")
-#Normalize the transcriptions
-import re
-chars_to_ignore_regex = '[\\,\\?\\.\\!\\\;\\:\\"\\“\\%\\‘\\”\\�\\)\\(\\*)]'
-def remove_special_characters(batch):
-    batch["normalized_text"] = re.sub(chars_to_ignore_regex, '', batch["normalized_text"]).lower()
-    return batch
-ds = ds.map(remove_special_characters)
+
 #Downsample to 16kHz
 ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
+
 #Process the dataset
 def prepare_dataset(batch):
     audio = batch["audio"]
     #Batched output is "un-batched" to ensure mapping is correct
     batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
     with processor.as_target_processor():
         batch["labels"] = processor(batch["normalized_text"]).input_ids
     return batch
 ds = ds.map(prepare_dataset, remove_columns=ds.column_names,num_proc=1)
+
 #Define the evaluation metric
 import numpy as np
 wer_metric = load_metric("wer")
 def compute_metrics(pred):
     pred_logits = pred.predictions
     pred_ids = np.argmax(pred_logits, axis=-1)
     pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
     pred_str = processor.batch_decode(pred_ids)
     #We do not want to group tokens when computing the metrics
     label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
     wer = wer_metric.compute(predictions=pred_str, references=label_str)
     return {"wer": wer}
+
 #Do the evaluation (with batch_size=1)
 model = model.to(torch.device("cuda"))
 def map_to_result(batch):
     with torch.no_grad():
         input_values = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)
         logits = model(input_values).logits
     pred_ids = torch.argmax(logits, dim=-1)
     batch["pred_str"] = processor.batch_decode(pred_ids)[0]
-    batch["normalized_text"] = processor.decode(batch["labels"], group_tokens=False)
+    batch["sentence"] = processor.decode(batch["labels"], group_tokens=False)
     return batch
 results = ds.map(map_to_result,remove_columns=ds.column_names)
-#Compute the overall WER now.
-print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["normalized_text"])))
 
+#Compute the overall WER now.
+print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["sentence"])))
 ```
 **Test Result**: 0.112
 # BibTeX entry and citation info
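
For a quick smoke test of the updated example, the sketch below decodes a single utterance from the same test split with the same checkpoint. It is a minimal sketch, not part of the commit above: it reuses only the calls already shown in the example (Wav2Vec2Processor, Wav2Vec2ForCTC, the Audio cast to 16 kHz) and runs on CPU, so the CUDA device assumed in the full evaluation is not required.

```python
# Minimal sketch: decode one utterance from the CIEMPIESS test split on CPU.
import torch
from datasets import load_dataset, Audio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

MODEL_NAME = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-spanish-ep5-944h"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

# Take one example and resample the audio to 16 kHz, the rate the model expects.
ds = load_dataset("ciempiess/ciempiess_test", split="test")
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
sample = ds[0]["audio"]

# Featurize, run the CTC model, and greedy-decode the most likely tokens.
inputs = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt")
with torch.no_grad():
    logits = model(inputs.input_values).logits
pred_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(pred_ids)[0])
```

Running the full test split, as in the README, amounts to applying the same steps through ds.map and then computing the WER over all predictions.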