patrickvonplaten committed on
Commit
aaab379
·
1 Parent(s): 698a3d1

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +40 -40
README.md CHANGED
@@ -23,7 +23,7 @@ model-index:
23
  metrics:
24
  - name: Test WER
25
  type: wer
26
- value: 12.90
27
  ---
28
 
29
  # Wav2Vec2-Large-XLSR-53-German
@@ -123,30 +123,30 @@ processor = Wav2Vec2Processor.from_pretrained("maxidl/wav2vec2-large-xlsr-german
123
  model = Wav2Vec2ForCTC.from_pretrained("maxidl/wav2vec2-large-xlsr-german")
124
  model.to("cuda")
125
 
126
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
127
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
128
 
129
  # Preprocessing the datasets.
130
  # We need to read the audio files as arrays
131
  def speech_file_to_array_fn(batch):
132
- batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
133
- speech_array, sampling_rate = torchaudio.load(batch["path"])
134
- batch["speech"] = resampler(speech_array).squeeze().numpy()
135
- return batch
136
 
137
  test_dataset = test_dataset.map(speech_file_to_array_fn)
138
 
139
  # Preprocessing the datasets.
140
  # We need to read the audio files as arrays
141
  def evaluate(batch):
142
- inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
143
 
144
- with torch.no_grad():
145
- logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
146
 
147
- pred_ids = torch.argmax(logits, dim=-1)
148
- batch["pred_strings"] = processor.batch_decode(pred_ids)
149
- return batch
150
 
151
  result = test_dataset.map(evaluate, batched=True, batch_size=8) # batch_size=8 -> requires ~14.5GB GPU memory
152
 
@@ -176,7 +176,7 @@ print("Total (chunk_size=1000), WER: {:2f}".format(100 * chunked_wer(result["pre
176
  # Total (chunk=1000), WER: 12.768981
177
  ```
178
 
179
- **Test Result**: WER: 12.90 %
180
 
181
 
182
  ## Training
@@ -187,32 +187,32 @@ The model was trained for 50k steps, taking around 30 hours on a single A100.
187
 
188
  The arguments used for training this model are:
189
  ```
190
- python run_finetuning.py \
191
- --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
192
- --dataset_config_name="de" \
193
- --output_dir=./wav2vec2-large-xlsr-german \
194
- --preprocessing_num_workers="16" \
195
- --overwrite_output_dir \
196
- --num_train_epochs="20" \
197
- --per_device_train_batch_size="64" \
198
- --per_device_eval_batch_size="32" \
199
- --learning_rate="1e-4" \
200
- --warmup_steps="500" \
201
- --evaluation_strategy="steps" \
202
- --save_steps="5000" \
203
- --eval_steps="5000" \
204
- --logging_steps="1000" \
205
- --save_total_limit="3" \
206
- --freeze_feature_extractor \
207
- --activation_dropout="0.055" \
208
- --attention_dropout="0.094" \
209
- --feat_proj_dropout="0.04" \
210
- --layerdrop="0.04" \
211
- --mask_time_prob="0.08" \
212
- --gradient_checkpointing="1" \
213
- --fp16 \
214
- --do_train \
215
- --do_eval \
216
- --dataloader_num_workers="16" \
217
  --group_by_length
218
  ```
 
23
  metrics:
24
  - name: Test WER
25
  type: wer
26
+ value: 12.77
27
  ---
28
 
29
  # Wav2Vec2-Large-XLSR-53-German
 
123
  model = Wav2Vec2ForCTC.from_pretrained("maxidl/wav2vec2-large-xlsr-german")
124
  model.to("cuda")
125
 
126
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“]'
127
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
128
 
129
  # Preprocessing the datasets.
130
  # We need to read the audio files as arrays
131
  def speech_file_to_array_fn(batch):
132
+ \tbatch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
133
+ \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
134
+ \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
135
+ \treturn batch
136
 
137
  test_dataset = test_dataset.map(speech_file_to_array_fn)
138
 
139
  # Preprocessing the datasets.
140
  # We need to read the audio files as arrays
141
  def evaluate(batch):
142
+ \tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
143
 
144
+ \twith torch.no_grad():
145
+ \t\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
146
 
147
+ \tpred_ids = torch.argmax(logits, dim=-1)
148
+ \tbatch["pred_strings"] = processor.batch_decode(pred_ids)
149
+ \treturn batch
150
 
151
  result = test_dataset.map(evaluate, batched=True, batch_size=8) # batch_size=8 -> requires ~14.5GB GPU memory
152
 
 
176
  # Total (chunk=1000), WER: 12.768981
177
  ```
178
 
179
+ **Test Result**: WER: 12.77 %
180
 
181
 
182
  ## Training
 
187
 
188
  The arguments used for training this model are:
189
  ```
190
+ python run_finetuning.py \\
191
+ --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \\
192
+ --dataset_config_name="de" \\
193
+ --output_dir=./wav2vec2-large-xlsr-german \\
194
+ --preprocessing_num_workers="16" \\
195
+ --overwrite_output_dir \\
196
+ --num_train_epochs="20" \\
197
+ --per_device_train_batch_size="64" \\
198
+ --per_device_eval_batch_size="32" \\
199
+ --learning_rate="1e-4" \\
200
+ --warmup_steps="500" \\
201
+ --evaluation_strategy="steps" \\
202
+ --save_steps="5000" \\
203
+ --eval_steps="5000" \\
204
+ --logging_steps="1000" \\
205
+ --save_total_limit="3" \\
206
+ --freeze_feature_extractor \\
207
+ --activation_dropout="0.055" \\
208
+ --attention_dropout="0.094" \\
209
+ --feat_proj_dropout="0.04" \\
210
+ --layerdrop="0.04" \\
211
+ --mask_time_prob="0.08" \\
212
+ --gradient_checkpointing="1" \\
213
+ --fp16 \\
214
+ --do_train \\
215
+ --do_eval \\
216
+ --dataloader_num_workers="16" \\
217
  --group_by_length
218
  ```