marinone94 committed on
Commit
ba980b2
·
1 Parent(s): bf11fb8

new training

Browse files
added_tokens.json CHANGED
@@ -1 +1 @@
1
- {"<s>": 35, "</s>": 36}
 
1
+ {"<s>": 33, "</s>": 34}
alphabet.json DELETED
@@ -1 +0,0 @@
1
- {"labels": [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\u00e4", "\u00e5", "\u00e9", "\u00f4", "\u00f6", "\u00fc", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
 
 
config.json CHANGED
@@ -6,7 +6,7 @@
6
  "add_adapter": false,
7
  "apply_spec_augment": true,
8
  "architectures": [
9
- "Wav2Vec2ForCTC"
10
  ],
11
  "attention_dropout": 0.0,
12
  "bos_token_id": 1,
@@ -84,7 +84,7 @@
84
  "num_hidden_layers": 24,
85
  "num_negatives": 100,
86
  "output_hidden_size": 1024,
87
- "pad_token_id": 34,
88
  "proj_codevector_dim": 768,
89
  "tdnn_dilation": [
90
  1,
@@ -107,9 +107,8 @@
107
  1,
108
  1
109
  ],
110
- "torch_dtype": "float32",
111
- "transformers_version": "4.16.0.dev0",
112
  "use_weighted_layer_sum": false,
113
- "vocab_size": 37,
114
  "xvector_output_dim": 512
115
  }
 
6
  "add_adapter": false,
7
  "apply_spec_augment": true,
8
  "architectures": [
9
+ "Wav2Vec2ForPreTraining"
10
  ],
11
  "attention_dropout": 0.0,
12
  "bos_token_id": 1,
 
84
  "num_hidden_layers": 24,
85
  "num_negatives": 100,
86
  "output_hidden_size": 1024,
87
+ "pad_token_id": 32,
88
  "proj_codevector_dim": 768,
89
  "tdnn_dilation": [
90
  1,
 
107
  1,
108
  1
109
  ],
110
+ "transformers_version": "4.17.0.dev0",
 
111
  "use_weighted_layer_sum": false,
112
+ "vocab_size": 35,
113
  "xvector_output_dim": 512
114
  }
run.sh CHANGED
@@ -4,7 +4,6 @@ python run_speech_recognition_ctc.py \
4
  --dataset_config_name="sv-SE,distant_channel" \
5
  --train_split_name="train+validation,train" \
6
  --eval_split_name="test,None" \
7
- --preprocessing_only \
8
  --output_dir="./" \
9
  --overwrite_output_dir \
10
  --num_train_epochs="5" \
@@ -17,7 +16,7 @@ python run_speech_recognition_ctc.py \
17
  --evaluation_strategy="epoch" \
18
  --save_strategy="epoch" \
19
  --text_column_name="sentence" \
20
- --chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – / \
21
  --logging_steps="100" \
22
  --layerdrop="0.0" \
23
  --activation_dropout="0.1" \
 
4
  --dataset_config_name="sv-SE,distant_channel" \
5
  --train_split_name="train+validation,train" \
6
  --eval_split_name="test,None" \
 
7
  --output_dir="./" \
8
  --overwrite_output_dir \
9
  --num_train_epochs="5" \
 
16
  --evaluation_strategy="epoch" \
17
  --save_strategy="epoch" \
18
  --text_column_name="sentence" \
19
+ --chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
20
  --logging_steps="100" \
21
  --layerdrop="0.0" \
22
  --activation_dropout="0.1" \
run_speech_recognition_ctc.py CHANGED
@@ -321,25 +321,20 @@ def create_vocabulary_from_data(
321
  pad_token: Optional[str] = None,
322
  ):
323
  # Given training and test labels create vocabulary
324
- def extract_all_chars(batch):
325
- all_text = " ".join(batch["target_text"])
326
- vocab = list(set(all_text))
327
- return {"vocab": [vocab], "all_text": [all_text]}
328
 
329
- vocabs = datasets.map(
330
- extract_all_chars,
331
- batched=True,
332
- batch_size=10000,
333
- keep_in_memory=False,
334
- remove_columns=[col for col in datasets["train"].column_names if col in datasets["eval"].column_names],
335
- )
336
-
337
- # take union of all unique characters in each dataset
338
- vocab_set = functools.reduce(
339
- lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
340
- )
341
 
342
- vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}
343
 
344
  # replace white space with delimiter token
345
  if word_delimiter_token is not None:
@@ -458,7 +453,7 @@ def main():
458
  )
459
  min_columns_train = common_cols(min_columns_train, new_dataset.column_names)
460
  else:
461
- logging.warning(f"{dataset_name} {dataset_config_name} as split is {train_split_name}")
462
 
463
  if data_args.audio_column_name not in raw_datasets["train"].column_names:
464
  raise ValueError(
@@ -512,7 +507,7 @@ def main():
512
  )
513
  min_columns_eval = common_cols(min_columns_eval, new_dataset.column_names)
514
  else:
515
- logging.warning(f"{dataset_name} {dataset_config_name} as split is {eval_split_name}")
516
 
517
  if data_args.max_eval_samples is not None:
518
  raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
@@ -536,9 +531,32 @@ def main():
536
 
537
  def remove_special_characters(batch):
538
  if chars_to_ignore_regex is not None:
539
- batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).replace("\\\\Punkt", "").replace("\\\\Komma", "").lower() + " "
 
 
 
 
 
 
 
 
 
 
 
 
540
  else:
541
- batch["target_text"] = batch[text_column_name].replace("\\\\Punkt", "").replace("\\\\Komma", "").lower() + " "
 
 
 
 
 
 
 
 
 
 
 
542
  return batch
543
 
544
  num_workers = data_args.preprocessing_num_workers
@@ -694,9 +712,16 @@ def main():
694
  return batch
695
 
696
  with training_args.main_process_first(desc="dataset map preprocessing"):
697
- vectorized_datasets = raw_datasets.map(
 
 
 
 
 
 
 
698
  prepare_dataset,
699
- remove_columns=next(iter(raw_datasets.values())).column_names,
700
  num_proc=num_workers,
701
  desc="preprocess datasets",
702
  )
 
321
  pad_token: Optional[str] = None,
322
  ):
323
  # Given training and test labels create vocabulary
324
+ def extract_all_chars(batch, vocab):
325
+ all_text = " ".join(batch)
326
+ return list(set(list(set(all_text)) + vocab))
 
327
 
328
+ batch_size = 10000
329
+ vocab = []
330
+ for i in range(0, datasets["train"].num_rows, 10000):
331
+ batch = datasets["train"].select(range(i, min(datasets["train"].num_rows, i+batch_size)))
332
+ vocab = extract_all_chars(batch["target_text"], vocab)
333
+ for i in range(0, datasets["eval"].num_rows, 10000):
334
+ batch = datasets["eval"].select(range(i, min(datasets["eval"].num_rows, i+batch_size)))
335
+ vocab = extract_all_chars(batch["target_text"], vocab)
 
 
 
 
336
 
337
+ vocab_dict = {v: k for k, v in enumerate(sorted(vocab))}
338
 
339
  # replace white space with delimiter token
340
  if word_delimiter_token is not None:
 
453
  )
454
  min_columns_train = common_cols(min_columns_train, new_dataset.column_names)
455
  else:
456
+ logging.warning(f"{dataset_name} {dataset_config_name} train not loaded as split is {train_split_name}")
457
 
458
  if data_args.audio_column_name not in raw_datasets["train"].column_names:
459
  raise ValueError(
 
507
  )
508
  min_columns_eval = common_cols(min_columns_eval, new_dataset.column_names)
509
  else:
510
+ logging.warning(f"{dataset_name} {dataset_config_name} eval not loaded as split is {eval_split_name}")
511
 
512
  if data_args.max_eval_samples is not None:
513
  raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
 
531
 
532
  def remove_special_characters(batch):
533
  if chars_to_ignore_regex is not None:
534
+ batch["target_text"] = \
535
+ re.sub(chars_to_ignore_regex, "", batch[text_column_name]) \
536
+ .replace("\\\\Punkt", "") \
537
+ .replace("\\\\Komma", "") \
538
+ .replace("è", "e") \
539
+ .replace("é", "e") \
540
+ .replace("î", "i") \
541
+ .replace("ü", "u") \
542
+ .replace("ÿ", "y") \
543
+ .replace("\\", "") \
544
+ .replace("/", "") \
545
+ .replace("|", "") \
546
+ .lower() + " "
547
  else:
548
+ batch["target_text"] = batch[text_column_name] \
549
+ .replace("\\\\Punkt", "") \
550
+ .replace("\\\\Komma", "") \
551
+ .replace("è", "e") \
552
+ .replace("é", "e") \
553
+ .replace("î", "i") \
554
+ .replace("ü", "u") \
555
+ .replace("ÿ", "y") \
556
+ .replace("\\", "") \
557
+ .replace("/", "") \
558
+ .replace("|", "") \
559
+ .lower() + " "
560
  return batch
561
 
562
  num_workers = data_args.preprocessing_num_workers
 
712
  return batch
713
 
714
  with training_args.main_process_first(desc="dataset map preprocessing"):
715
+ vectorized_datasets = DatasetDict()
716
+ vectorized_datasets["train"] = raw_datasets["train"].map(
717
+ prepare_dataset,
718
+ remove_columns=raw_datasets["train"].column_names,
719
+ num_proc=num_workers,
720
+ desc="preprocess datasets",
721
+ )
722
+ vectorized_datasets["eval"] = raw_datasets["eval"].map(
723
  prepare_dataset,
724
+ remove_columns=raw_datasets["eval"].column_names,
725
  num_proc=num_workers,
726
  desc="preprocess datasets",
727
  )
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6, "g": 7, "h": 8, "i": 9, "j": 10, "k": 11, "l": 12, "m": 13, "n": 14, "o": 15, "p": 16, "q": 17, "r": 18, "s": 19, "t": 20, "u": 21, "v": 22, "w": 23, "x": 24, "y": 25, "z": 26, "ä": 27, "å": 28, "ô": 29, "ö": 30, "|": 0, "[UNK]": 31, "[PAD]": 32}