AndrewMcDowell
committed on
Commit
·
db4ff8d
1
Parent(s):
b4be586
Training in progress, step 1000
Browse files- .ipynb_checkpoints/mozilla-foundation_common_voice_8_0_ja_test_eval_results-checkpoint.txt +2 -0
- .ipynb_checkpoints/run_speech_recognition_ctc_bnb-checkpoint.py +13 -3
- .ipynb_checkpoints/speech_training_notebook-checkpoint.ipynb +85 -30
- added_tokens.json +1 -1
- config.json +2 -2
- mozilla-foundation_common_voice_8_0_ja_test_eval_results.txt +2 -2
- pytorch_model.bin +2 -2
- run_speech_recognition_ctc_bnb.py +13 -3
- special_tokens_map.json +1 -1
- speech_training_notebook.ipynb +197 -41
- training_args.bin +1 -1
- vocab.json +1 -1
.ipynb_checkpoints/mozilla-foundation_common_voice_8_0_ja_test_eval_results-checkpoint.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
WER: 0.9490658362989324
|
2 |
+
CER: 0.233251654006371
|
.ipynb_checkpoints/run_speech_recognition_ctc_bnb-checkpoint.py
CHANGED
@@ -358,6 +358,8 @@ def main():
|
|
358 |
else:
|
359 |
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
360 |
|
|
|
|
|
361 |
# Detecting last checkpoint.
|
362 |
last_checkpoint = None
|
363 |
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
|
@@ -432,7 +434,12 @@ def main():
|
|
432 |
|
433 |
if data_args.max_eval_samples is not None:
|
434 |
raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
|
|
|
|
|
|
|
435 |
|
|
|
|
|
436 |
# 2. We remove some special characters from the datasets
|
437 |
# that make training complicated and do not help in transcribing the speech
|
438 |
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
@@ -444,11 +451,14 @@ def main():
|
|
444 |
# kakasi.setMode("K", "H") #Convert from katakana to hiragana
|
445 |
conv = kakasi.getConverter()
|
446 |
|
|
|
447 |
chars_to_ignore_regex = (
|
448 |
-
f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else '[\,\?\!\-\;\:\"\“\%\‘\”\�\—\’\…\–\(\,\[\]\)\(
|
449 |
)
|
450 |
-
text_column_name = data_args.text_column_name
|
451 |
|
|
|
|
|
|
|
452 |
|
453 |
|
454 |
def remove_special_characters(batch):
|
@@ -580,7 +590,7 @@ def main():
|
|
580 |
max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
|
581 |
min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
|
582 |
audio_column_name = data_args.audio_column_name
|
583 |
-
|
584 |
|
585 |
# `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
|
586 |
phoneme_language = data_args.phoneme_language
|
|
|
358 |
else:
|
359 |
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
360 |
|
361 |
+
|
362 |
+
num_workers = data_args.preprocessing_num_workers
|
363 |
# Detecting last checkpoint.
|
364 |
last_checkpoint = None
|
365 |
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
|
|
|
434 |
|
435 |
if data_args.max_eval_samples is not None:
|
436 |
raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
|
437 |
+
|
438 |
+
# ADDITIONS
|
439 |
+
# Drop examples whose sentence contains Latin alphabet letters (digits are not matched)
|
440 |
|
441 |
+
raw_datasets = raw_datasets.filter(lambda example: not re.search('[a-zA-ZA-Za-z]',example['sentence']))
|
442 |
+
|
443 |
# 2. We remove some special characters from the datasets
|
444 |
# that make training complicated and do not help in transcribing the speech
|
445 |
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
|
|
451 |
# kakasi.setMode("K", "H") #Convert from katakana to hiragana
|
452 |
conv = kakasi.getConverter()
|
453 |
|
454 |
+
# Default to set of extra characters seen in CV 8.
|
455 |
chars_to_ignore_regex = (
|
456 |
+
f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else '[\,\?\!\-\;\:\"\“\%\‘\”\�\—\’\…\–\(\,\[\]\)\(\!\/\「\」\『\』]'
|
457 |
)
|
|
|
458 |
|
459 |
+
# ADDITIONS END
|
460 |
+
|
461 |
+
text_column_name = data_args.text_column_name
|
462 |
|
463 |
|
464 |
def remove_special_characters(batch):
|
|
|
590 |
max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
|
591 |
min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
|
592 |
audio_column_name = data_args.audio_column_name
|
593 |
+
|
594 |
|
595 |
# `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
|
596 |
phoneme_language = data_args.phoneme_language
|
.ipynb_checkpoints/speech_training_notebook-checkpoint.ipynb
CHANGED
@@ -1122,46 +1122,101 @@
|
|
1122 |
},
|
1123 |
{
|
1124 |
"cell_type": "code",
|
1125 |
-
"execution_count":
|
1126 |
-
"metadata": {
|
1127 |
-
|
1128 |
-
|
1129 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1130 |
}
|
1131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1132 |
"outputs": [
|
1133 |
{
|
1134 |
-
"
|
1135 |
-
"
|
1136 |
-
"
|
1137 |
-
|
1138 |
-
"\
|
1139 |
-
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
1140 |
-
"Input \u001b[0;32mIn [38]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m odd_example_texts \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m row \u001b[38;5;129;01min\u001b[39;00m common_voice_train:\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m letter \u001b[38;5;129;01min\u001b[39;00m odd_values:\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m letter \u001b[38;5;129;01min\u001b[39;00m row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msentence\u001b[39m\u001b[38;5;124m\"\u001b[39m]: \n",
|
1141 |
-
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:1664\u001b[0m, in \u001b[0;36mDataset._iter\u001b[0;34m(self, decoded)\u001b[0m\n\u001b[1;32m 1658\u001b[0m \u001b[38;5;124;03m\"\"\"Iterate through the examples.\u001b[39;00m\n\u001b[1;32m 1659\u001b[0m \n\u001b[1;32m 1660\u001b[0m \u001b[38;5;124;03mIf a formatting is set with :meth:`Dataset.set_format` rows will be returned with the\u001b[39;00m\n\u001b[1;32m 1661\u001b[0m \u001b[38;5;124;03mselected format.\u001b[39;00m\n\u001b[1;32m 1662\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1663\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_rows):\n\u001b[0;32m-> 1664\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_getitem\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1665\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1666\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecoded\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecoded\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1667\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
1142 |
-
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:1915\u001b[0m, in \u001b[0;36mDataset._getitem\u001b[0;34m(self, key, decoded, **kwargs)\u001b[0m\n\u001b[1;32m 1913\u001b[0m formatter \u001b[38;5;241m=\u001b[39m get_formatter(format_type, features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures, decoded\u001b[38;5;241m=\u001b[39mdecoded, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mformat_kwargs)\n\u001b[1;32m 1914\u001b[0m pa_subtable \u001b[38;5;241m=\u001b[39m query_table(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_data, key, indices\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_indices \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_indices \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m-> 1915\u001b[0m formatted_output \u001b[38;5;241m=\u001b[39m \u001b[43mformat_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1916\u001b[0m \u001b[43m \u001b[49m\u001b[43mpa_subtable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mformatter\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mformatter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mformat_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mformat_columns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_all_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_all_columns\u001b[49m\n\u001b[1;32m 1917\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1918\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m formatted_output\n",
|
1143 |
-
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:533\u001b[0m, in \u001b[0;36mformat_table\u001b[0;34m(table, key, formatter, format_columns, output_all_columns)\u001b[0m\n\u001b[1;32m 531\u001b[0m python_formatter \u001b[38;5;241m=\u001b[39m PythonFormatter(features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 532\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m format_columns \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 533\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mformatter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquery_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquery_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumn\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 535\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m format_columns:\n",
|
1144 |
-
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:282\u001b[0m, in \u001b[0;36mFormatter.__call__\u001b[0;34m(self, pa_table, query_type)\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, pa_table: pa\u001b[38;5;241m.\u001b[39mTable, query_type: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[RowFormat, ColumnFormat, BatchFormat]:\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrow\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 282\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mformat_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 283\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumn\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mformat_column(pa_table)\n",
|
1145 |
-
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:313\u001b[0m, in \u001b[0;36mPythonFormatter.format_row\u001b[0;34m(self, pa_table)\u001b[0m\n\u001b[1;32m 311\u001b[0m row \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpython_arrow_extractor()\u001b[38;5;241m.\u001b[39mextract_row(pa_table)\n\u001b[1;32m 312\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecoded:\n\u001b[0;32m--> 313\u001b[0m row \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpython_features_decoder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 314\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m row\n",
|
1146 |
-
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:222\u001b[0m, in \u001b[0;36mPythonFeaturesDecoder.decode_row\u001b[0;34m(self, row)\u001b[0m\n\u001b[1;32m 221\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_row\u001b[39m(\u001b[38;5;28mself\u001b[39m, row: \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mdict\u001b[39m:\n\u001b[0;32m--> 222\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures \u001b[38;5;28;01melse\u001b[39;00m row\n",
|
1147 |
-
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1318\u001b[0m, in \u001b[0;36mFeatures.decode_example\u001b[0;34m(self, example)\u001b[0m\n\u001b[1;32m 1308\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_example\u001b[39m(\u001b[38;5;28mself\u001b[39m, example: \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 1309\u001b[0m \u001b[38;5;124;03m\"\"\"Decode example with custom feature decoding.\u001b[39;00m\n\u001b[1;32m 1310\u001b[0m \n\u001b[1;32m 1311\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1315\u001b[0m \u001b[38;5;124;03m :obj:`dict[str, Any]`\u001b[39;00m\n\u001b[1;32m 1316\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1318\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[1;32m 1319\u001b[0m column_name: decode_nested_example(feature, value)\n\u001b[1;32m 1320\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_column_requires_decoding[column_name]\n\u001b[1;32m 1321\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m value\n\u001b[1;32m 1322\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column_name, (feature, value) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(\n\u001b[1;32m 1323\u001b[0m {key: value \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems() \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m example}, example\n\u001b[1;32m 1324\u001b[0m )\n\u001b[1;32m 1325\u001b[0m }\n",
|
1148 |
-
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1319\u001b[0m, in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1308\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_example\u001b[39m(\u001b[38;5;28mself\u001b[39m, example: \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 1309\u001b[0m \u001b[38;5;124;03m\"\"\"Decode example with custom feature decoding.\u001b[39;00m\n\u001b[1;32m 1310\u001b[0m \n\u001b[1;32m 1311\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1315\u001b[0m \u001b[38;5;124;03m :obj:`dict[str, Any]`\u001b[39;00m\n\u001b[1;32m 1316\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 1318\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[0;32m-> 1319\u001b[0m column_name: \u001b[43mdecode_nested_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1320\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_column_requires_decoding[column_name]\n\u001b[1;32m 1321\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m value\n\u001b[1;32m 1322\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column_name, (feature, value) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(\n\u001b[1;32m 1323\u001b[0m {key: value \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems() \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m example}, example\n\u001b[1;32m 1324\u001b[0m )\n\u001b[1;32m 1325\u001b[0m }\n",
|
1149 |
-
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1056\u001b[0m, in \u001b[0;36mdecode_nested_example\u001b[0;34m(schema, obj)\u001b[0m\n\u001b[1;32m 1054\u001b[0m \u001b[38;5;66;03m# Object with special decoding:\u001b[39;00m\n\u001b[1;32m 1055\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(schema, (Audio, Image)):\n\u001b[0;32m-> 1056\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mschema\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m obj \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1057\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\n",
|
1150 |
-
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/audio.py:97\u001b[0m, in \u001b[0;36mAudio.decode_example\u001b[0;34m(self, value)\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn audio sample should have one of \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpath\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m or \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbytes\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m but both are None in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m path \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m path\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmp3\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m---> 97\u001b[0m array, sampling_rate \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_decode_mp3\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file:\n",
|
1151 |
-
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/audio.py:183\u001b[0m, in \u001b[0;36mAudio._decode_mp3\u001b[0;34m(self, path_or_file)\u001b[0m\n\u001b[1;32m 181\u001b[0m array \u001b[38;5;241m=\u001b[39m array\u001b[38;5;241m.\u001b[39mnumpy()\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmono:\n\u001b[0;32m--> 183\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43marray\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmean\u001b[49m\u001b[43m(\u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array, sampling_rate\n",
|
1152 |
-
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/numpy/core/_methods.py:154\u001b[0m, in \u001b[0;36m_mean\u001b[0;34m(a, axis, dtype, out, keepdims)\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# Cast bool, unsigned int, and int to float64 by default\u001b[39;00m\n\u001b[1;32m 153\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dtype \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 154\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(arr\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, (\u001b[43mnt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minteger\u001b[49m, nt\u001b[38;5;241m.\u001b[39mbool_)):\n\u001b[1;32m 155\u001b[0m dtype \u001b[38;5;241m=\u001b[39m mu\u001b[38;5;241m.\u001b[39mdtype(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mf8\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 156\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(arr\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, nt\u001b[38;5;241m.\u001b[39mfloat16):\n",
|
1153 |
-
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
|
1154 |
]
|
1155 |
}
|
1156 |
],
|
1157 |
-
"source": [
|
|
|
|
|
|
|
1158 |
},
|
1159 |
{
|
1160 |
"cell_type": "code",
|
1161 |
-
"execution_count":
|
1162 |
"metadata": {},
|
1163 |
-
"outputs": [
|
1164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1165 |
},
|
1166 |
{
|
1167 |
"cell_type": "code",
|
|
|
1122 |
},
|
1123 |
{
|
1124 |
"cell_type": "code",
|
1125 |
+
"execution_count": 30,
|
1126 |
+
"metadata": {},
|
1127 |
+
"outputs": [
|
1128 |
+
{
|
1129 |
+
"data": {
|
1130 |
+
"application/vnd.jupyter.widget-view+json": {
|
1131 |
+
"model_id": "501e1eb7f6a545c496873545b992c2ad",
|
1132 |
+
"version_major": 2,
|
1133 |
+
"version_minor": 0
|
1134 |
+
},
|
1135 |
+
"text/plain": [
|
1136 |
+
" 0%| | 0/11 [00:00<?, ?ba/s]"
|
1137 |
+
]
|
1138 |
+
},
|
1139 |
+
"metadata": {},
|
1140 |
+
"output_type": "display_data"
|
1141 |
}
|
1142 |
+
],
|
1143 |
+
"source": [
|
1144 |
+
"alpha_rows = common_voice_train.filter(lambda example: re.search('[a-zA-Z]',example['sentence']))"
|
1145 |
+
]
|
1146 |
+
},
|
1147 |
+
{
|
1148 |
+
"cell_type": "code",
|
1149 |
+
"execution_count": 32,
|
1150 |
+
"metadata": {},
|
1151 |
"outputs": [
|
1152 |
{
|
1153 |
+
"name": "stdout",
|
1154 |
+
"output_type": "stream",
|
1155 |
+
"text": [
|
1156 |
+
"467\n",
|
1157 |
+
"10623\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1158 |
]
|
1159 |
}
|
1160 |
],
|
1161 |
+
"source": [
|
1162 |
+
"print(len(alpha_rows))\n",
|
1163 |
+
"print(len(common_voice_train))\n"
|
1164 |
+
]
|
1165 |
},
|
1166 |
{
|
1167 |
"cell_type": "code",
|
1168 |
+
"execution_count": 35,
|
1169 |
"metadata": {},
|
1170 |
+
"outputs": [
|
1171 |
+
{
|
1172 |
+
"name": "stdout",
|
1173 |
+
"output_type": "stream",
|
1174 |
+
"text": [
|
1175 |
+
"グループは、「Winters and Happy」でさくしゃとしてなまえがのることをシェアしています。\n",
|
1176 |
+
"Flowoodには、リモートコントロールレースようのおくないトラックがある。\n",
|
1177 |
+
"じもとのこうかんやHarrow Civic Centreのゆうじんにより、さらなるけいびがおこなわれました。\n",
|
1178 |
+
"かれはHeman Huntersでアコーディオンやドラムをえんそうします。\n",
|
1179 |
+
"これらはさまざまなWebベースのインターフェースをかいしてりようかのうになります。\n",
|
1180 |
+
"これでかそうのループbをかんがえることができます。\n",
|
1181 |
+
"のちにかれは、『Moth or Phoenix』というほんのなかで、これらのできごとについてかいた。\n",
|
1182 |
+
"ダリル・バンクスは、オハイオしゅうのColumbus College of Art and Designでまなびました。\n",
|
1183 |
+
"サンスクリットごでSaketというなまえは、てんごくにちかいばしょをいみします。\n",
|
1184 |
+
"Justinは、バンドのIntangibleのメンバーです。\n",
|
1185 |
+
"「U」を、「X」のとじたぶぶんくうかんであるとかていします。\n",
|
1186 |
+
"コンデはAcademy of Sciences and Letters のメンバーでもありました。\n",
|
1187 |
+
"Steersはホッチキススクールからもカルバーミリタリーアカデミーからもついほうされた。\n",
|
1188 |
+
"そののち、Bangladeshはあたまをうった。\n",
|
1189 |
+
"かれはけいざいがくしゃでありきょうじゅでもある、Cillian Ryanのちちおやだ。\n",
|
1190 |
+
"Webサイトのレビュー、コメント、およびひょうかをひょうじします。\n",
|
1191 |
+
"Arthurのけいれきはおおくのそしょうがしめしています。\n",
|
1192 |
+
"Aeroしゃによるたいりょうせいさんはありませんでした。\n",
|
1193 |
+
"これには、シングル「King of England」、「Somewhere」および「Clarinet Town」がしゅうろくされている。\n",
|
1194 |
+
"かれは、オーバーンだいがくモンゴメリーこうでWeilフェローだった。\n"
|
1195 |
+
]
|
1196 |
+
}
|
1197 |
+
],
|
1198 |
+
"source": [
|
1199 |
+
"for i in range(0,20):\n",
|
1200 |
+
" print(alpha_rows[i]['sentence'])"
|
1201 |
+
]
|
1202 |
+
},
|
1203 |
+
{
|
1204 |
+
"cell_type": "code",
|
1205 |
+
"execution_count": 28,
|
1206 |
+
"metadata": {},
|
1207 |
+
"outputs": [
|
1208 |
+
{
|
1209 |
+
"name": "stdout",
|
1210 |
+
"output_type": "stream",
|
1211 |
+
"text": [
|
1212 |
+
"<re.Match object; span=(1, 2), match='a'>\n"
|
1213 |
+
]
|
1214 |
+
}
|
1215 |
+
],
|
1216 |
+
"source": [
|
1217 |
+
"import regex\n",
|
1218 |
+
"print(re.search('[a-zA-Z]', \"9a2\"))"
|
1219 |
+
]
|
1220 |
},
|
1221 |
{
|
1222 |
"cell_type": "code",
|
added_tokens.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"<s>":
|
|
|
1 |
+
{"<s>": 179, "</s>": 180}
|
config.json
CHANGED
@@ -76,7 +76,7 @@
|
|
76 |
"num_hidden_layers": 24,
|
77 |
"num_negatives": 100,
|
78 |
"output_hidden_size": 1024,
|
79 |
-
"pad_token_id":
|
80 |
"proj_codevector_dim": 768,
|
81 |
"tdnn_dilation": [
|
82 |
1,
|
@@ -102,6 +102,6 @@
|
|
102 |
"torch_dtype": "float32",
|
103 |
"transformers_version": "4.17.0.dev0",
|
104 |
"use_weighted_layer_sum": false,
|
105 |
-
"vocab_size":
|
106 |
"xvector_output_dim": 512
|
107 |
}
|
|
|
76 |
"num_hidden_layers": 24,
|
77 |
"num_negatives": 100,
|
78 |
"output_hidden_size": 1024,
|
79 |
+
"pad_token_id": 178,
|
80 |
"proj_codevector_dim": 768,
|
81 |
"tdnn_dilation": [
|
82 |
1,
|
|
|
102 |
"torch_dtype": "float32",
|
103 |
"transformers_version": "4.17.0.dev0",
|
104 |
"use_weighted_layer_sum": false,
|
105 |
+
"vocab_size": 181,
|
106 |
"xvector_output_dim": 512
|
107 |
}
|
mozilla-foundation_common_voice_8_0_ja_test_eval_results.txt
CHANGED
@@ -1,2 +1,2 @@
|
|
1 |
-
WER: 0.
|
2 |
-
CER: 0.
|
|
|
1 |
+
WER: 0.9490658362989324
|
2 |
+
CER: 0.233251654006371
|
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cec559d37e4950e12a68238d91702538827b3e3a578f44c9eea97dc5f9450578
|
3 |
+
size 1262665777
|
run_speech_recognition_ctc_bnb.py
CHANGED
@@ -358,6 +358,8 @@ def main():
|
|
358 |
else:
|
359 |
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
360 |
|
|
|
|
|
361 |
# Detecting last checkpoint.
|
362 |
last_checkpoint = None
|
363 |
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
|
@@ -432,7 +434,12 @@ def main():
|
|
432 |
|
433 |
if data_args.max_eval_samples is not None:
|
434 |
raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
|
|
|
|
|
|
|
435 |
|
|
|
|
|
436 |
# 2. We remove some special characters from the datasets
|
437 |
# that make training complicated and do not help in transcribing the speech
|
438 |
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
@@ -444,11 +451,14 @@ def main():
|
|
444 |
# kakasi.setMode("K", "H") #Convert from katakana to hiragana
|
445 |
conv = kakasi.getConverter()
|
446 |
|
|
|
447 |
chars_to_ignore_regex = (
|
448 |
-
f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else '[\,\?\!\-\;\:\"\“\%\‘\”\�\—\’\…\–\(\,\[\]\)\(
|
449 |
)
|
450 |
-
text_column_name = data_args.text_column_name
|
451 |
|
|
|
|
|
|
|
452 |
|
453 |
|
454 |
def remove_special_characters(batch):
|
@@ -580,7 +590,7 @@ def main():
|
|
580 |
max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
|
581 |
min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
|
582 |
audio_column_name = data_args.audio_column_name
|
583 |
-
|
584 |
|
585 |
# `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
|
586 |
phoneme_language = data_args.phoneme_language
|
|
|
358 |
else:
|
359 |
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
360 |
|
361 |
+
|
362 |
+
num_workers = data_args.preprocessing_num_workers
|
363 |
# Detecting last checkpoint.
|
364 |
last_checkpoint = None
|
365 |
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
|
|
|
434 |
|
435 |
if data_args.max_eval_samples is not None:
|
436 |
raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
|
437 |
+
|
438 |
+
# ADDITIONS
|
439 |
+
# Drop examples whose sentence contains Latin alphabet letters (digits are not matched)
|
440 |
|
441 |
+
raw_datasets = raw_datasets.filter(lambda example: not re.search('[a-zA-ZA-Za-z]',example['sentence']))
|
442 |
+
|
443 |
# 2. We remove some special characters from the datasets
|
444 |
# that make training complicated and do not help in transcribing the speech
|
445 |
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
|
|
451 |
# kakasi.setMode("K", "H") #Convert from katakana to hiragana
|
452 |
conv = kakasi.getConverter()
|
453 |
|
454 |
+
# Default to set of extra characters seen in CV 8.
|
455 |
chars_to_ignore_regex = (
|
456 |
+
f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else '[\,\?\!\-\;\:\"\“\%\‘\”\�\—\’\…\–\(\,\[\]\)\(\!\/\「\」\『\』]'
|
457 |
)
|
|
|
458 |
|
459 |
+
# ADDITIONS END
|
460 |
+
|
461 |
+
text_column_name = data_args.text_column_name
|
462 |
|
463 |
|
464 |
def remove_special_characters(batch):
|
|
|
590 |
max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
|
591 |
min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
|
592 |
audio_column_name = data_args.audio_column_name
|
593 |
+
|
594 |
|
595 |
# `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
|
596 |
phoneme_language = data_args.phoneme_language
|
special_tokens_map.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
|
|
|
1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
|
speech_training_notebook.ipynb
CHANGED
@@ -577,20 +577,59 @@
|
|
577 |
},
|
578 |
{
|
579 |
"cell_type": "code",
|
580 |
-
"execution_count":
|
581 |
"metadata": {},
|
582 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
583 |
"source": []
|
584 |
},
|
585 |
{
|
586 |
"cell_type": "code",
|
587 |
-
"execution_count":
|
588 |
"metadata": {},
|
589 |
"outputs": [
|
590 |
{
|
591 |
"data": {
|
592 |
"application/vnd.jupyter.widget-view+json": {
|
593 |
-
"model_id": "
|
594 |
"version_major": 2,
|
595 |
"version_minor": 0
|
596 |
},
|
@@ -604,7 +643,7 @@
|
|
604 |
{
|
605 |
"data": {
|
606 |
"application/vnd.jupyter.widget-view+json": {
|
607 |
-
"model_id": "
|
608 |
"version_major": 2,
|
609 |
"version_minor": 0
|
610 |
},
|
@@ -617,13 +656,13 @@
|
|
617 |
}
|
618 |
],
|
619 |
"source": [
|
620 |
-
"vocab_train =
|
621 |
-
"vocab_test =
|
622 |
]
|
623 |
},
|
624 |
{
|
625 |
"cell_type": "code",
|
626 |
-
"execution_count":
|
627 |
"metadata": {},
|
628 |
"outputs": [],
|
629 |
"source": [
|
@@ -898,15 +937,15 @@
|
|
898 |
},
|
899 |
{
|
900 |
"cell_type": "code",
|
901 |
-
"execution_count":
|
902 |
"metadata": {},
|
903 |
"outputs": [
|
904 |
{
|
905 |
"name": "stdout",
|
906 |
"output_type": "stream",
|
907 |
"text": [
|
908 |
-
"
|
909 |
-
"['ダ', 'た', '
|
910 |
]
|
911 |
}
|
912 |
],
|
@@ -1122,46 +1161,163 @@
|
|
1122 |
},
|
1123 |
{
|
1124 |
"cell_type": "code",
|
1125 |
-
"execution_count":
|
1126 |
-
"metadata": {
|
1127 |
-
|
1128 |
-
|
1129 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1130 |
}
|
1131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1132 |
"outputs": [
|
1133 |
{
|
1134 |
-
"
|
1135 |
-
|
1136 |
-
|
1137 |
-
|
1138 |
-
|
1139 |
-
|
1140 |
-
"
|
1141 |
-
|
1142 |
-
|
1143 |
-
|
1144 |
-
|
1145 |
-
|
1146 |
-
|
1147 |
-
|
1148 |
-
|
1149 |
-
|
1150 |
-
|
1151 |
-
|
1152 |
-
|
1153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1154 |
]
|
1155 |
}
|
1156 |
],
|
1157 |
-
"source": [
|
|
|
|
|
|
|
|
|
1158 |
},
|
1159 |
{
|
1160 |
"cell_type": "code",
|
1161 |
-
"execution_count":
|
1162 |
"metadata": {},
|
1163 |
-
"outputs": [
|
1164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1165 |
},
|
1166 |
{
|
1167 |
"cell_type": "code",
|
|
|
577 |
},
|
578 |
{
|
579 |
"cell_type": "code",
|
580 |
+
"execution_count": 36,
|
581 |
"metadata": {},
|
582 |
+
"outputs": [
|
583 |
+
{
|
584 |
+
"data": {
|
585 |
+
"application/vnd.jupyter.widget-view+json": {
|
586 |
+
"model_id": "c433125fde60482ab48e7db72a0759a0",
|
587 |
+
"version_major": 2,
|
588 |
+
"version_minor": 0
|
589 |
+
},
|
590 |
+
"text/plain": [
|
591 |
+
" 0%| | 0/11 [00:00<?, ?ba/s]"
|
592 |
+
]
|
593 |
+
},
|
594 |
+
"metadata": {},
|
595 |
+
"output_type": "display_data"
|
596 |
+
}
|
597 |
+
],
|
598 |
+
"source": [
|
599 |
+
"common_voice_train_no_alpha = common_voice_train.filter(lambda example: not re.search('[a-zA-Z]',example['sentence']))\n"
|
600 |
+
]
|
601 |
+
},
|
602 |
+
{
|
603 |
+
"cell_type": "code",
|
604 |
+
"execution_count": 38,
|
605 |
+
"metadata": {},
|
606 |
+
"outputs": [
|
607 |
+
{
|
608 |
+
"data": {
|
609 |
+
"application/vnd.jupyter.widget-view+json": {
|
610 |
+
"model_id": "7eb50868575b4ebb8143c46761a96550",
|
611 |
+
"version_major": 2,
|
612 |
+
"version_minor": 0
|
613 |
+
},
|
614 |
+
"text/plain": [
|
615 |
+
" 0%| | 0/5 [00:00<?, ?ba/s]"
|
616 |
+
]
|
617 |
+
},
|
618 |
+
"metadata": {},
|
619 |
+
"output_type": "display_data"
|
620 |
+
}
|
621 |
+
],
|
622 |
"source": []
|
623 |
},
|
624 |
{
|
625 |
"cell_type": "code",
|
626 |
+
"execution_count": 39,
|
627 |
"metadata": {},
|
628 |
"outputs": [
|
629 |
{
|
630 |
"data": {
|
631 |
"application/vnd.jupyter.widget-view+json": {
|
632 |
+
"model_id": "208cd0b1845341ff91372fb784096860",
|
633 |
"version_major": 2,
|
634 |
"version_minor": 0
|
635 |
},
|
|
|
643 |
{
|
644 |
"data": {
|
645 |
"application/vnd.jupyter.widget-view+json": {
|
646 |
+
"model_id": "6405ced5205448bd8d3db8c188698403",
|
647 |
"version_major": 2,
|
648 |
"version_minor": 0
|
649 |
},
|
|
|
656 |
}
|
657 |
],
|
658 |
"source": [
|
659 |
+
"vocab_train = common_voice_train_no_alpha.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)\n",
|
660 |
+
"vocab_test = common_voice_test_no_alpha.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)"
|
661 |
]
|
662 |
},
|
663 |
{
|
664 |
"cell_type": "code",
|
665 |
+
"execution_count": 40,
|
666 |
"metadata": {},
|
667 |
"outputs": [],
|
668 |
"source": [
|
|
|
937 |
},
|
938 |
{
|
939 |
"cell_type": "code",
|
940 |
+
"execution_count": 41,
|
941 |
"metadata": {},
|
942 |
"outputs": [
|
943 |
{
|
944 |
"name": "stdout",
|
945 |
"output_type": "stream",
|
946 |
"text": [
|
947 |
+
"194\n",
|
948 |
+
"['ダ', 'た', 'か', 'よ', 'や', 'を', 'F', 'h', 'ち', 'リ', 'ゲ', 'フ', 'め', 'タ', 'せ', '」', 'ば', 'ア', 'ャ', 'イ', 'ぶ', 'は', 'と', 'ノ', 'ェ', 'く', '?', '〜', 'つ', 'こ', 'S', 'ぼ', 'ゼ', 'U', 'き', 'ゥ', 'が', 'も', 'エ', 'ク', 'づ', 'グ', 'ブ', 'ゅ', 'ィ', 'ロ', 'ー', '/', 'の', 'ケ', '・', 'お', 'む', 'P', 'ベ', 'プ', '『', 'ソ', '.', 'ヴ', 'ド', 'み', 'ガ', 'ょ', 'カ', 'ぜ', '.', 'ご', 'ど', 'ハ', 'ね', 'j', ' ', 'マ', '―', '-', 'デ', 'ゾ', 'ポ', 'ペ', 'ぱ', 'ふ', 'べ', 'ヒ', 'サ', 'N', 'ュ', 'り', 'ひ', 'げ', 'ゆ', 'ず', 'ゴ', 'ョ', 'ツ', '〇', 'え', '』', 'ッ', 'ん', 'ン', 'う', 'ぽ', ':', '々', 'ぞ', 'ヨ', 'ゃ', 'だ', 'ピ', 'ボ', 'ウ', 'あ', 'ヶ', 'ぬ', 'て', 'す', 'び', 'へ', '繫', 'バ', 'ぎ', 'ざ', 'A', 'チ', 'け', 'ぇ', 'わ', 'ス', 'ズ', 'し', '、', '!', 'G', '・', 'ぁ', 'ナ', 'ヅ', 'ほ', ')', 'ネ', 'パ', 'ム', 'ミ', '=', 'O', 'い', 'ろ', 'ザ', 'ヌ', 'に', 'ら', 'ヘ', '。', 'ギ', 'モ', 'D', 'キ', \"'\", 'で', 'ぴ', 'ぷ', 'ビ', 'ヤ', 'ユ', 'シ', 'る', 'そ', 'テ', 'れ', 'じ', 'ワ', 'レ', 'ォ', 'ジ', 'な', 'ニ', '&', 'っ', '「', 'ぢ', 'ル', 'さ', 'ぺ', 'ト', 'ホ', 'コ', 'オ', 'セ', 'ま', 'メ', 'ァ', 'ぐ', 'ラ']\n"
|
949 |
]
|
950 |
}
|
951 |
],
|
|
|
1161 |
},
|
1162 |
{
|
1163 |
"cell_type": "code",
|
1164 |
+
"execution_count": 30,
|
1165 |
+
"metadata": {},
|
1166 |
+
"outputs": [
|
1167 |
+
{
|
1168 |
+
"data": {
|
1169 |
+
"application/vnd.jupyter.widget-view+json": {
|
1170 |
+
"model_id": "501e1eb7f6a545c496873545b992c2ad",
|
1171 |
+
"version_major": 2,
|
1172 |
+
"version_minor": 0
|
1173 |
+
},
|
1174 |
+
"text/plain": [
|
1175 |
+
" 0%| | 0/11 [00:00<?, ?ba/s]"
|
1176 |
+
]
|
1177 |
+
},
|
1178 |
+
"metadata": {},
|
1179 |
+
"output_type": "display_data"
|
1180 |
}
|
1181 |
+
],
|
1182 |
+
"source": [
|
1183 |
+
"alpha_rows = common_voice_train.filter(lambda example: re.search('[a-zA-Z]',example['sentence']))\n"
|
1184 |
+
]
|
1185 |
+
},
|
1186 |
+
{
|
1187 |
+
"cell_type": "code",
|
1188 |
+
"execution_count": 42,
|
1189 |
+
"metadata": {},
|
1190 |
"outputs": [
|
1191 |
{
|
1192 |
+
"data": {
|
1193 |
+
"application/vnd.jupyter.widget-view+json": {
|
1194 |
+
"model_id": "75d9652cda2c4d99adca0e0e455dd005",
|
1195 |
+
"version_major": 2,
|
1196 |
+
"version_minor": 0
|
1197 |
+
},
|
1198 |
+
"text/plain": [
|
1199 |
+
" 0%| | 0/11 [00:00<?, ?ba/s]"
|
1200 |
+
]
|
1201 |
+
},
|
1202 |
+
"metadata": {},
|
1203 |
+
"output_type": "display_data"
|
1204 |
+
}
|
1205 |
+
],
|
1206 |
+
"source": [
|
1207 |
+
"odd_alpha_rows = common_voice_train.filter(lambda example: re.search('[A-Uhj]',example['sentence']))\n",
|
1208 |
+
"\n"
|
1209 |
+
]
|
1210 |
+
},
|
1211 |
+
{
|
1212 |
+
"cell_type": "code",
|
1213 |
+
"execution_count": 43,
|
1214 |
+
"metadata": {},
|
1215 |
+
"outputs": [
|
1216 |
+
{
|
1217 |
+
"name": "stdout",
|
1218 |
+
"output_type": "stream",
|
1219 |
+
"text": [
|
1220 |
+
"467\n",
|
1221 |
+
"10623\n",
|
1222 |
+
"4\n"
|
1223 |
]
|
1224 |
}
|
1225 |
],
|
1226 |
+
"source": [
|
1227 |
+
"print(len(alpha_rows))\n",
|
1228 |
+
"print(len(common_voice_train))\n",
|
1229 |
+
"print(len(odd_alpha_rows))\n"
|
1230 |
+
]
|
1231 |
},
|
1232 |
{
|
1233 |
"cell_type": "code",
|
1234 |
+
"execution_count": 51,
|
1235 |
"metadata": {},
|
1236 |
+
"outputs": [
|
1237 |
+
{
|
1238 |
+
"data": {
|
1239 |
+
"application/vnd.jupyter.widget-view+json": {
|
1240 |
+
"model_id": "50779e11b97f42d0aec1c17121b8087a",
|
1241 |
+
"version_major": 2,
|
1242 |
+
"version_minor": 0
|
1243 |
+
},
|
1244 |
+
"text/plain": [
|
1245 |
+
" 0%| | 0/11 [00:00<?, ?ba/s]"
|
1246 |
+
]
|
1247 |
+
},
|
1248 |
+
"metadata": {},
|
1249 |
+
"output_type": "display_data"
|
1250 |
+
}
|
1251 |
+
],
|
1252 |
+
"source": [
|
1253 |
+
"# Remove alphanumeric characters\n",
|
1254 |
+
"def has_no_alpha_numeric_characters(sentence):\n",
|
1255 |
+
" return re.search('[a-zA-Z]]',sentence)\n",
|
1256 |
+
"# common_voice_train\n",
|
1257 |
+
"common_voice_train__filter = common_voice_train.filter(\n",
|
1258 |
+
" has_no_alpha_numeric_characters,\n",
|
1259 |
+
"# num_proc=num_workers,\n",
|
1260 |
+
"common_voice_train.filter(lambda example: re.search('[A-Uhj]',example['sentence']))\n",
|
1261 |
+
")"
|
1262 |
+
]
|
1263 |
+
},
|
1264 |
+
{
|
1265 |
+
"cell_type": "code",
|
1266 |
+
"execution_count": 52,
|
1267 |
+
"metadata": {},
|
1268 |
+
"outputs": [
|
1269 |
+
{
|
1270 |
+
"name": "stdout",
|
1271 |
+
"output_type": "stream",
|
1272 |
+
"text": [
|
1273 |
+
"0\n",
|
1274 |
+
"10623\n"
|
1275 |
+
]
|
1276 |
+
}
|
1277 |
+
],
|
1278 |
+
"source": [
|
1279 |
+
"print(len(common_voice_train__filter))\n",
|
1280 |
+
"print(len(common_voice_train))"
|
1281 |
+
]
|
1282 |
+
},
|
1283 |
+
{
|
1284 |
+
"cell_type": "code",
|
1285 |
+
"execution_count": 44,
|
1286 |
+
"metadata": {},
|
1287 |
+
"outputs": [
|
1288 |
+
{
|
1289 |
+
"name": "stdout",
|
1290 |
+
"output_type": "stream",
|
1291 |
+
"text": [
|
1292 |
+
"アーケードにはhジェネシーコミュニティカレッジのぶんこうがある。\n",
|
1293 |
+
"かのじょjはひだりききのピッチングをしていたものの、ゴルフはみぎききをしゅうとくした。\n",
|
1294 |
+
"ジャック・マッカートニーというせいともスーパーAレベルしけんをうけました。\n",
|
1295 |
+
"でも、ところどころ、UFOらしききょだいなえんばんのおもかげはのこっている。\n"
|
1296 |
+
]
|
1297 |
+
}
|
1298 |
+
],
|
1299 |
+
"source": [
|
1300 |
+
"for i in range(0,4):\n",
|
1301 |
+
" print(odd_alpha_rows[i]['sentence'])"
|
1302 |
+
]
|
1303 |
+
},
|
1304 |
+
{
|
1305 |
+
"cell_type": "code",
|
1306 |
+
"execution_count": 28,
|
1307 |
+
"metadata": {},
|
1308 |
+
"outputs": [
|
1309 |
+
{
|
1310 |
+
"name": "stdout",
|
1311 |
+
"output_type": "stream",
|
1312 |
+
"text": [
|
1313 |
+
"<re.Match object; span=(1, 2), match='a'>\n"
|
1314 |
+
]
|
1315 |
+
}
|
1316 |
+
],
|
1317 |
+
"source": [
|
1318 |
+
"import regex\n",
|
1319 |
+
"print(re.search('[a-zA-Z]', \"9a2\"))"
|
1320 |
+
]
|
1321 |
},
|
1322 |
{
|
1323 |
"cell_type": "code",
|
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 2991
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3a128c5e47bae3a7da28bb745038c537c781db028b85a0a4f86e721372d51cc3
|
3 |
size 2991
|
vocab.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"
|
|
|
1 |
+
{"'": 1, ".": 2, "―": 3, "、": 4, "。": 5, "々": 6, "〇": 7, "〜": 8, "ぁ": 9, "あ": 10, "い": 11, "う": 12, "ぇ": 13, "え": 14, "お": 15, "か": 16, "が": 17, "き": 18, "ぎ": 19, "く": 20, "ぐ": 21, "け": 22, "げ": 23, "こ": 24, "ご": 25, "さ": 26, "ざ": 27, "し": 28, "じ": 29, "す": 30, "ず": 31, "せ": 32, "ぜ": 33, "そ": 34, "ぞ": 35, "た": 36, "だ": 37, "ち": 38, "ぢ": 39, "っ": 40, "つ": 41, "づ": 42, "て": 43, "で": 44, "と": 45, "ど": 46, "な": 47, "に": 48, "ぬ": 49, "ね": 50, "の": 51, "は": 52, "ば": 53, "ぱ": 54, "ひ": 55, "び": 56, "ぴ": 57, "ふ": 58, "ぶ": 59, "ぷ": 60, "へ": 61, "べ": 62, "ぺ": 63, "ほ": 64, "ぼ": 65, "ぽ": 66, "ま": 67, "み": 68, "む": 69, "め": 70, "も": 71, "ゃ": 72, "や": 73, "ゅ": 74, "ゆ": 75, "ょ": 76, "よ": 77, "ら": 78, "り": 79, "る": 80, "れ": 81, "ろ": 82, "わ": 83, "を": 84, "ん": 85, "ァ": 86, "ア": 87, "ィ": 88, "イ": 89, "ゥ": 90, "ウ": 91, "ェ": 92, "エ": 93, "ォ": 94, "オ": 95, "カ": 96, "ガ": 97, "キ": 98, "ギ": 99, "ク": 100, "グ": 101, "ケ": 102, "ゲ": 103, "コ": 104, "ゴ": 105, "サ": 106, "ザ": 107, "シ": 108, "ジ": 109, "ス": 110, "ズ": 111, "セ": 112, "ゼ": 113, "ソ": 114, "ゾ": 115, "タ": 116, "ダ": 117, "チ": 118, "ッ": 119, "ツ": 120, "ヅ": 121, "テ": 122, "デ": 123, "ト": 124, "ド": 125, "ナ": 126, "ニ": 127, "ヌ": 128, "ネ": 129, "ノ": 130, "ハ": 131, "バ": 132, "パ": 133, "ヒ": 134, "ビ": 135, "ピ": 136, "フ": 137, "ブ": 138, "プ": 139, "ヘ": 140, "ベ": 141, "ペ": 142, "ホ": 143, "ボ": 144, "ポ": 145, "マ": 146, "ミ": 147, "ム": 148, "メ": 149, "モ": 150, "ャ": 151, "ヤ": 152, "ュ": 153, "ユ": 154, "ョ": 155, "ヨ": 156, "ラ": 157, "リ": 158, "ル": 159, "レ": 160, "ロ": 161, "ワ": 162, "ン": 163, "ヴ": 164, "ヶ": 165, "・": 166, "ー": 167, "繫": 168, "&": 169, ")": 170, "-": 171, ".": 172, ":": 173, "=": 174, "?": 175, "・": 176, "|": 0, "[UNK]": 177, "[PAD]": 178}
|