AndrewMcDowell committed on
Commit
db4ff8d
·
1 Parent(s): b4be586

Training in progress, step 1000

Browse files
.ipynb_checkpoints/mozilla-foundation_common_voice_8_0_ja_test_eval_results-checkpoint.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ WER: 0.9490658362989324
2
+ CER: 0.233251654006371
.ipynb_checkpoints/run_speech_recognition_ctc_bnb-checkpoint.py CHANGED
@@ -358,6 +358,8 @@ def main():
358
  else:
359
  model_args, data_args, training_args = parser.parse_args_into_dataclasses()
360
 
 
 
361
  # Detecting last checkpoint.
362
  last_checkpoint = None
363
  if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
@@ -432,7 +434,12 @@ def main():
432
 
433
  if data_args.max_eval_samples is not None:
434
  raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
 
 
 
435
 
 
 
436
  # 2. We remove some special characters from the datasets
437
  # that make training complicated and do not help in transcribing the speech
438
  # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
@@ -444,11 +451,14 @@ def main():
444
  # kakasi.setMode("K", "H") #Convert from katakana to hiragana
445
  conv = kakasi.getConverter()
446
 
 
447
  chars_to_ignore_regex = (
448
- f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else '[\,\?\!\-\;\:\"\“\%\‘\”\�\—\’\…\–\(\,\[\]\)\(\!]'
449
  )
450
- text_column_name = data_args.text_column_name
451
 
 
 
 
452
 
453
 
454
  def remove_special_characters(batch):
@@ -580,7 +590,7 @@ def main():
580
  max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
581
  min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
582
  audio_column_name = data_args.audio_column_name
583
- num_workers = data_args.preprocessing_num_workers
584
 
585
  # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
586
  phoneme_language = data_args.phoneme_language
 
358
  else:
359
  model_args, data_args, training_args = parser.parse_args_into_dataclasses()
360
 
361
+
362
+ num_workers = data_args.preprocessing_num_workers
363
  # Detecting last checkpoint.
364
  last_checkpoint = None
365
  if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
 
434
 
435
  if data_args.max_eval_samples is not None:
436
  raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
437
+
438
+ # ADDITIONS
439
+ # Drop examples whose sentence contains Latin letters (ASCII or full-width); digits are not matched
440
 
441
+ raw_datasets = raw_datasets.filter(lambda example: not re.search('[a-zA-ZA-Za-z]',example['sentence']))
442
+
443
  # 2. We remove some special characters from the datasets
444
  # that make training complicated and do not help in transcribing the speech
445
  # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
 
451
  # kakasi.setMode("K", "H") #Convert from katakana to hiragana
452
  conv = kakasi.getConverter()
453
 
454
+ # Default to set of extra characters seen in CV 8.
455
  chars_to_ignore_regex = (
456
+ f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else '[\,\?\!\-\;\:\"\“\%\‘\”\�\—\’\…\–\(\,\[\]\)\(\!\/\「\」\『\』]'
457
  )
 
458
 
459
+ # ADDITIONS END
460
+
461
+ text_column_name = data_args.text_column_name
462
 
463
 
464
  def remove_special_characters(batch):
 
590
  max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
591
  min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
592
  audio_column_name = data_args.audio_column_name
593
+
594
 
595
  # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
596
  phoneme_language = data_args.phoneme_language
.ipynb_checkpoints/speech_training_notebook-checkpoint.ipynb CHANGED
@@ -1122,46 +1122,101 @@
1122
  },
1123
  {
1124
  "cell_type": "code",
1125
- "execution_count": 38,
1126
- "metadata": {
1127
- "collapsed": true,
1128
- "jupyter": {
1129
- "outputs_hidden": true
 
 
 
 
 
 
 
 
 
 
 
1130
  }
1131
- },
 
 
 
 
 
 
 
 
1132
  "outputs": [
1133
  {
1134
- "ename": "KeyboardInterrupt",
1135
- "evalue": "",
1136
- "output_type": "error",
1137
- "traceback": [
1138
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1139
- "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
1140
- "Input \u001b[0;32mIn [38]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m odd_example_texts \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m row \u001b[38;5;129;01min\u001b[39;00m common_voice_train:\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m letter \u001b[38;5;129;01min\u001b[39;00m odd_values:\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m letter \u001b[38;5;129;01min\u001b[39;00m row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msentence\u001b[39m\u001b[38;5;124m\"\u001b[39m]: \n",
1141
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:1664\u001b[0m, in \u001b[0;36mDataset._iter\u001b[0;34m(self, decoded)\u001b[0m\n\u001b[1;32m 1658\u001b[0m \u001b[38;5;124;03m\"\"\"Iterate through the examples.\u001b[39;00m\n\u001b[1;32m 1659\u001b[0m \n\u001b[1;32m 1660\u001b[0m \u001b[38;5;124;03mIf a formatting is set with :meth:`Dataset.set_format` rows will be returned with the\u001b[39;00m\n\u001b[1;32m 1661\u001b[0m \u001b[38;5;124;03mselected format.\u001b[39;00m\n\u001b[1;32m 1662\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1663\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_rows):\n\u001b[0;32m-> 1664\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_getitem\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1665\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1666\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecoded\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecoded\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1667\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
1142
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:1915\u001b[0m, in \u001b[0;36mDataset._getitem\u001b[0;34m(self, key, decoded, **kwargs)\u001b[0m\n\u001b[1;32m 1913\u001b[0m formatter \u001b[38;5;241m=\u001b[39m get_formatter(format_type, features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures, decoded\u001b[38;5;241m=\u001b[39mdecoded, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mformat_kwargs)\n\u001b[1;32m 1914\u001b[0m pa_subtable \u001b[38;5;241m=\u001b[39m query_table(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_data, key, indices\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_indices \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_indices \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m-> 1915\u001b[0m formatted_output \u001b[38;5;241m=\u001b[39m \u001b[43mformat_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1916\u001b[0m \u001b[43m \u001b[49m\u001b[43mpa_subtable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mformatter\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mformatter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mformat_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mformat_columns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_all_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_all_columns\u001b[49m\n\u001b[1;32m 1917\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1918\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m formatted_output\n",
1143
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:533\u001b[0m, in \u001b[0;36mformat_table\u001b[0;34m(table, key, formatter, format_columns, output_all_columns)\u001b[0m\n\u001b[1;32m 531\u001b[0m python_formatter \u001b[38;5;241m=\u001b[39m PythonFormatter(features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 532\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m format_columns \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 533\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mformatter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquery_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquery_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumn\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 535\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m format_columns:\n",
1144
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:282\u001b[0m, in \u001b[0;36mFormatter.__call__\u001b[0;34m(self, pa_table, query_type)\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, pa_table: pa\u001b[38;5;241m.\u001b[39mTable, query_type: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[RowFormat, ColumnFormat, BatchFormat]:\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrow\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 282\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mformat_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 283\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumn\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mformat_column(pa_table)\n",
1145
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:313\u001b[0m, in \u001b[0;36mPythonFormatter.format_row\u001b[0;34m(self, pa_table)\u001b[0m\n\u001b[1;32m 311\u001b[0m row \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpython_arrow_extractor()\u001b[38;5;241m.\u001b[39mextract_row(pa_table)\n\u001b[1;32m 312\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecoded:\n\u001b[0;32m--> 313\u001b[0m row \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpython_features_decoder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 314\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m row\n",
1146
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:222\u001b[0m, in \u001b[0;36mPythonFeaturesDecoder.decode_row\u001b[0;34m(self, row)\u001b[0m\n\u001b[1;32m 221\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_row\u001b[39m(\u001b[38;5;28mself\u001b[39m, row: \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mdict\u001b[39m:\n\u001b[0;32m--> 222\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures \u001b[38;5;28;01melse\u001b[39;00m row\n",
1147
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1318\u001b[0m, in \u001b[0;36mFeatures.decode_example\u001b[0;34m(self, example)\u001b[0m\n\u001b[1;32m 1308\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_example\u001b[39m(\u001b[38;5;28mself\u001b[39m, example: \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 1309\u001b[0m \u001b[38;5;124;03m\"\"\"Decode example with custom feature decoding.\u001b[39;00m\n\u001b[1;32m 1310\u001b[0m \n\u001b[1;32m 1311\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1315\u001b[0m \u001b[38;5;124;03m :obj:`dict[str, Any]`\u001b[39;00m\n\u001b[1;32m 1316\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1318\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[1;32m 1319\u001b[0m column_name: decode_nested_example(feature, value)\n\u001b[1;32m 1320\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_column_requires_decoding[column_name]\n\u001b[1;32m 1321\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m value\n\u001b[1;32m 1322\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column_name, (feature, value) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(\n\u001b[1;32m 1323\u001b[0m {key: value \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems() \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m example}, example\n\u001b[1;32m 1324\u001b[0m )\n\u001b[1;32m 1325\u001b[0m }\n",
1148
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1319\u001b[0m, in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1308\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_example\u001b[39m(\u001b[38;5;28mself\u001b[39m, example: \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 1309\u001b[0m \u001b[38;5;124;03m\"\"\"Decode example with custom feature decoding.\u001b[39;00m\n\u001b[1;32m 1310\u001b[0m \n\u001b[1;32m 1311\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1315\u001b[0m \u001b[38;5;124;03m :obj:`dict[str, Any]`\u001b[39;00m\n\u001b[1;32m 1316\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 1318\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[0;32m-> 1319\u001b[0m column_name: \u001b[43mdecode_nested_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1320\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_column_requires_decoding[column_name]\n\u001b[1;32m 1321\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m value\n\u001b[1;32m 1322\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column_name, (feature, value) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(\n\u001b[1;32m 1323\u001b[0m {key: value \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems() \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m example}, example\n\u001b[1;32m 1324\u001b[0m )\n\u001b[1;32m 1325\u001b[0m }\n",
1149
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1056\u001b[0m, in \u001b[0;36mdecode_nested_example\u001b[0;34m(schema, obj)\u001b[0m\n\u001b[1;32m 1054\u001b[0m \u001b[38;5;66;03m# Object with special decoding:\u001b[39;00m\n\u001b[1;32m 1055\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(schema, (Audio, Image)):\n\u001b[0;32m-> 1056\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mschema\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m obj \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1057\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\n",
1150
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/audio.py:97\u001b[0m, in \u001b[0;36mAudio.decode_example\u001b[0;34m(self, value)\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn audio sample should have one of \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpath\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m or \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbytes\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m but both are None in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m path \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m path\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmp3\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m---> 97\u001b[0m array, sampling_rate \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_decode_mp3\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file:\n",
1151
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/audio.py:183\u001b[0m, in \u001b[0;36mAudio._decode_mp3\u001b[0;34m(self, path_or_file)\u001b[0m\n\u001b[1;32m 181\u001b[0m array \u001b[38;5;241m=\u001b[39m array\u001b[38;5;241m.\u001b[39mnumpy()\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmono:\n\u001b[0;32m--> 183\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43marray\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmean\u001b[49m\u001b[43m(\u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array, sampling_rate\n",
1152
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/numpy/core/_methods.py:154\u001b[0m, in \u001b[0;36m_mean\u001b[0;34m(a, axis, dtype, out, keepdims)\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# Cast bool, unsigned int, and int to float64 by default\u001b[39;00m\n\u001b[1;32m 153\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dtype \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 154\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(arr\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, (\u001b[43mnt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minteger\u001b[49m, nt\u001b[38;5;241m.\u001b[39mbool_)):\n\u001b[1;32m 155\u001b[0m dtype \u001b[38;5;241m=\u001b[39m mu\u001b[38;5;241m.\u001b[39mdtype(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mf8\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 156\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(arr\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, nt\u001b[38;5;241m.\u001b[39mfloat16):\n",
1153
- "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
1154
  ]
1155
  }
1156
  ],
1157
- "source": []
 
 
 
1158
  },
1159
  {
1160
  "cell_type": "code",
1161
- "execution_count": null,
1162
  "metadata": {},
1163
- "outputs": [],
1164
- "source": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1165
  },
1166
  {
1167
  "cell_type": "code",
 
1122
  },
1123
  {
1124
  "cell_type": "code",
1125
+ "execution_count": 30,
1126
+ "metadata": {},
1127
+ "outputs": [
1128
+ {
1129
+ "data": {
1130
+ "application/vnd.jupyter.widget-view+json": {
1131
+ "model_id": "501e1eb7f6a545c496873545b992c2ad",
1132
+ "version_major": 2,
1133
+ "version_minor": 0
1134
+ },
1135
+ "text/plain": [
1136
+ " 0%| | 0/11 [00:00<?, ?ba/s]"
1137
+ ]
1138
+ },
1139
+ "metadata": {},
1140
+ "output_type": "display_data"
1141
  }
1142
+ ],
1143
+ "source": [
1144
+ "alpha_rows = common_voice_train.filter(lambda example: re.search('[a-zA-Z]',example['sentence']))"
1145
+ ]
1146
+ },
1147
+ {
1148
+ "cell_type": "code",
1149
+ "execution_count": 32,
1150
+ "metadata": {},
1151
  "outputs": [
1152
  {
1153
+ "name": "stdout",
1154
+ "output_type": "stream",
1155
+ "text": [
1156
+ "467\n",
1157
+ "10623\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1158
  ]
1159
  }
1160
  ],
1161
+ "source": [
1162
+ "print(len(alpha_rows))\n",
1163
+ "print(len(common_voice_train))\n"
1164
+ ]
1165
  },
1166
  {
1167
  "cell_type": "code",
1168
+ "execution_count": 35,
1169
  "metadata": {},
1170
+ "outputs": [
1171
+ {
1172
+ "name": "stdout",
1173
+ "output_type": "stream",
1174
+ "text": [
1175
+ "グループは、「Winters and Happy」でさくしゃとしてなまえがのることをシェアしています。\n",
1176
+ "Flowoodには、リモートコントロールレースようのおくないトラックがある。\n",
1177
+ "じもとのこうかんやHarrow Civic Centreのゆうじんにより、さらなるけいびがおこなわれました。\n",
1178
+ "かれはHeman Huntersでアコーディオンやドラムをえんそうします。\n",
1179
+ "これらはさまざまなWebベースのインターフェースをかいしてりようかのうになります。\n",
1180
+ "これでかそうのループbをかんがえることができます。\n",
1181
+ "のちにかれは、『Moth or Phoenix』というほんのなかで、これらのできごとについてかいた。\n",
1182
+ "ダリル・バンクスは、オハイオしゅうのColumbus College of Art and Designでまなびました。\n",
1183
+ "サンスクリットごでSaketというなまえは、てんごくにちかいばしょをいみします。\n",
1184
+ "Justinは、バンドのIntangibleのメンバーです。\n",
1185
+ "「U」を、「X」のとじたぶぶんくうかんであるとかていします。\n",
1186
+ "コンデはAcademy of Sciences and Letters のメンバーでもありました。\n",
1187
+ "Steersはホッチキススクールからもカルバーミリタリーアカデミーからもついほうされた。\n",
1188
+ "そののち、Bangladeshはあたまをうった。\n",
1189
+ "かれはけいざいがくしゃでありきょうじゅでもある、Cillian Ryanのちちおやだ。\n",
1190
+ "Webサイトのレビュー、コメント、およびひょうかをひょうじします。\n",
1191
+ "Arthurのけいれきはおおくのそしょうがしめしています。\n",
1192
+ "Aeroしゃによるたいりょうせいさんはありませんでした。\n",
1193
+ "これには、シングル「King of England」、「Somewhere」および「Clarinet Town」がしゅうろくされている。\n",
1194
+ "かれは、オーバーンだいがくモンゴメリーこうでWeilフェローだった。\n"
1195
+ ]
1196
+ }
1197
+ ],
1198
+ "source": [
1199
+ "for i in range(0,20):\n",
1200
+ " print(alpha_rows[i]['sentence'])"
1201
+ ]
1202
+ },
1203
+ {
1204
+ "cell_type": "code",
1205
+ "execution_count": 28,
1206
+ "metadata": {},
1207
+ "outputs": [
1208
+ {
1209
+ "name": "stdout",
1210
+ "output_type": "stream",
1211
+ "text": [
1212
+ "<re.Match object; span=(1, 2), match='a'>\n"
1213
+ ]
1214
+ }
1215
+ ],
1216
+ "source": [
1217
+ "import regex\n",
1218
+ "print(re.search('[a-zA-Z]', \"9a2\"))"
1219
+ ]
1220
  },
1221
  {
1222
  "cell_type": "code",
added_tokens.json CHANGED
@@ -1 +1 @@
1
- {"<s>": 250, "</s>": 251}
 
1
+ {"<s>": 179, "</s>": 180}
config.json CHANGED
@@ -76,7 +76,7 @@
76
  "num_hidden_layers": 24,
77
  "num_negatives": 100,
78
  "output_hidden_size": 1024,
79
- "pad_token_id": 249,
80
  "proj_codevector_dim": 768,
81
  "tdnn_dilation": [
82
  1,
@@ -102,6 +102,6 @@
102
  "torch_dtype": "float32",
103
  "transformers_version": "4.17.0.dev0",
104
  "use_weighted_layer_sum": false,
105
- "vocab_size": 252,
106
  "xvector_output_dim": 512
107
  }
 
76
  "num_hidden_layers": 24,
77
  "num_negatives": 100,
78
  "output_hidden_size": 1024,
79
+ "pad_token_id": 178,
80
  "proj_codevector_dim": 768,
81
  "tdnn_dilation": [
82
  1,
 
102
  "torch_dtype": "float32",
103
  "transformers_version": "4.17.0.dev0",
104
  "use_weighted_layer_sum": false,
105
+ "vocab_size": 181,
106
  "xvector_output_dim": 512
107
  }
mozilla-foundation_common_voice_8_0_ja_test_eval_results.txt CHANGED
@@ -1,2 +1,2 @@
1
- WER: 0.9855427046263345
2
- CER: 0.328342726455934
 
1
+ WER: 0.9490658362989324
2
+ CER: 0.233251654006371
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb027f63750d6848a5b8ceebf51f79a50bca790898a87e1723e5019ee5a484ab
3
- size 1262956849
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cec559d37e4950e12a68238d91702538827b3e3a578f44c9eea97dc5f9450578
3
+ size 1262665777
run_speech_recognition_ctc_bnb.py CHANGED
@@ -358,6 +358,8 @@ def main():
358
  else:
359
  model_args, data_args, training_args = parser.parse_args_into_dataclasses()
360
 
 
 
361
  # Detecting last checkpoint.
362
  last_checkpoint = None
363
  if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
@@ -432,7 +434,12 @@ def main():
432
 
433
  if data_args.max_eval_samples is not None:
434
  raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
 
 
 
435
 
 
 
436
  # 2. We remove some special characters from the datasets
437
  # that make training complicated and do not help in transcribing the speech
438
  # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
@@ -444,11 +451,14 @@ def main():
444
  # kakasi.setMode("K", "H") #Convert from katakana to hiragana
445
  conv = kakasi.getConverter()
446
 
 
447
  chars_to_ignore_regex = (
448
- f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else '[\,\?\!\-\;\:\"\“\%\‘\”\�\—\’\…\–\(\,\[\]\)\(\!]'
449
  )
450
- text_column_name = data_args.text_column_name
451
 
 
 
 
452
 
453
 
454
  def remove_special_characters(batch):
@@ -580,7 +590,7 @@ def main():
580
  max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
581
  min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
582
  audio_column_name = data_args.audio_column_name
583
- num_workers = data_args.preprocessing_num_workers
584
 
585
  # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
586
  phoneme_language = data_args.phoneme_language
 
358
  else:
359
  model_args, data_args, training_args = parser.parse_args_into_dataclasses()
360
 
361
+
362
+ num_workers = data_args.preprocessing_num_workers
363
  # Detecting last checkpoint.
364
  last_checkpoint = None
365
  if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
 
434
 
435
  if data_args.max_eval_samples is not None:
436
  raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
437
+
438
+ # ADDITIONS
439
+ # Drop examples whose sentence contains Latin letters (ASCII or full-width); digits are not matched
440
 
441
+ raw_datasets = raw_datasets.filter(lambda example: not re.search('[a-zA-ZA-Za-z]',example['sentence']))
442
+
443
  # 2. We remove some special characters from the datasets
444
  # that make training complicated and do not help in transcribing the speech
445
  # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
 
451
  # kakasi.setMode("K", "H") #Convert from katakana to hiragana
452
  conv = kakasi.getConverter()
453
 
454
+ # Default to set of extra characters seen in CV 8.
455
  chars_to_ignore_regex = (
456
+ f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else '[\,\?\!\-\;\:\"\“\%\‘\”\�\—\’\…\–\(\,\[\]\)\(\!\/\「\」\『\』]'
457
  )
 
458
 
459
+ # ADDITIONS END
460
+
461
+ text_column_name = data_args.text_column_name
462
 
463
 
464
  def remove_special_characters(batch):
 
590
  max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
591
  min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
592
  audio_column_name = data_args.audio_column_name
593
+
594
 
595
  # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
596
  phoneme_language = data_args.phoneme_language
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
speech_training_notebook.ipynb CHANGED
@@ -577,20 +577,59 @@
577
  },
578
  {
579
  "cell_type": "code",
580
- "execution_count": null,
581
  "metadata": {},
582
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
583
  "source": []
584
  },
585
  {
586
  "cell_type": "code",
587
- "execution_count": 16,
588
  "metadata": {},
589
  "outputs": [
590
  {
591
  "data": {
592
  "application/vnd.jupyter.widget-view+json": {
593
- "model_id": "c40f4d6b6bb74a56b2c570a3a53d7f4b",
594
  "version_major": 2,
595
  "version_minor": 0
596
  },
@@ -604,7 +643,7 @@
604
  {
605
  "data": {
606
  "application/vnd.jupyter.widget-view+json": {
607
- "model_id": "f69b6a3c0b54477ea15c56b02464bacd",
608
  "version_major": 2,
609
  "version_minor": 0
610
  },
@@ -617,13 +656,13 @@
617
  }
618
  ],
619
  "source": [
620
- "vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)\n",
621
- "vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)"
622
  ]
623
  },
624
  {
625
  "cell_type": "code",
626
- "execution_count": 17,
627
  "metadata": {},
628
  "outputs": [],
629
  "source": [
@@ -898,15 +937,15 @@
898
  },
899
  {
900
  "cell_type": "code",
901
- "execution_count": 18,
902
  "metadata": {},
903
  "outputs": [
904
  {
905
  "name": "stdout",
906
  "output_type": "stream",
907
  "text": [
908
- "249\n",
909
- "['ダ', 'た', 'P', 'か', 'よ', 'や', 'Q', 'を', 'F', 'h', 'E', 'ち', 'リ', 'ゲ', 'フ', 'め', 'タ', 'せ', 'b', '」', 'ば', 'ア', 'A', 'ャ', 'イ', 'ぶ', 'は', 'u', 'と', 'ノ', 'I', 'R', '「', 'G', 'ェ', 'く', '?', '〜', 'つ', 'こ', 'S', 'ぼ', 'ゼ', 's', 'U', 'き', 'ゥ', 'が', 'も', 'エ', 'ク', 'づ', 'グ', 'ブ', 'ゅ', 'ィ', 't', 'n', 'ロ', 'ー', '/', 'の', 'ケ', '・', 'J', 'お', 'む', 'P', 'ベ', 'h', 'プ', 'o', '&', '『', 'ソ', '.', 'ヴ', 'ド', 'み', 'Y', 'ガ', 'ょ', 'カ', 'C', 'ぜ', 'j', '.', 'ご', 'ど', 'ハ', 'ね', 'W', 'j', 'T', ' ', 'マ', '―', '-', 'デ', 'ゾ', 'ポ', 'K', 'ペ', 'ぱ', 'ふ', 'べ', 'ヒ', 'e', 'サ', 'N', 'X', 'ュ', 'k', 'り', 'U', 'ひ', 'げ', 'ゆ', 'ず', 'ゴ', 'a', 'ョ', 'ツ', '〇', 'え', 'F', 'B', '』', 'ッ', 'ん', 'ン', 'S', 'う', 'ぽ', ':', '々', 'ぞ', 'N', 'ヨ', 'ゃ', 'だ', 'L', 'ピ', 'ボ', 'w', 'ウ', 'あ', 'ヶ', 'ぬ', 'て', 'す', 'び', 'r', 'へ', '繫', 'バ', 'ぎ', 'ざ', 'A', 'チ', 'け', 'ぇ', 'わ', 'ス', 'p', 'ズ', 'y', 'し', '、', '!', 'G', '・', 'O', 'ぁ', 'd', 'g', 'ナ', 'ヅ', 'ほ', ')', 'D', 'ネ', 'パ', 'ム', 'ミ', '=', 'z', 'い', 'ろ', 'c', 'O', 'ザ', 'l', 'v', 'x', 'ヌ', 'に', 'ら', 'ヘ', '。', 'ギ', 'モ', 'D', 'キ', 'i', \"'\", 'M', 'で', 'ぴ', 'ぷ', 'ビ', 'H', 'f', 'ヤ', 'ユ', 'シ', 'Z', 'る', 'そ', 'テ', 'V', 'れ', '」', 'じ', 'ワ', 'レ', 'ォ', 'ジ', 'な', 'ニ', 'q', '&', 'っ', '「', 'ぢ', 'ル', 'さ', 'ぺ', 'm', 'ト', 'ホ', 'コ', 'オ', 'セ', 'ま', 'メ', 'ァ', 'ぐ', 'ラ']\n"
910
  ]
911
  }
912
  ],
@@ -1122,46 +1161,163 @@
1122
  },
1123
  {
1124
  "cell_type": "code",
1125
- "execution_count": 38,
1126
- "metadata": {
1127
- "collapsed": true,
1128
- "jupyter": {
1129
- "outputs_hidden": true
 
 
 
 
 
 
 
 
 
 
 
1130
  }
1131
- },
 
 
 
 
 
 
 
 
1132
  "outputs": [
1133
  {
1134
- "ename": "KeyboardInterrupt",
1135
- "evalue": "",
1136
- "output_type": "error",
1137
- "traceback": [
1138
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1139
- "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
1140
- "Input \u001b[0;32mIn [38]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m odd_example_texts \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m row \u001b[38;5;129;01min\u001b[39;00m common_voice_train:\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m letter \u001b[38;5;129;01min\u001b[39;00m odd_values:\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m letter \u001b[38;5;129;01min\u001b[39;00m row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msentence\u001b[39m\u001b[38;5;124m\"\u001b[39m]: \n",
1141
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:1664\u001b[0m, in \u001b[0;36mDataset._iter\u001b[0;34m(self, decoded)\u001b[0m\n\u001b[1;32m 1658\u001b[0m \u001b[38;5;124;03m\"\"\"Iterate through the examples.\u001b[39;00m\n\u001b[1;32m 1659\u001b[0m \n\u001b[1;32m 1660\u001b[0m \u001b[38;5;124;03mIf a formatting is set with :meth:`Dataset.set_format` rows will be returned with the\u001b[39;00m\n\u001b[1;32m 1661\u001b[0m \u001b[38;5;124;03mselected format.\u001b[39;00m\n\u001b[1;32m 1662\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1663\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_rows):\n\u001b[0;32m-> 1664\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_getitem\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1665\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1666\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecoded\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecoded\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1667\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
1142
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/arrow_dataset.py:1915\u001b[0m, in \u001b[0;36mDataset._getitem\u001b[0;34m(self, key, decoded, **kwargs)\u001b[0m\n\u001b[1;32m 1913\u001b[0m formatter \u001b[38;5;241m=\u001b[39m get_formatter(format_type, features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures, decoded\u001b[38;5;241m=\u001b[39mdecoded, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mformat_kwargs)\n\u001b[1;32m 1914\u001b[0m pa_subtable \u001b[38;5;241m=\u001b[39m query_table(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_data, key, indices\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_indices \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_indices \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m-> 1915\u001b[0m formatted_output \u001b[38;5;241m=\u001b[39m \u001b[43mformat_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1916\u001b[0m \u001b[43m \u001b[49m\u001b[43mpa_subtable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mformatter\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mformatter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mformat_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mformat_columns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_all_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_all_columns\u001b[49m\n\u001b[1;32m 1917\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1918\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m formatted_output\n",
1143
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:533\u001b[0m, in \u001b[0;36mformat_table\u001b[0;34m(table, key, formatter, format_columns, output_all_columns)\u001b[0m\n\u001b[1;32m 531\u001b[0m python_formatter \u001b[38;5;241m=\u001b[39m PythonFormatter(features\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 532\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m format_columns \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 533\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mformatter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquery_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquery_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumn\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 535\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m format_columns:\n",
1144
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:282\u001b[0m, in \u001b[0;36mFormatter.__call__\u001b[0;34m(self, pa_table, query_type)\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, pa_table: pa\u001b[38;5;241m.\u001b[39mTable, query_type: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[RowFormat, ColumnFormat, BatchFormat]:\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrow\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 282\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mformat_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpa_table\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 283\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m query_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolumn\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mformat_column(pa_table)\n",
1145
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:313\u001b[0m, in \u001b[0;36mPythonFormatter.format_row\u001b[0;34m(self, pa_table)\u001b[0m\n\u001b[1;32m 311\u001b[0m row \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpython_arrow_extractor()\u001b[38;5;241m.\u001b[39mextract_row(pa_table)\n\u001b[1;32m 312\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecoded:\n\u001b[0;32m--> 313\u001b[0m row \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpython_features_decoder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_row\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 314\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m row\n",
1146
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/formatting/formatting.py:222\u001b[0m, in \u001b[0;36mPythonFeaturesDecoder.decode_row\u001b[0;34m(self, row)\u001b[0m\n\u001b[1;32m 221\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_row\u001b[39m(\u001b[38;5;28mself\u001b[39m, row: \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mdict\u001b[39m:\n\u001b[0;32m--> 222\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeatures \u001b[38;5;28;01melse\u001b[39;00m row\n",
1147
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1318\u001b[0m, in \u001b[0;36mFeatures.decode_example\u001b[0;34m(self, example)\u001b[0m\n\u001b[1;32m 1308\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_example\u001b[39m(\u001b[38;5;28mself\u001b[39m, example: \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 1309\u001b[0m \u001b[38;5;124;03m\"\"\"Decode example with custom feature decoding.\u001b[39;00m\n\u001b[1;32m 1310\u001b[0m \n\u001b[1;32m 1311\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1315\u001b[0m \u001b[38;5;124;03m :obj:`dict[str, Any]`\u001b[39;00m\n\u001b[1;32m 1316\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1318\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[1;32m 1319\u001b[0m column_name: decode_nested_example(feature, value)\n\u001b[1;32m 1320\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_column_requires_decoding[column_name]\n\u001b[1;32m 1321\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m value\n\u001b[1;32m 1322\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column_name, (feature, value) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(\n\u001b[1;32m 1323\u001b[0m {key: value \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems() \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m example}, example\n\u001b[1;32m 1324\u001b[0m )\n\u001b[1;32m 1325\u001b[0m }\n",
1148
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1319\u001b[0m, in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1308\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode_example\u001b[39m(\u001b[38;5;28mself\u001b[39m, example: \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 1309\u001b[0m \u001b[38;5;124;03m\"\"\"Decode example with custom feature decoding.\u001b[39;00m\n\u001b[1;32m 1310\u001b[0m \n\u001b[1;32m 1311\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1315\u001b[0m \u001b[38;5;124;03m :obj:`dict[str, Any]`\u001b[39;00m\n\u001b[1;32m 1316\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 1318\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[0;32m-> 1319\u001b[0m column_name: \u001b[43mdecode_nested_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1320\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_column_requires_decoding[column_name]\n\u001b[1;32m 1321\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m value\n\u001b[1;32m 1322\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m column_name, (feature, value) \u001b[38;5;129;01min\u001b[39;00m utils\u001b[38;5;241m.\u001b[39mzip_dict(\n\u001b[1;32m 1323\u001b[0m {key: value \u001b[38;5;28;01mfor\u001b[39;00m key, value \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems() \u001b[38;5;28;01mif\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m example}, example\n\u001b[1;32m 1324\u001b[0m )\n\u001b[1;32m 1325\u001b[0m }\n",
1149
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/features.py:1056\u001b[0m, in \u001b[0;36mdecode_nested_example\u001b[0;34m(schema, obj)\u001b[0m\n\u001b[1;32m 1054\u001b[0m \u001b[38;5;66;03m# Object with special decoding:\u001b[39;00m\n\u001b[1;32m 1055\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(schema, (Audio, Image)):\n\u001b[0;32m-> 1056\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mschema\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode_example\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m obj \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1057\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\n",
1150
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/audio.py:97\u001b[0m, in \u001b[0;36mAudio.decode_example\u001b[0;34m(self, value)\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn audio sample should have one of \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpath\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m or \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbytes\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m but both are None in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m path \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m path\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmp3\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m---> 97\u001b[0m array, sampling_rate \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_decode_mp3\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file:\n",
1151
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/datasets/features/audio.py:183\u001b[0m, in \u001b[0;36mAudio._decode_mp3\u001b[0;34m(self, path_or_file)\u001b[0m\n\u001b[1;32m 181\u001b[0m array \u001b[38;5;241m=\u001b[39m array\u001b[38;5;241m.\u001b[39mnumpy()\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmono:\n\u001b[0;32m--> 183\u001b[0m array \u001b[38;5;241m=\u001b[39m \u001b[43marray\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmean\u001b[49m\u001b[43m(\u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m array, sampling_rate\n",
1152
- "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/numpy/core/_methods.py:154\u001b[0m, in \u001b[0;36m_mean\u001b[0;34m(a, axis, dtype, out, keepdims)\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# Cast bool, unsigned int, and int to float64 by default\u001b[39;00m\n\u001b[1;32m 153\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dtype \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 154\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(arr\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, (\u001b[43mnt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minteger\u001b[49m, nt\u001b[38;5;241m.\u001b[39mbool_)):\n\u001b[1;32m 155\u001b[0m dtype \u001b[38;5;241m=\u001b[39m mu\u001b[38;5;241m.\u001b[39mdtype(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mf8\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 156\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(arr\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, nt\u001b[38;5;241m.\u001b[39mfloat16):\n",
1153
- "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
 
 
 
 
 
 
 
 
 
 
 
1154
  ]
1155
  }
1156
  ],
1157
- "source": []
 
 
 
 
1158
  },
1159
  {
1160
  "cell_type": "code",
1161
- "execution_count": null,
1162
  "metadata": {},
1163
- "outputs": [],
1164
- "source": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1165
  },
1166
  {
1167
  "cell_type": "code",
 
577
  },
578
  {
579
  "cell_type": "code",
580
+ "execution_count": 36,
581
  "metadata": {},
582
+ "outputs": [
583
+ {
584
+ "data": {
585
+ "application/vnd.jupyter.widget-view+json": {
586
+ "model_id": "c433125fde60482ab48e7db72a0759a0",
587
+ "version_major": 2,
588
+ "version_minor": 0
589
+ },
590
+ "text/plain": [
591
+ " 0%| | 0/11 [00:00<?, ?ba/s]"
592
+ ]
593
+ },
594
+ "metadata": {},
595
+ "output_type": "display_data"
596
+ }
597
+ ],
598
+ "source": [
599
+ "common_voice_train_no_alpha = common_voice_train.filter(lambda example: not re.search('[a-zA-Z]',example['sentence']))\n"
600
+ ]
601
+ },
602
+ {
603
+ "cell_type": "code",
604
+ "execution_count": 38,
605
+ "metadata": {},
606
+ "outputs": [
607
+ {
608
+ "data": {
609
+ "application/vnd.jupyter.widget-view+json": {
610
+ "model_id": "7eb50868575b4ebb8143c46761a96550",
611
+ "version_major": 2,
612
+ "version_minor": 0
613
+ },
614
+ "text/plain": [
615
+ " 0%| | 0/5 [00:00<?, ?ba/s]"
616
+ ]
617
+ },
618
+ "metadata": {},
619
+ "output_type": "display_data"
620
+ }
621
+ ],
622
  "source": []
623
  },
624
  {
625
  "cell_type": "code",
626
+ "execution_count": 39,
627
  "metadata": {},
628
  "outputs": [
629
  {
630
  "data": {
631
  "application/vnd.jupyter.widget-view+json": {
632
+ "model_id": "208cd0b1845341ff91372fb784096860",
633
  "version_major": 2,
634
  "version_minor": 0
635
  },
 
643
  {
644
  "data": {
645
  "application/vnd.jupyter.widget-view+json": {
646
+ "model_id": "6405ced5205448bd8d3db8c188698403",
647
  "version_major": 2,
648
  "version_minor": 0
649
  },
 
656
  }
657
  ],
658
  "source": [
659
+ "vocab_train = common_voice_train_no_alpha.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)\n",
660
+ "vocab_test = common_voice_test_no_alpha.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)"
661
  ]
662
  },
663
  {
664
  "cell_type": "code",
665
+ "execution_count": 40,
666
  "metadata": {},
667
  "outputs": [],
668
  "source": [
 
937
  },
938
  {
939
  "cell_type": "code",
940
+ "execution_count": 41,
941
  "metadata": {},
942
  "outputs": [
943
  {
944
  "name": "stdout",
945
  "output_type": "stream",
946
  "text": [
947
+ "194\n",
948
+ "['ダ', 'た', 'か', 'よ', 'や', 'を', 'F', 'h', 'ち', 'リ', 'ゲ', 'フ', 'め', 'タ', 'せ', '」', 'ば', 'ア', 'ャ', 'イ', 'ぶ', 'は', 'と', 'ノ', 'ェ', 'く', '?', '〜', 'つ', 'こ', 'S', 'ぼ', 'ゼ', 'U', 'き', 'ゥ', 'が', 'も', 'エ', 'ク', 'づ', 'グ', 'ブ', 'ゅ', 'ィ', 'ロ', 'ー', '/', 'の', 'ケ', '・', 'お', 'む', 'P', 'ベ', 'プ', '『', 'ソ', '.', 'ヴ', 'ド', 'み', 'ガ', 'ょ', 'カ', 'ぜ', '.', 'ご', 'ど', 'ハ', 'ね', 'j', ' ', 'マ', '―', '-', 'デ', 'ゾ', 'ポ', 'ペ', 'ぱ', 'ふ', 'べ', 'ヒ', 'サ', 'N', 'ュ', 'り', 'ひ', 'げ', 'ゆ', 'ず', 'ゴ', 'ョ', 'ツ', '〇', 'え', '』', 'ッ', 'ん', 'ン', 'う', 'ぽ', ':', '々', 'ぞ', 'ヨ', 'ゃ', 'だ', 'ピ', 'ボ', 'ウ', 'あ', 'ヶ', 'ぬ', 'て', 'す', 'び', 'へ', '繫', 'バ', 'ぎ', 'ざ', 'A', 'チ', 'け', 'ぇ', 'わ', 'ス', 'ズ', 'し', '、', '!', 'G', '・', 'ぁ', 'ナ', 'ヅ', 'ほ', ')', 'ネ', 'パ', 'ム', 'ミ', '=', '', 'い', 'ろ', 'ザ', 'ヌ', 'に', 'ら', 'ヘ', '。', 'ギ', 'モ', 'D', 'キ', \"'\", 'で', 'ぴ', 'ぷ', 'ビ', 'ヤ', 'ユ', 'シ', 'る', 'そ', 'テ', 'れ', 'じ', 'ワ', 'レ', 'ォ', 'ジ', 'な', 'ニ', '&', 'っ', '「', 'ぢ', 'ル', 'さ', 'ぺ', 'ト', 'ホ', 'コ', 'オ', 'セ', 'ま', 'メ', 'ァ', 'ぐ', 'ラ']\n"
949
  ]
950
  }
951
  ],
 
1161
  },
1162
  {
1163
  "cell_type": "code",
1164
+ "execution_count": 30,
1165
+ "metadata": {},
1166
+ "outputs": [
1167
+ {
1168
+ "data": {
1169
+ "application/vnd.jupyter.widget-view+json": {
1170
+ "model_id": "501e1eb7f6a545c496873545b992c2ad",
1171
+ "version_major": 2,
1172
+ "version_minor": 0
1173
+ },
1174
+ "text/plain": [
1175
+ " 0%| | 0/11 [00:00<?, ?ba/s]"
1176
+ ]
1177
+ },
1178
+ "metadata": {},
1179
+ "output_type": "display_data"
1180
  }
1181
+ ],
1182
+ "source": [
1183
+ "alpha_rows = common_voice_train.filter(lambda example: re.search('[a-zA-Z]',example['sentence']))\n"
1184
+ ]
1185
+ },
1186
+ {
1187
+ "cell_type": "code",
1188
+ "execution_count": 42,
1189
+ "metadata": {},
1190
  "outputs": [
1191
  {
1192
+ "data": {
1193
+ "application/vnd.jupyter.widget-view+json": {
1194
+ "model_id": "75d9652cda2c4d99adca0e0e455dd005",
1195
+ "version_major": 2,
1196
+ "version_minor": 0
1197
+ },
1198
+ "text/plain": [
1199
+ " 0%| | 0/11 [00:00<?, ?ba/s]"
1200
+ ]
1201
+ },
1202
+ "metadata": {},
1203
+ "output_type": "display_data"
1204
+ }
1205
+ ],
1206
+ "source": [
1207
+ "odd_alpha_rows = common_voice_train.filter(lambda example: re.search('[A-Uhj]',example['sentence']))\n",
1208
+ "\n"
1209
+ ]
1210
+ },
1211
+ {
1212
+ "cell_type": "code",
1213
+ "execution_count": 43,
1214
+ "metadata": {},
1215
+ "outputs": [
1216
+ {
1217
+ "name": "stdout",
1218
+ "output_type": "stream",
1219
+ "text": [
1220
+ "467\n",
1221
+ "10623\n",
1222
+ "4\n"
1223
  ]
1224
  }
1225
  ],
1226
+ "source": [
1227
+ "print(len(alpha_rows))\n",
1228
+ "print(len(common_voice_train))\n",
1229
+ "print(len(odd_alpha_rows))\n"
1230
+ ]
1231
  },
1232
  {
1233
  "cell_type": "code",
1234
+ "execution_count": 51,
1235
  "metadata": {},
1236
+ "outputs": [
1237
+ {
1238
+ "data": {
1239
+ "application/vnd.jupyter.widget-view+json": {
1240
+ "model_id": "50779e11b97f42d0aec1c17121b8087a",
1241
+ "version_major": 2,
1242
+ "version_minor": 0
1243
+ },
1244
+ "text/plain": [
1245
+ " 0%| | 0/11 [00:00<?, ?ba/s]"
1246
+ ]
1247
+ },
1248
+ "metadata": {},
1249
+ "output_type": "display_data"
1250
+ }
1251
+ ],
1252
+ "source": [
1253
+ "# Remove alphanumeric characters\n",
1254
+ "def has_no_alpha_numeric_characters(sentence):\n",
1255
+ " return re.search('[a-zA-Z]]',sentence)\n",
1256
+ "# common_voice_train\n",
1257
+ "common_voice_train__filter = common_voice_train.filter(\n",
1258
+ " has_no_alpha_numeric_characters,\n",
1259
+ "# num_proc=num_workers,\n",
1260
+ "common_voice_train.filter(lambda example: re.search('[A-Uhj]',example['sentence']))\n",
1261
+ ")"
1262
+ ]
1263
+ },
1264
+ {
1265
+ "cell_type": "code",
1266
+ "execution_count": 52,
1267
+ "metadata": {},
1268
+ "outputs": [
1269
+ {
1270
+ "name": "stdout",
1271
+ "output_type": "stream",
1272
+ "text": [
1273
+ "0\n",
1274
+ "10623\n"
1275
+ ]
1276
+ }
1277
+ ],
1278
+ "source": [
1279
+ "print(len(common_voice_train__filter))\n",
1280
+ "print(len(common_voice_train))"
1281
+ ]
1282
+ },
1283
+ {
1284
+ "cell_type": "code",
1285
+ "execution_count": 44,
1286
+ "metadata": {},
1287
+ "outputs": [
1288
+ {
1289
+ "name": "stdout",
1290
+ "output_type": "stream",
1291
+ "text": [
1292
+ "アーケードにはhジェネシーコミュニティカレッジのぶんこうがある。\n",
1293
+ "かのじょjはひだりききのピッチングをしていたものの、ゴルフはみぎききをしゅうとくした。\n",
1294
+ "ジャック・マッカートニーというせいともスーパーAレベルしけんをうけました。\n",
1295
+ "でも、ところどころ、UFOらしききょだいなえんばんのおもかげはのこっている。\n"
1296
+ ]
1297
+ }
1298
+ ],
1299
+ "source": [
1300
+ "for i in range(0,4):\n",
1301
+ " print(odd_alpha_rows[i]['sentence'])"
1302
+ ]
1303
+ },
1304
+ {
1305
+ "cell_type": "code",
1306
+ "execution_count": 28,
1307
+ "metadata": {},
1308
+ "outputs": [
1309
+ {
1310
+ "name": "stdout",
1311
+ "output_type": "stream",
1312
+ "text": [
1313
+ "<re.Match object; span=(1, 2), match='a'>\n"
1314
+ ]
1315
+ }
1316
+ ],
1317
+ "source": [
1318
+ "import regex\n",
1319
+ "print(re.search('[a-zA-Z]', \"9a2\"))"
1320
+ ]
1321
  },
1322
  {
1323
  "cell_type": "code",
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:093731fe91be156fc7e4a872c63f46f6ccef7059d892048324989ebf1c39c91e
3
  size 2991
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a128c5e47bae3a7da28bb745038c537c781db028b85a0a4f86e721372d51cc3
3
  size 2991
vocab.json CHANGED
@@ -1 +1 @@
1
- {"&": 1, "'": 2, ".": 3, "/": 4, "A": 5, "B": 6, "C": 7, "D": 8, "E": 9, "F": 10, "G": 11, "H": 12, "I": 13, "J": 14, "K": 15, "L": 16, "M": 17, "N": 18, "O": 19, "P": 20, "Q": 21, "R": 22, "S": 23, "T": 24, "U": 25, "V": 26, "W": 27, "X": 28, "Y": 29, "Z": 30, "a": 31, "b": 32, "c": 33, "d": 34, "e": 35, "f": 36, "g": 37, "h": 38, "i": 39, "j": 40, "k": 41, "l": 42, "m": 43, "n": 44, "o": 45, "p": 46, "q": 47, "r": 48, "s": 49, "t": 50, "u": 51, "v": 52, "w": 53, "x": 54, "y": 55, "z": 56, "": 57, "": 58, "": 59, "": 60, "": 61, "": 62, "": 63, "": 64, "": 65, "": 66, "": 67, "": 68, "": 69, "": 70, "": 71, "": 72, "": 73, "": 74, "": 75, "": 76, "": 77, "": 78, "": 79, "": 80, "": 81, "": 82, "": 83, "": 84, "": 85, "": 86, "": 87, "": 88, "": 89, "": 90, "": 91, "": 92, "": 93, "": 94, "": 95, "": 96, "": 97, "": 98, "": 99, "": 100, "": 101, "": 102, "": 103, "": 104, "": 105, "": 106, "": 107, "": 108, "": 109, "": 110, "": 111, "": 112, "": 113, "": 114, "": 115, "": 116, "": 117, "": 118, "": 119, "": 120, "": 121, "": 122, "": 123, "": 124, "": 125, "": 126, "": 127, "": 128, "": 129, "": 130, "": 131, "": 132, "": 133, "": 134, "": 135, "": 136, "": 137, "": 138, "": 139, "": 140, "": 141, "": 142, "": 143, "": 144, "": 145, "": 146, "": 147, "": 148, "": 149, "": 150, "": 151, "": 152, "": 153, "": 154, "": 155, "": 156, "": 157, "": 158, "": 159, "": 160, "": 161, "": 162, "": 163, "": 164, "": 165, "": 166, "": 167, "": 168, "": 169, "": 170, "": 171, "": 172, "": 173, "": 174, "": 175, "": 176, "ッ": 177, "ツ": 178, "ヅ": 179, "テ": 180, "デ": 181, "ト": 182, "ド": 183, "ナ": 184, "ニ": 185, "ヌ": 186, "ネ": 187, "ノ": 188, "ハ": 189, "バ": 190, "パ": 191, "ヒ": 192, "ビ": 193, "ピ": 194, "フ": 195, "ブ": 196, "プ": 197, "ヘ": 198, "ベ": 199, "ペ": 200, "ホ": 201, "ボ": 202, "ポ": 203, "マ": 204, "ミ": 205, "ム": 206, "メ": 207, "モ": 208, "ャ": 209, "ヤ": 210, "ュ": 211, "ユ": 212, "ョ": 213, "ヨ": 214, "ラ": 215, "リ": 216, "ル": 217, "レ": 218, "ロ": 219, "ワ": 220, "ン": 221, "ヴ": 222, "ヶ": 
223, "・": 224, "ー": 225, "繫": 226, "&": 227, ")": 228, "-": 229, ".": 230, ":": 231, "=": 232, "?": 233, "A": 234, "D": 235, "F": 236, "G": 237, "N": 238, "O": 239, "P": 240, "S": 241, "U": 242, "h": 243, "j": 244, "「": 245, "」": 246, "・": 247, "|": 0, "[UNK]": 248, "[PAD]": 249}
 
1
+ {"'": 1, ".": 2, "": 3, "": 4, "": 5, "": 6, "": 7, "": 8, "": 9, "": 10, "": 11, "": 12, "": 13, "": 14, "": 15, "": 16, "": 17, "": 18, "": 19, "": 20, "": 21, "": 22, "": 23, "": 24, "": 25, "": 26, "": 27, "": 28, "": 29, "": 30, "": 31, "": 32, "": 33, "": 34, "": 35, "": 36, "": 37, "": 38, "": 39, "": 40, "": 41, "": 42, "": 43, "": 44, "": 45, "": 46, "": 47, "": 48, "": 49, "": 50, "": 51, "": 52, "": 53, "": 54, "": 55, "": 56, "": 57, "": 58, "": 59, "": 60, "": 61, "": 62, "": 63, "": 64, "": 65, "": 66, "": 67, "": 68, "": 69, "": 70, "": 71, "": 72, "": 73, "": 74, "": 75, "": 76, "": 77, "": 78, "": 79, "": 80, "": 81, "": 82, "": 83, "": 84, "": 85, "": 86, "": 87, "": 88, "": 89, "": 90, "": 91, "": 92, "": 93, "": 94, "": 95, "": 96, "": 97, "": 98, "": 99, "": 100, "": 101, "": 102, "": 103, "": 104, "": 105, "": 106, "": 107, "": 108, "": 109, "": 110, "": 111, "": 112, "": 113, "": 114, "": 115, "": 116, "": 117, "": 118, "": 119, "": 120, "": 121, "": 122, "": 123, "": 124, "": 125, "": 126, "": 127, "": 128, "": 129, "": 130, "": 131, "": 132, "": 133, "": 134, "": 135, "": 136, "": 137, "": 138, "": 139, "": 140, "": 141, "": 142, "": 143, "": 144, "": 145, "": 146, "": 147, "": 148, "": 149, "": 150, "": 151, "": 152, "": 153, "": 154, "": 155, "": 156, "": 157, "": 158, "": 159, "": 160, "": 161, "": 162, "": 163, "": 164, "": 165, "": 166, "": 167, "": 168, "": 169, "": 170, "": 171, "": 172, "": 173, "": 174, "": 175, "": 176, "|": 0, "[UNK]": 177, "[PAD]": 178}