Commit 20c1366
Josh Cole committed
Parent(s): 6e2f9e3

initial commit
.gitignore
ADDED
@@ -0,0 +1 @@
+.ipynb_checkpoints/
Generate.ipynb
ADDED
@@ -0,0 +1,562 @@
+{
+"cells": [
+{
+"cell_type": "code",
+"execution_count": 1,
+"id": "5205c0d3-2272-4a43-9345-9553af479fe6",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "50bf0f78f5f044dd8be6b181b2cb0949",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+}
+],
+"source": [
+"from huggingface_hub import notebook_login\n",
+"notebook_login()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 3,
+"id": "38bdf299-f60d-43ea-9230-df1be861e406",
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"Using custom data configuration sharpcoder--bjorn_training-8c32a3534606a113\n",
+"Reusing dataset parquet (/home/sharpcoder/.cache/huggingface/datasets/sharpcoder___parquet/sharpcoder--bjorn_training-8c32a3534606a113/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)\n"
+]
+},
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "c495fe2f4a44499fb32751d60ac1488e",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+" 0%| | 0/1 [00:00<?, ?it/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+}
+],
+"source": [
+"from datasets import load_dataset, load_metric\n",
+"ds = load_dataset(\"sharpcoder/bjorn_training\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 38,
+"id": "75b32151-eb53-4476-8c1f-7e6da72e173e",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "0f019d1f864b4b56af5c828588fd89bf",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+" 0%| | 0/1 [00:00<?, ?ba/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+}
+],
+"source": [
+"def extract_all_chars(batch):\n",
+" all_text = \" \".join(batch[\"text\"])\n",
+" vocab = list(set(all_text))\n",
+" return {\"vocab\": [vocab], \"all_text\": [all_text]}\n",
+"\n",
+"vocabs = ds.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=ds.column_names[\"train\"])\n",
+"vocab_list = list(set(vocabs[\"train\"][\"vocab\"][0]) | set(vocabs[\"train\"][\"vocab\"][0]))\n",
+"vocab_dict = {v: k for k, v in enumerate(vocab_list)}\n",
+"vocab_dict[\"|\"] = vocab_dict[\" \"]\n",
+"del vocab_dict[\" \"]\n",
+"vocab_dict[\"[UNK]\"] = len(vocab_dict)\n",
+"vocab_dict[\"[PAD]\"] = len(vocab_dict)\n",
+"len(vocab_dict)\n",
+"import json\n",
+"with open('vocab.json', 'w') as vocab_file:\n",
+" json.dump(vocab_dict, vocab_file)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 39,
+"id": "d214872e-d4b1-4aa7-be07-8a1591961968",
+"metadata": {},
+"outputs": [],
+"source": [
+"from transformers import Wav2Vec2CTCTokenizer\n",
+"from transformers import Wav2Vec2FeatureExtractor\n",
+"from transformers import Wav2Vec2Processor\n",
+"\n",
+"tokenizer = Wav2Vec2CTCTokenizer(\"./vocab.json\", unk_token=\"[UNK]\", pad_token=\"[PAD]\", word_delimiter_token=\"|\")\n",
+"feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)\n",
+"processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 40,
+"id": "e906c45f-6971-43c3-ad0a-b13363100bdf",
+"metadata": {},
+"outputs": [],
+"source": [
+"def prepare_dataset(batch):\n",
+" audio = batch[\"audio\"]\n",
+"\n",
+" # batched output is \"un-batched\" to ensure mapping is correct\n",
+" batch[\"input_values\"] = processor(audio[\"array\"], sampling_rate=audio[\"sample_rate\"]).input_values[0]\n",
+" batch[\"input_length\"] = len(batch[\"input_values\"])\n",
+" \n",
+" with processor.as_target_processor():\n",
+" batch[\"labels\"] = processor(batch[\"text\"]).input_ids\n",
+" return batch"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 41,
+"id": "8c083db6-eab5-4f25-9a08-eab50d2d30ac",
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.\n"
+]
+},
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "3b36aee8ffc44253a8381da4d0f4c362",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+" 0%| | 0/1 [00:00<?, ?ex/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+}
+],
+"source": [
+"ds_prepared = ds.map(prepare_dataset, remove_columns=ds.column_names[\"train\"], num_proc=4)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 42,
+"id": "50c9a6ad-9e79-4a1c-a5ce-6e1f73a96e4d",
+"metadata": {},
+"outputs": [],
+"source": [
+"import torch\n",
+"\n",
+"from dataclasses import dataclass, field\n",
+"from typing import Any, Dict, List, Optional, Union\n",
+"\n",
+"@dataclass\n",
+"class DataCollatorCTCWithPadding:\n",
+" \"\"\"\n",
+" Data collator that will dynamically pad the inputs received.\n",
+" Args:\n",
+" processor (:class:`~transformers.Wav2Vec2Processor`)\n",
+" The processor used for proccessing the data.\n",
+" padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):\n",
+" Select a strategy to pad the returned sequences (according to the model's padding side and padding index)\n",
+" among:\n",
+" * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single\n",
+" sequence if provided).\n",
+" * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the\n",
+" maximum acceptable input length for the model if that argument is not provided.\n",
+" * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of\n",
+" different lengths).\n",
+" \"\"\"\n",
+"\n",
+" processor: Wav2Vec2Processor\n",
+" padding: Union[bool, str] = True\n",
+"\n",
+" def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
+" # split inputs and labels since they have to be of different lenghts and need\n",
+" # different padding methods\n",
+" input_features = [{\"input_values\": feature[\"input_values\"]} for feature in features]\n",
+" label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
+"\n",
+" batch = self.processor.pad(\n",
+" input_features,\n",
+" padding=self.padding,\n",
+" return_tensors=\"pt\",\n",
+" )\n",
+" with self.processor.as_target_processor():\n",
+" labels_batch = self.processor.pad(\n",
+" label_features,\n",
+" padding=self.padding,\n",
+" return_tensors=\"pt\",\n",
+" )\n",
+"\n",
+" # replace padding with -100 to ignore loss correctly\n",
+" labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n",
+"\n",
+" batch[\"labels\"] = labels\n",
+"\n",
+" return batch\n",
+" \n",
+"def compute_metrics(pred):\n",
+" pred_logits = pred.predictions\n",
+" pred_ids = np.argmax(pred_logits, axis=-1)\n",
+"\n",
+" pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id\n",
+"\n",
+" pred_str = processor.batch_decode(pred_ids)\n",
+" # we do not want to group tokens when computing the metrics\n",
+" label_str = processor.batch_decode(pred.label_ids, group_tokens=False)\n",
+"\n",
+" wer = wer_metric.compute(predictions=pred_str, references=label_str)\n",
+"\n",
+" return {\"wer\": wer}"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 43,
+"id": "1025ffdf-cb83-4895-89ab-a98bc3fab642",
+"metadata": {},
+"outputs": [],
+"source": [
+"data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)\n",
+"wer_metric = load_metric(\"wer\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 44,
+"id": "71351cf4-6d00-40ae-89cc-cedb87073625",
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"loading configuration file https://huggingface.co/facebook/wav2vec2-base/resolve/main/config.json from cache at /home/sharpcoder/.cache/huggingface/transformers/c7746642f045322fd01afa31271dd490e677ea11999e68660a92619ec7c892b4.ce1f96bfaf3d7475cb8187b9668c7f19437ade45fb9ceb78d2b06a2cec198015\n",
+"/home/sharpcoder/.local/lib/python3.10/site-packages/transformers/configuration_utils.py:336: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`.\n",
+" warnings.warn(\n",
+"Model config Wav2Vec2Config {\n",
+" \"activation_dropout\": 0.0,\n",
+" \"apply_spec_augment\": true,\n",
+" \"architectures\": [\n",
+" \"Wav2Vec2ForPreTraining\"\n",
+" ],\n",
+" \"attention_dropout\": 0.1,\n",
+" \"bos_token_id\": 1,\n",
+" \"classifier_proj_size\": 256,\n",
+" \"codevector_dim\": 256,\n",
+" \"contrastive_logits_temperature\": 0.1,\n",
+" \"conv_bias\": false,\n",
+" \"conv_dim\": [\n",
+" 512,\n",
+" 512,\n",
+" 512,\n",
+" 512,\n",
+" 512,\n",
+" 512,\n",
+" 512\n",
+" ],\n",
+" \"conv_kernel\": [\n",
+" 10,\n",
+" 3,\n",
+" 3,\n",
+" 3,\n",
+" 3,\n",
+" 2,\n",
+" 2\n",
+" ],\n",
+" \"conv_stride\": [\n",
+" 5,\n",
+" 2,\n",
+" 2,\n",
+" 2,\n",
+" 2,\n",
+" 2,\n",
+" 2\n",
+" ],\n",
+" \"ctc_loss_reduction\": \"mean\",\n",
+" \"ctc_zero_infinity\": false,\n",
+" \"diversity_loss_weight\": 0.1,\n",
+" \"do_stable_layer_norm\": false,\n",
+" \"eos_token_id\": 2,\n",
+" \"feat_extract_activation\": \"gelu\",\n",
+" \"feat_extract_norm\": \"group\",\n",
+" \"feat_proj_dropout\": 0.1,\n",
+" \"feat_quantizer_dropout\": 0.0,\n",
+" \"final_dropout\": 0.0,\n",
+" \"freeze_feat_extract_train\": true,\n",
+" \"gradient_checkpointing\": true,\n",
+" \"hidden_act\": \"gelu\",\n",
+" \"hidden_dropout\": 0.1,\n",
+" \"hidden_size\": 768,\n",
+" \"initializer_range\": 0.02,\n",
+" \"intermediate_size\": 3072,\n",
+" \"layer_norm_eps\": 1e-05,\n",
+" \"layerdrop\": 0.0,\n",
+" \"mask_channel_length\": 10,\n",
+" \"mask_channel_min_space\": 1,\n",
+" \"mask_channel_other\": 0.0,\n",
+" \"mask_channel_prob\": 0.0,\n",
+" \"mask_channel_selection\": \"static\",\n",
+" \"mask_feature_length\": 10,\n",
+" \"mask_feature_prob\": 0.0,\n",
+" \"mask_time_length\": 10,\n",
+" \"mask_time_min_space\": 1,\n",
+" \"mask_time_other\": 0.0,\n",
+" \"mask_time_prob\": 0.05,\n",
+" \"mask_time_selection\": \"static\",\n",
+" \"model_type\": \"wav2vec2\",\n",
+" \"no_mask_channel_overlap\": false,\n",
+" \"no_mask_time_overlap\": false,\n",
+" \"num_attention_heads\": 12,\n",
+" \"num_codevector_groups\": 2,\n",
+" \"num_codevectors_per_group\": 320,\n",
+" \"num_conv_pos_embedding_groups\": 16,\n",
+" \"num_conv_pos_embeddings\": 128,\n",
+" \"num_feat_extract_layers\": 7,\n",
+" \"num_hidden_layers\": 12,\n",
+" \"num_negatives\": 100,\n",
+" \"pad_token_id\": 19,\n",
+" \"proj_codevector_dim\": 256,\n",
+" \"transformers_version\": \"4.11.3\",\n",
+" \"use_weighted_layer_sum\": false,\n",
+" \"vocab_size\": 32\n",
+"}\n",
+"\n",
+"loading weights file https://huggingface.co/facebook/wav2vec2-base/resolve/main/pytorch_model.bin from cache at /home/sharpcoder/.cache/huggingface/transformers/ef45231897ce572a660ebc5a63d3702f1a6041c4c5fb78cbec330708531939b3.fcae05302a685f7904c551c8ea571e8bc2a2c4a1777ea81ad66e47f7883a650a\n",
+"Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForCTC: ['project_q.bias', 'project_hid.bias', 'quantizer.codevectors', 'project_q.weight', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_hid.weight']\n",
+"- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+"- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+"Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']\n",
+"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+]
+}
+],
+"source": [
+"from transformers import Wav2Vec2ForCTC\n",
+"\n",
+"model = Wav2Vec2ForCTC.from_pretrained(\n",
+" \"facebook/wav2vec2-base\",\n",
+" ctc_loss_reduction=\"mean\", \n",
+" pad_token_id=processor.tokenizer.pad_token_id,\n",
+")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 45,
+"id": "208eac7d-9fdd-4c82-b46f-25c1a1f246ee",
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"PyTorch: setting up devices\n",
+"The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n"
+]
+}
+],
+"source": [
+"from transformers import TrainingArguments\n",
+"from transformers import Trainer\n",
+"\n",
+"training_args = TrainingArguments(\n",
+" output_dir=\"sharpcoder/wav2vec2_bjorn\",\n",
+" group_by_length=True,\n",
+" per_device_train_batch_size=8,\n",
+" evaluation_strategy=\"steps\",\n",
+" num_train_epochs=30,\n",
+" fp16=False,\n",
+" gradient_checkpointing=True,\n",
+" save_steps=500,\n",
+" eval_steps=500,\n",
+" logging_steps=500,\n",
+" learning_rate=1e-4,\n",
+" weight_decay=0.005,\n",
+" warmup_steps=1000,\n",
+" save_total_limit=2,\n",
+")\n",
+"\n",
+"trainer = Trainer(\n",
+" model=model,\n",
+" data_collator=data_collator,\n",
+" args=training_args,\n",
+" compute_metrics=compute_metrics,\n",
+" train_dataset=ds_prepared[\"train\"],\n",
+" eval_dataset=ds_prepared[\"train\"],\n",
+" tokenizer=processor.feature_extractor,\n",
+")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 46,
+"id": "d58f6b8c-441c-4fa9-a308-e687948875e1",
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length.\n",
+"***** Running training *****\n",
+" Num examples = 1\n",
+" Num Epochs = 30\n",
+" Instantaneous batch size per device = 8\n",
+" Total train batch size (w. parallel, distributed & accumulation) = 8\n",
+" Gradient Accumulation steps = 1\n",
+" Total optimization steps = 30\n",
+"/home/sharpcoder/.local/lib/python3.10/site-packages/transformers/feature_extraction_utils.py:158: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:210.)\n",
+" tensor = as_tensor(value)\n",
+"/home/sharpcoder/.local/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:882: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').\n",
+" return (input_length - kernel_size) // stride + 1\n",
+"/home/sharpcoder/.local/lib/python3.10/site-packages/torch/autocast_mode.py:162: UserWarning: User provided device_type of 'cuda', but CUDA is not available. Disabling\n",
+" warnings.warn('User provided device_type of \\'cuda\\', but CUDA is not available. Disabling')\n"
+]
+},
+{
+"data": {
+"text/html": [
+"\n",
+" <div>\n",
+" \n",
+" <progress value='30' max='30' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+" [30/30 00:29, Epoch 30/30]\n",
+" </div>\n",
+" <table border=\"1\" class=\"dataframe\">\n",
+" <thead>\n",
+" <tr style=\"text-align: left;\">\n",
+" <th>Step</th>\n",
+" <th>Training Loss</th>\n",
+" <th>Validation Loss</th>\n",
+" </tr>\n",
+" </thead>\n",
+" <tbody>\n",
+" </tbody>\n",
+"</table><p>"
+],
+"text/plain": [
+"<IPython.core.display.HTML object>"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"\n",
+"\n",
+"Training completed. Do not forget to share your model on huggingface.co/models =)\n",
+"\n",
+"\n"
+]
+},
+{
+"data": {
+"text/plain": [
+"TrainOutput(global_step=30, training_loss=7.301008097330729, metrics={'train_runtime': 30.6312, 'train_samples_per_second': 0.979, 'train_steps_per_second': 0.979, 'total_flos': 943749864316800.0, 'train_loss': 7.301008097330729, 'epoch': 30.0})"
+]
+},
+"execution_count": 46,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"trainer.train()"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 47,
+"id": "70866f1f-3745-4e68-acd5-f50b6eff348b",
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"Saving model checkpoint to sharpcoder/wav2vec2_bjorn\n",
+"Configuration saved in sharpcoder/wav2vec2_bjorn/config.json\n",
+"Model weights saved in sharpcoder/wav2vec2_bjorn/pytorch_model.bin\n",
+"Configuration saved in sharpcoder/wav2vec2_bjorn/preprocessor_config.json\n"
+]
+},
+{
+"ename": "AttributeError",
+"evalue": "'Trainer' object has no attribute 'repo'",
+"output_type": "error",
+"traceback": [
+"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
+"Input \u001b[0;32mIn [47]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpush_to_hub\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
+"File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/trainer.py:2677\u001b[0m, in \u001b[0;36mTrainer.push_to_hub\u001b[0;34m(self, commit_message, blocking, **kwargs)\u001b[0m\n\u001b[1;32m 2674\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_world_process_zero():\n\u001b[1;32m 2675\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m-> 2677\u001b[0m git_head_commit_url \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrepo\u001b[49m\u001b[38;5;241m.\u001b[39mpush_to_hub(commit_message\u001b[38;5;241m=\u001b[39mcommit_message, blocking\u001b[38;5;241m=\u001b[39mblocking)\n\u001b[1;32m 2678\u001b[0m \u001b[38;5;66;03m# push separately the model card to be independant from the rest of the model\u001b[39;00m\n\u001b[1;32m 2679\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mshould_save:\n",
+"\u001b[0;31mAttributeError\u001b[0m: 'Trainer' object has no attribute 'repo'"
+]
+}
+],
+"source": []
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "333d43cf-add3-4d78-bbca-b44c638519fe",
+"metadata": {},
+"outputs": [],
+"source": []
+}
+],
+"metadata": {
+"kernelspec": {
+"display_name": "Python 3 (ipykernel)",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.10.4"
+}
+},
+"nbformat": 4,
+"nbformat_minor": 5
+}
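
Editor's note (not part of the commit): the last executed cell above ends in AttributeError: 'Trainer' object has no attribute 'repo' when trainer.push_to_hub() runs. In this transformers release the Trainer only sets up its Hub repo when push_to_hub=True is passed to TrainingArguments, which the notebook does not do; a minimal sketch of that assumed fix, with the other arguments kept as in the notebook:

    from transformers import TrainingArguments

    # Sketch only: enable the Hub integration up front so trainer.repo exists
    # before trainer.push_to_hub() is called after training.
    training_args = TrainingArguments(
        output_dir="sharpcoder/wav2vec2_bjorn",
        push_to_hub=True,  # assumed fix: lets the Trainer create/clone the Hub repo
        # ... remaining arguments exactly as in the notebook cell ...
    )

    # Alternative (also an assumption, not shown in the commit): push the saved
    # model directly with model.push_to_hub("wav2vec2_bjorn").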
sharpcoder/wav2vec2_bjorn/config.json
ADDED
@@ -0,0 +1,88 @@
+{
+"_name_or_path": "facebook/wav2vec2-base",
+"activation_dropout": 0.0,
+"apply_spec_augment": true,
+"architectures": [
+"Wav2Vec2ForCTC"
+],
+"attention_dropout": 0.1,
+"bos_token_id": 1,
+"classifier_proj_size": 256,
+"codevector_dim": 256,
+"contrastive_logits_temperature": 0.1,
+"conv_bias": false,
+"conv_dim": [
+512,
+512,
+512,
+512,
+512,
+512,
+512
+],
+"conv_kernel": [
+10,
+3,
+3,
+3,
+3,
+2,
+2
+],
+"conv_stride": [
+5,
+2,
+2,
+2,
+2,
+2,
+2
+],
+"ctc_loss_reduction": "mean",
+"ctc_zero_infinity": false,
+"diversity_loss_weight": 0.1,
+"do_stable_layer_norm": false,
+"eos_token_id": 2,
+"feat_extract_activation": "gelu",
+"feat_extract_norm": "group",
+"feat_proj_dropout": 0.1,
+"feat_quantizer_dropout": 0.0,
+"final_dropout": 0.0,
+"freeze_feat_extract_train": true,
+"hidden_act": "gelu",
+"hidden_dropout": 0.1,
+"hidden_size": 768,
+"initializer_range": 0.02,
+"intermediate_size": 3072,
+"layer_norm_eps": 1e-05,
+"layerdrop": 0.0,
+"mask_channel_length": 10,
+"mask_channel_min_space": 1,
+"mask_channel_other": 0.0,
+"mask_channel_prob": 0.0,
+"mask_channel_selection": "static",
+"mask_feature_length": 10,
+"mask_feature_prob": 0.0,
+"mask_time_length": 10,
+"mask_time_min_space": 1,
+"mask_time_other": 0.0,
+"mask_time_prob": 0.05,
+"mask_time_selection": "static",
+"model_type": "wav2vec2",
+"no_mask_channel_overlap": false,
+"no_mask_time_overlap": false,
+"num_attention_heads": 12,
+"num_codevector_groups": 2,
+"num_codevectors_per_group": 320,
+"num_conv_pos_embedding_groups": 16,
+"num_conv_pos_embeddings": 128,
+"num_feat_extract_layers": 7,
+"num_hidden_layers": 12,
+"num_negatives": 100,
+"pad_token_id": 19,
+"proj_codevector_dim": 256,
+"torch_dtype": "float32",
+"transformers_version": "4.11.3",
+"use_weighted_layer_sum": false,
+"vocab_size": 32
+}
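
Editor's note (not part of the commit): the saved config keeps "vocab_size": 32 from the facebook/wav2vec2-base checkpoint, while the vocab.json added at the end of this commit defines only 20 tokens. A small sketch of sizing the CTC head to the tokenizer when loading the model; this is an illustration of a common recipe, not something the notebook does, and processor refers to the object built earlier in the notebook:

    from transformers import Wav2Vec2ForCTC

    model = Wav2Vec2ForCTC.from_pretrained(
        "facebook/wav2vec2-base",
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(processor.tokenizer),  # 20 with this vocab.json, so lm_head matches the tokenizer
    )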
sharpcoder/wav2vec2_bjorn/preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
+{
+"do_normalize": true,
+"feature_extractor_type": "Wav2Vec2FeatureExtractor",
+"feature_size": 1,
+"padding_side": "right",
+"padding_value": 0.0,
+"return_attention_mask": false,
+"sampling_rate": 16000
+}
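
Editor's note (not part of the commit): the saved feature extractor expects 16 kHz audio ("sampling_rate": 16000), the rate also passed to Wav2Vec2FeatureExtractor in the notebook. If the dataset's audio column uses the datasets Audio feature (an assumption; the commit does not show the dataset schema), it can be decoded at that rate before prepare_dataset runs, roughly:

    from datasets import Audio, load_dataset

    ds = load_dataset("sharpcoder/bjorn_training")
    # Sketch: resample lazily at decode time so processor() always sees 16 kHz audio.
    ds = ds.cast_column("audio", Audio(sampling_rate=16_000))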
sharpcoder/wav2vec2_bjorn/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:873bf552da3de5ce2fc1efbe234017f06cf7b9b70812d408585136c69486cb81
+size 377667031
sharpcoder/wav2vec2_bjorn/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b61aecf82c993254e7b0fbeb1c240469688a2bf27cd91d288ef05824cd7c911
+size 2799
vocab.json
ADDED
@@ -0,0 +1 @@
+{"w": 0, "y": 1, "m": 2, "i": 3, "e": 4, "s": 5, "r": 6, "p": 7, "n": 8, "a": 9, "h": 10, ".": 11, "j": 12, "b": 13, "d": 14, "l": 15, "o": 16, "|": 17, "[UNK]": 18, "[PAD]": 19}
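
Editor's note (not part of the commit): together with vocab.json at the repository root, the files under sharpcoder/wav2vec2_bjorn/ are enough to reload the fine-tuned checkpoint. A rough usage sketch, assuming the paths from this commit and a 16 kHz mono float array waveform supplied by the caller:

    import torch
    from transformers import (Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor,
                              Wav2Vec2ForCTC, Wav2Vec2Processor)

    # Model weights and config come from the saved checkpoint directory.
    model = Wav2Vec2ForCTC.from_pretrained("sharpcoder/wav2vec2_bjorn")

    # The tokenizer files were not saved into that directory, so rebuild the
    # processor from the root-level vocab.json, as the notebook does.
    tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]",
                                     pad_token="[PAD]", word_delimiter_token="|")
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("sharpcoder/wav2vec2_bjorn")
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

    inputs = processor(waveform, sampling_rate=16_000, return_tensors="pt")
    with torch.no_grad():
        logits = model(inputs.input_values).logits  # (batch, time, vocab_size)
    pred_ids = torch.argmax(logits, dim=-1)
    print(processor.batch_decode(pred_ids))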