ZiaTohidi committed on
Commit 8aeeadb
1 Parent(s): dcd9ddf

Upload 3 files


My first attempt at training a base model.
Trained on the Shahname Ferdowsi text for 5000 iterations on Colab.
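For reference, a minimal sketch of how the uploaded ShahnameBSgenerator.pth checkpoint could be reloaded for sampling. It assumes the BigramLanguageModel class, the decode helper, and the hyperparameters defined in gpt_dev.ipynb are already in the session, and that the .pth file is the state_dict saved by torch.save at the end of that notebook (the notebook saves it as language_model.pth, so the rename is an assumption):

import torch

# minimal loading sketch, assuming BigramLanguageModel and decode()
# are defined exactly as in gpt_dev.ipynb (same vocab_size and hyperparameters)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = BigramLanguageModel()
state_dict = torch.load('ShahnameBSgenerator.pth', map_location=device)  # checkpoint from this commit
model.load_state_dict(state_dict)
model = model.to(device)
model.eval()  # disable dropout for generation

# sample 500 new characters from a single zero token, as in the notebook
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))

load_state_dict will report missing or mismatched keys if the class definition or hyperparameters differ from those used for training.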

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Shahname[[:space:]]Ferdowsi.docx filter=lfs diff=lfs merge=lfs -text
Shahname Ferdowsi.docx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:71d0194fdb6375dc10532ad7487835896c3fcc0831218af38fe432f45f4ffa92
+ size 4034012
ShahnameBSgenerator.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:335f91529330c9db5bf6a83808aeb7458eb4650ae870fe16c697fe75e29b0103
+ size 35064196
gpt_dev.ipynb ADDED
@@ -0,0 +1,872 @@
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU"
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "markdown",
21
+ "source": [
22
+ "## Building a GPT\n",
23
+ "\n",
24
+ "Companion notebook to the [Zero To Hero](https://karpathy.ai/zero-to-hero.html) video on GPT."
25
+ ],
26
+ "metadata": {
27
+ "id": "wJpXpmjEYC_T"
28
+ }
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "source": [
33
+ "!pip install -q python-docx\n"
34
+ ],
35
+ "metadata": {
36
+ "id": "Rd8lAG81GIZR"
37
+ },
38
+ "execution_count": null,
39
+ "outputs": []
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "source": [
44
+ "import docx\n",
45
+ "import re\n",
46
+ "\n",
47
+ "# Replace 'your_file.docx' with your file path\n",
48
+ "doc_path = '/content/Shahname Ferdowsi.docx'\n",
49
+ "\n",
50
+ "def read_docx(file_path):\n",
51
+ " doc = docx.Document(file_path)\n",
52
+ " text = []\n",
53
+ " for para in doc.paragraphs:\n",
54
+ " text.append(para.text)\n",
55
+ " return '\\n'.join(text)\n",
56
+ "\n",
57
+ "# Read the .docx file\n",
58
+ "content = read_docx(doc_path)\n",
59
+ "\n",
60
+ "# Remove English alphabets using regex\n",
61
+ "content_without_english = re.sub('[a-zA-Z]', '', content)\n",
62
+ "\n",
63
+ "text = content_without_english\n"
64
+ ],
65
+ "metadata": {
66
+ "id": "O6medjfRsLD9"
67
+ },
68
+ "execution_count": 1,
69
+ "outputs": []
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "source": [
74
+ "print(\"length of dataset in characters: \", len(text))"
75
+ ],
76
+ "metadata": {
77
+ "colab": {
78
+ "base_uri": "https://localhost:8080/"
79
+ },
80
+ "id": "6xWI_VyAsN8F",
81
+ "outputId": "d703a4c4-8318-4a65-a48a-c51c94deb4c8"
82
+ },
83
+ "execution_count": 2,
84
+ "outputs": [
85
+ {
86
+ "output_type": "stream",
87
+ "name": "stdout",
88
+ "text": [
89
+ "length of dataset in characters: 3867092\n"
90
+ ]
91
+ }
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "source": [
97
+ "# let's look at the first 1000 characters\n",
98
+ "print(text[:1000])"
99
+ ],
100
+ "metadata": {
101
+ "colab": {
102
+ "base_uri": "https://localhost:8080/"
103
+ },
104
+ "id": "2c5V0FvqseE0",
105
+ "outputId": "de14fbee-c5d0-4ef9-95d3-23ab5d96edad"
106
+ },
107
+ "execution_count": 3,
108
+ "outputs": [
109
+ {
110
+ "output_type": "stream",
111
+ "name": "stdout",
112
+ "text": [
113
+ "\n",
114
+ "\n",
115
+ "آغاز كتاب‏\n",
116
+ " بنام خداوند جان و خرد \t \t كزين برتر انديشه بر نگذرد\n",
117
+ " خداوند نام و خداوند جاى \t\t خداوند روزى‏ده رهنماى‏\n",
118
+ " خداوند كيوان و گردان سپهر \t فروزنده ماه و ناهيد و مهر\n",
119
+ " ز نام و نشان و گمان برترست \t \t نگارنده برشده پيكرست‏\n",
120
+ " به بينندگان آفريننده را \t \t نبينى مرنجان دو بيننده را\n",
121
+ " نيابد بدو نيز انديشه راه \t\t كه او برتر از نام و از جايگاه‏\n",
122
+ " سخن هر چه زين گوهران بگذرد \t نيابد بدو راه جان و خرد\n",
123
+ " خرد گر سخن برگزيند همى \t همان را گزيند كه بيند همى‏\n",
124
+ " ستودن نداند كس او را چو هست \t ميان بندگى را ببايدت بست‏\n",
125
+ " خرد را و جان را همى سنجد اوى در انديشۀ سخته كى گنجد اوى‏\n",
126
+ " بدين آلت راى و جان و زبان \t \t ستود آفريننده را كى توان‏\n",
127
+ " به هستيش بايد كه خستو شوى \t ز گفتار بى‏كار يك سو شوى‏\n",
128
+ " پرستنده باشى و جوينده راه \t بژرفى بفرمانش كردن نگاه‏\n",
129
+ " توانا بود هر كه دانا بود \n"
130
+ ]
131
+ }
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "source": [
137
+ "# here are all the unique characters that occur in this text\n",
138
+ "chars = sorted(list(set(text)))\n",
139
+ "vocab_size = len(chars)\n",
140
+ "print(''.join(chars))\n",
141
+ "print(vocab_size)"
142
+ ],
143
+ "metadata": {
144
+ "colab": {
145
+ "base_uri": "https://localhost:8080/"
146
+ },
147
+ "id": "0e-Rbyr8sfM8",
148
+ "outputId": "5742a07a-c567-465c-8ba4-520eec8dbeef"
149
+ },
150
+ "execution_count": 4,
151
+ "outputs": [
152
+ {
153
+ "output_type": "stream",
154
+ "name": "stdout",
155
+ "text": [
156
+ "\t\n",
157
+ " &()*-0123456789:[]،؟ءآأؤئابتثجحخدذرزسشصضطظعغفقكلمنهوىيَُِّْپچژکگۀی‏\n",
158
+ "70\n"
159
+ ]
160
+ }
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "source": [
166
+ "# create a mapping from characters to integers\n",
167
+ "stoi = { ch:i for i,ch in enumerate(chars) }\n",
168
+ "itos = { i:ch for i,ch in enumerate(chars) }\n",
169
+ "encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers\n",
170
+ "decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string\n",
171
+ "\n",
172
+ "print(encode(\"سلااام چطوری\"))\n",
173
+ "print(decode(encode(\"سلااام چطوری\")))"
174
+ ],
175
+ "metadata": {
176
+ "colab": {
177
+ "base_uri": "https://localhost:8080/"
178
+ },
179
+ "id": "Yw1LKNCgwjj1",
180
+ "outputId": "717375fd-ece5-49fa-f0f4-97b215c1dc5a"
181
+ },
182
+ "execution_count": 5,
183
+ "outputs": [
184
+ {
185
+ "output_type": "stream",
186
+ "name": "stdout",
187
+ "text": [
188
+ "[39, 50, 28, 28, 28, 51, 2, 63, 43, 54, 37, 68]\n",
189
+ "سلااام چطوری\n"
190
+ ]
191
+ }
192
+ ]
193
+ },
194
+ {
195
+ "cell_type": "code",
196
+ "source": [
197
+ "# let's now encode the entire text dataset and store it into a torch.Tensor\n",
198
+ "import torch # we use PyTorch: https://pytorch.org\n",
199
+ "data = torch.tensor(encode(text), dtype=torch.long)\n",
200
+ "print(data.shape, data.dtype)\n",
201
+ "print(data[:1000]) # the 1000 characters we looked at earier will to the GPT look like this"
202
+ ],
203
+ "metadata": {
204
+ "id": "YJb0OXPwzvqg"
205
+ },
206
+ "execution_count": null,
207
+ "outputs": []
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "source": [
212
+ "# Let's now split up the data into train and validation sets\n",
213
+ "n = int(0.9*len(data)) # first 90% will be train, rest val\n",
214
+ "train_data = data[:n]\n",
215
+ "val_data = data[n:]"
216
+ ],
217
+ "metadata": {
218
+ "id": "f_WIXqxz0lU5"
219
+ },
220
+ "execution_count": 8,
221
+ "outputs": []
222
+ },
223
+ {
224
+ "cell_type": "code",
225
+ "source": [
226
+ "block_size = 8\n",
227
+ "train_data[:block_size+1]"
228
+ ],
229
+ "metadata": {
230
+ "colab": {
231
+ "base_uri": "https://localhost:8080/"
232
+ },
233
+ "id": "TD5Bj8Y6IAD4",
234
+ "outputId": "fef174ac-01f6-4043-ee46-d3d59fdba345"
235
+ },
236
+ "execution_count": 9,
237
+ "outputs": [
238
+ {
239
+ "output_type": "execute_result",
240
+ "data": {
241
+ "text/plain": [
242
+ "tensor([ 1, 1, 24, 46, 28, 38, 2, 49, 30])"
243
+ ]
244
+ },
245
+ "metadata": {},
246
+ "execution_count": 9
247
+ }
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "source": [
253
+ "x = train_data[:block_size]\n",
254
+ "y = train_data[1:block_size+1]\n",
255
+ "for t in range(block_size):\n",
256
+ " context = x[:t+1]\n",
257
+ " target = y[t]\n",
258
+ " print(f\"when input is {context} the target: {target}\")"
259
+ ],
260
+ "metadata": {
261
+ "colab": {
262
+ "base_uri": "https://localhost:8080/"
263
+ },
264
+ "id": "9HXDe8vGJCEn",
265
+ "outputId": "2f223db6-2278-43fe-c4b0-1353dddfe538"
266
+ },
267
+ "execution_count": 10,
268
+ "outputs": [
269
+ {
270
+ "output_type": "stream",
271
+ "name": "stdout",
272
+ "text": [
273
+ "when input is tensor([1]) the target: 1\n",
274
+ "when input is tensor([1, 1]) the target: 24\n",
275
+ "when input is tensor([ 1, 1, 24]) the target: 46\n",
276
+ "when input is tensor([ 1, 1, 24, 46]) the target: 28\n",
277
+ "when input is tensor([ 1, 1, 24, 46, 28]) the target: 38\n",
278
+ "when input is tensor([ 1, 1, 24, 46, 28, 38]) the target: 2\n",
279
+ "when input is tensor([ 1, 1, 24, 46, 28, 38, 2]) the target: 49\n",
280
+ "when input is tensor([ 1, 1, 24, 46, 28, 38, 2, 49]) the target: 30\n"
281
+ ]
282
+ }
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "source": [
288
+ "torch.manual_seed(1337)\n",
289
+ "batch_size = 4 # how many independent sequences will we process in parallel?\n",
290
+ "block_size = 8 # what is the maximum context length for predictions?\n",
291
+ "\n",
292
+ "def get_batch(split):\n",
293
+ " # generate a small batch of data of inputs x and targets y\n",
294
+ " data = train_data if split == 'train' else val_data\n",
295
+ " ix = torch.randint(len(data) - block_size, (batch_size,))\n",
296
+ " x = torch.stack([data[i:i+block_size] for i in ix])\n",
297
+ " y = torch.stack([data[i+1:i+block_size+1] for i in ix])\n",
298
+ " return x, y\n",
299
+ "\n",
300
+ "xb, yb = get_batch('train')\n",
301
+ "print('inputs:')\n",
302
+ "print(xb.shape)\n",
303
+ "print(xb)\n",
304
+ "print('targets:')\n",
305
+ "print(yb.shape)\n",
306
+ "print(yb)\n",
307
+ "\n",
308
+ "print('----')\n",
309
+ "\n",
310
+ "for b in range(batch_size): # batch dimension\n",
311
+ " for t in range(block_size): # time dimension\n",
312
+ " context = xb[b, :t+1]\n",
313
+ " target = yb[b,t]\n",
314
+ " print(f\"when input is {context.tolist()} the target: {target}\")"
315
+ ],
316
+ "metadata": {
317
+ "id": "Q3k1Czf7LuA9"
318
+ },
319
+ "execution_count": null,
320
+ "outputs": []
321
+ },
322
+ {
323
+ "cell_type": "code",
324
+ "source": [
325
+ "print(xb) # our input to the transformer"
326
+ ],
327
+ "metadata": {
328
+ "colab": {
329
+ "base_uri": "https://localhost:8080/"
330
+ },
331
+ "id": "qpyyAeIzQjlO",
332
+ "outputId": "b4ac6055-9b61-42fa-e1e6-0f957abe5bcd"
333
+ },
334
+ "execution_count": 12,
335
+ "outputs": [
336
+ {
337
+ "output_type": "stream",
338
+ "name": "stdout",
339
+ "text": [
340
+ "tensor([[30, 37, 28, 2, 29, 34, 30, 2],\n",
341
+ " [51, 2, 40, 28, 62, 54, 37, 2],\n",
342
+ " [ 2, 2, 2, 49, 53, 2, 37, 40],\n",
343
+ " [35, 52, 35, 2, 66, 37, 35, 28]])\n"
344
+ ]
345
+ }
346
+ ]
347
+ },
348
+ {
349
+ "cell_type": "code",
350
+ "source": [
351
+ "import torch\n",
352
+ "import torch.nn as nn\n",
353
+ "from torch.nn import functional as F\n",
354
+ "torch.manual_seed(1337)\n",
355
+ "\n",
356
+ "class BigramLanguageModel(nn.Module):\n",
357
+ "\n",
358
+ " def __init__(self, vocab_size):\n",
359
+ " super().__init__()\n",
360
+ " # each token directly reads off the logits for the next token from a lookup table\n",
361
+ " self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)\n",
362
+ "\n",
363
+ " def forward(self, idx, targets=None):\n",
364
+ "\n",
365
+ " # idx and targets are both (B,T) tensor of integers\n",
366
+ " logits = self.token_embedding_table(idx) # (B,T,C)\n",
367
+ "\n",
368
+ " if targets is None:\n",
369
+ " loss = None\n",
370
+ " else:\n",
371
+ " B, T, C = logits.shape\n",
372
+ " logits = logits.view(B*T, C)\n",
373
+ " targets = targets.view(B*T)\n",
374
+ " loss = F.cross_entropy(logits, targets)\n",
375
+ "\n",
376
+ " return logits, loss\n",
377
+ "\n",
378
+ " def generate(self, idx, max_new_tokens):\n",
379
+ " # idx is (B, T) array of indices in the current context\n",
380
+ " for _ in range(max_new_tokens):\n",
381
+ " # get the predictions\n",
382
+ " logits, loss = self(idx)\n",
383
+ " # focus only on the last time step\n",
384
+ " logits = logits[:, -1, :] # becomes (B, C)\n",
385
+ " # apply softmax to get probabilities\n",
386
+ " probs = F.softmax(logits, dim=-1) # (B, C)\n",
387
+ " # sample from the distribution\n",
388
+ " idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)\n",
389
+ " # append sampled index to the running sequence\n",
390
+ " idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)\n",
391
+ " return idx\n",
392
+ "\n",
393
+ "m = BigramLanguageModel(vocab_size)\n",
394
+ "logits, loss = m(xb, yb)\n",
395
+ "print(logits.shape)\n",
396
+ "print(loss)\n",
397
+ "\n",
398
+ "print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))\n"
399
+ ],
400
+ "metadata": {
401
+ "id": "nql_1ER53oCf"
402
+ },
403
+ "execution_count": null,
404
+ "outputs": []
405
+ },
406
+ {
407
+ "cell_type": "code",
408
+ "source": [
409
+ "# create a PyTorch optimizer\n",
410
+ "optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)"
411
+ ],
412
+ "metadata": {
413
+ "id": "eTyJ8qAaDdiF"
414
+ },
415
+ "execution_count": 14,
416
+ "outputs": []
417
+ },
418
+ {
419
+ "cell_type": "code",
420
+ "source": [
421
+ "batch_size = 32\n",
422
+ "for steps in range(100): # increase number of steps for good results...\n",
423
+ "\n",
424
+ " # sample a batch of data\n",
425
+ " xb, yb = get_batch('train')\n",
426
+ "\n",
427
+ " # evaluate the loss\n",
428
+ " logits, loss = m(xb, yb)\n",
429
+ " optimizer.zero_grad(set_to_none=True)\n",
430
+ " loss.backward()\n",
431
+ " optimizer.step()\n",
432
+ "\n",
433
+ "print(loss.item())\n"
434
+ ],
435
+ "metadata": {
436
+ "colab": {
437
+ "base_uri": "https://localhost:8080/"
438
+ },
439
+ "id": "Hs4kI8YdEkQj",
440
+ "outputId": "31371728-b7fb-48e6-8b52-f00571f8d89f"
441
+ },
442
+ "execution_count": 15,
443
+ "outputs": [
444
+ {
445
+ "output_type": "stream",
446
+ "name": "stdout",
447
+ "text": [
448
+ "4.402019023895264\n"
449
+ ]
450
+ }
451
+ ]
452
+ },
453
+ {
454
+ "cell_type": "code",
455
+ "source": [
456
+ "print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))"
457
+ ],
458
+ "metadata": {
459
+ "id": "EcVIDWAZEtjN"
460
+ },
461
+ "execution_count": null,
462
+ "outputs": []
463
+ },
464
+ {
465
+ "cell_type": "markdown",
466
+ "source": [
467
+ "### Full finished code, for reference\n",
468
+ "\n",
469
+ "You may want to refer directly to the git repo instead though."
470
+ ],
471
+ "metadata": {
472
+ "id": "ZcvKeBXoZFOY"
473
+ }
474
+ },
475
+ {
476
+ "cell_type": "code",
477
+ "source": [
478
+ "torch.cuda.is_available()"
479
+ ],
480
+ "metadata": {
481
+ "id": "IJFiK1n_WqLd",
482
+ "outputId": "f42d7502-df43-4a8d-9905-d64b4048a8fb",
483
+ "colab": {
484
+ "base_uri": "https://localhost:8080/"
485
+ }
486
+ },
487
+ "execution_count": 3,
488
+ "outputs": [
489
+ {
490
+ "output_type": "execute_result",
491
+ "data": {
492
+ "text/plain": [
493
+ "True"
494
+ ]
495
+ },
496
+ "metadata": {},
497
+ "execution_count": 3
498
+ }
499
+ ]
500
+ },
501
+ {
502
+ "cell_type": "code",
503
+ "source": [
504
+ "import torch\n",
505
+ "import torch.nn as nn\n",
506
+ "from torch.nn import functional as F\n",
507
+ "\n",
508
+ "# hyperparameters\n",
509
+ "batch_size = 128 # how many independent sequences will we process in parallel?\n",
510
+ "block_size = 256 # what is the maximum context length for predictions?\n",
511
+ "max_iters = 5000\n",
512
+ "eval_interval = 300\n",
513
+ "learning_rate = 1e-3\n",
514
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
515
+ "eval_iters = 100\n",
516
+ "n_embd = 128 # Increase hidden size\n",
517
+ "n_head = 8 # Adjust number of attention heads\n",
518
+ "n_layer = 12 # Increase number of layers\n",
519
+ "\n",
520
+ "dropout = 0.2\n",
521
+ "# ------------\n",
522
+ "\n",
523
+ "torch.manual_seed(1337)\n",
524
+ "\n",
525
+ "\n",
526
+ "text = text\n",
527
+ "\n",
528
+ "# here are all the unique characters that occur in this text\n",
529
+ "chars = sorted(list(set(text)))\n",
530
+ "vocab_size = len(chars)\n",
531
+ "# create a mapping from characters to integers\n",
532
+ "stoi = { ch:i for i,ch in enumerate(chars) }\n",
533
+ "itos = { i:ch for i,ch in enumerate(chars) }\n",
534
+ "encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers\n",
535
+ "decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string\n",
536
+ "\n",
537
+ "# Train and test splits\n",
538
+ "data = torch.tensor(encode(text), dtype=torch.long)\n",
539
+ "n = int(0.9*len(data)) # first 90% will be train, rest val\n",
540
+ "train_data = data[:n]\n",
541
+ "val_data = data[n:]\n",
542
+ "\n",
543
+ "# data loading\n",
544
+ "def get_batch(split):\n",
545
+ " # generate a small batch of data of inputs x and targets y\n",
546
+ " data = train_data if split == 'train' else val_data\n",
547
+ " ix = torch.randint(len(data) - block_size, (batch_size,))\n",
548
+ " x = torch.stack([data[i:i+block_size] for i in ix])\n",
549
+ " y = torch.stack([data[i+1:i+block_size+1] for i in ix])\n",
550
+ " x, y = x.to(device), y.to(device)\n",
551
+ " return x, y\n",
552
+ "\n",
553
+ "@torch.no_grad()\n",
554
+ "def estimate_loss():\n",
555
+ " out = {}\n",
556
+ " model.eval()\n",
557
+ " for split in ['train', 'val']:\n",
558
+ " losses = torch.zeros(eval_iters)\n",
559
+ " for k in range(eval_iters):\n",
560
+ " X, Y = get_batch(split)\n",
561
+ " logits, loss = model(X, Y)\n",
562
+ " losses[k] = loss.item()\n",
563
+ " out[split] = losses.mean()\n",
564
+ " model.train()\n",
565
+ " return out\n",
566
+ "\n",
567
+ "class Head(nn.Module):\n",
568
+ " \"\"\" one head of self-attention \"\"\"\n",
569
+ "\n",
570
+ " def __init__(self, head_size):\n",
571
+ " super().__init__()\n",
572
+ " self.key = nn.Linear(n_embd, head_size, bias=False)\n",
573
+ " self.query = nn.Linear(n_embd, head_size, bias=False)\n",
574
+ " self.value = nn.Linear(n_embd, head_size, bias=False)\n",
575
+ " self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))\n",
576
+ "\n",
577
+ " self.dropout = nn.Dropout(dropout)\n",
578
+ "\n",
579
+ " def forward(self, x):\n",
580
+ " B,T,C = x.shape\n",
581
+ " k = self.key(x) # (B,T,C)\n",
582
+ " q = self.query(x) # (B,T,C)\n",
583
+ " # compute attention scores (\"affinities\")\n",
584
+ " wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)\n",
585
+ " wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)\n",
586
+ " wei = F.softmax(wei, dim=-1) # (B, T, T)\n",
587
+ " wei = self.dropout(wei)\n",
588
+ " # perform the weighted aggregation of the values\n",
589
+ " v = self.value(x) # (B,T,C)\n",
590
+ " out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)\n",
591
+ " return out\n",
592
+ "\n",
593
+ "class MultiHeadAttention(nn.Module):\n",
594
+ " \"\"\" multiple heads of self-attention in parallel \"\"\"\n",
595
+ "\n",
596
+ " def __init__(self, num_heads, head_size):\n",
597
+ " super().__init__()\n",
598
+ " self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])\n",
599
+ " self.proj = nn.Linear(n_embd, n_embd)\n",
600
+ " self.dropout = nn.Dropout(dropout)\n",
601
+ "\n",
602
+ " def forward(self, x):\n",
603
+ " out = torch.cat([h(x) for h in self.heads], dim=-1)\n",
604
+ " out = self.dropout(self.proj(out))\n",
605
+ " return out\n",
606
+ "\n",
607
+ "class FeedFoward(nn.Module):\n",
608
+ " \"\"\" a simple linear layer followed by a non-linearity \"\"\"\n",
609
+ "\n",
610
+ " def __init__(self, n_embd):\n",
611
+ " super().__init__()\n",
612
+ " self.net = nn.Sequential(\n",
613
+ " nn.Linear(n_embd, 4 * n_embd),\n",
614
+ " nn.ReLU(),\n",
615
+ " nn.Linear(4 * n_embd, n_embd),\n",
616
+ " nn.Dropout(dropout),\n",
617
+ " )\n",
618
+ "\n",
619
+ " def forward(self, x):\n",
620
+ " return self.net(x)\n",
621
+ "\n",
622
+ "class Block(nn.Module):\n",
623
+ " \"\"\" Transformer block: communication followed by computation \"\"\"\n",
624
+ "\n",
625
+ " def __init__(self, n_embd, n_head):\n",
626
+ " # n_embd: embedding dimension, n_head: the number of heads we'd like\n",
627
+ " super().__init__()\n",
628
+ " head_size = n_embd // n_head\n",
629
+ " self.sa = MultiHeadAttention(n_head, head_size)\n",
630
+ " self.ffwd = FeedFoward(n_embd)\n",
631
+ " self.ln1 = nn.LayerNorm(n_embd)\n",
632
+ " self.ln2 = nn.LayerNorm(n_embd)\n",
633
+ "\n",
634
+ " def forward(self, x):\n",
635
+ " x = x + self.sa(self.ln1(x))\n",
636
+ " x = x + self.ffwd(self.ln2(x))\n",
637
+ " return x\n",
638
+ "\n",
639
+ "# super simple bigram model\n",
640
+ "class BigramLanguageModel(nn.Module):\n",
641
+ "\n",
642
+ " def __init__(self):\n",
643
+ " super().__init__()\n",
644
+ " # each token directly reads off the logits for the next token from a lookup table\n",
645
+ " self.token_embedding_table = nn.Embedding(vocab_size, n_embd)\n",
646
+ " self.position_embedding_table = nn.Embedding(block_size, n_embd)\n",
647
+ " self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])\n",
648
+ " self.ln_f = nn.LayerNorm(n_embd) # final layer norm\n",
649
+ " self.lm_head = nn.Linear(n_embd, vocab_size)\n",
650
+ "\n",
651
+ " def forward(self, idx, targets=None):\n",
652
+ " B, T = idx.shape\n",
653
+ "\n",
654
+ " # idx and targets are both (B,T) tensor of integers\n",
655
+ " tok_emb = self.token_embedding_table(idx) # (B,T,C)\n",
656
+ " pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)\n",
657
+ " x = tok_emb + pos_emb # (B,T,C)\n",
658
+ " x = self.blocks(x) # (B,T,C)\n",
659
+ " x = self.ln_f(x) # (B,T,C)\n",
660
+ " logits = self.lm_head(x) # (B,T,vocab_size)\n",
661
+ "\n",
662
+ " if targets is None:\n",
663
+ " loss = None\n",
664
+ " else:\n",
665
+ " B, T, C = logits.shape\n",
666
+ " logits = logits.view(B*T, C)\n",
667
+ " targets = targets.view(B*T)\n",
668
+ " loss = F.cross_entropy(logits, targets)\n",
669
+ "\n",
670
+ " return logits, loss\n",
671
+ "\n",
672
+ " def generate(self, idx, max_new_tokens):\n",
673
+ " # idx is (B, T) array of indices in the current context\n",
674
+ " for _ in range(max_new_tokens):\n",
675
+ " # crop idx to the last block_size tokens\n",
676
+ " idx_cond = idx[:, -block_size:]\n",
677
+ " # get the predictions\n",
678
+ " logits, loss = self(idx_cond)\n",
679
+ " # focus only on the last time step\n",
680
+ " logits = logits[:, -1, :] # becomes (B, C)\n",
681
+ " # apply softmax to get probabilities\n",
682
+ " probs = F.softmax(logits, dim=-1) # (B, C)\n",
683
+ " # sample from the distribution\n",
684
+ " idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)\n",
685
+ " # append sampled index to the running sequence\n",
686
+ " idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)\n",
687
+ " return idx\n",
688
+ "\n",
689
+ "model = BigramLanguageModel()\n",
690
+ "m = model.to(device)\n",
691
+ "# print the number of parameters in the model\n",
692
+ "print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')\n",
693
+ "\n",
694
+ "# create a PyTorch optimizer\n",
695
+ "optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)\n",
696
+ "\n",
697
+ "for iter in range(max_iters):\n",
698
+ "\n",
699
+ " # every once in a while evaluate the loss on train and val sets\n",
700
+ " if iter % eval_interval == 0 or iter == max_iters - 1:\n",
701
+ " losses = estimate_loss()\n",
702
+ " print(f\"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}\")\n",
703
+ "\n",
704
+ " # sample a batch of data\n",
705
+ " xb, yb = get_batch('train')\n",
706
+ "\n",
707
+ " # evaluate the loss\n",
708
+ " logits, loss = model(xb, yb)\n",
709
+ " optimizer.zero_grad(set_to_none=True)\n",
710
+ " loss.backward()\n",
711
+ " optimizer.step()\n",
712
+ "\n",
713
+ "# generate from the model\n",
714
+ "context = torch.zeros((1, 1), dtype=torch.long, device=device)\n",
715
+ "print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))\n"
716
+ ],
717
+ "metadata": {
718
+ "colab": {
719
+ "base_uri": "https://localhost:8080/"
720
+ },
721
+ "id": "hoelkOrFY8bN",
722
+ "outputId": "c01f10ef-048b-41b4-c862-031c7e7281c9"
723
+ },
724
+ "execution_count": 4,
725
+ "outputs": [
726
+ {
727
+ "output_type": "stream",
728
+ "name": "stdout",
729
+ "text": [
730
+ "2.42567 M parameters\n",
731
+ "step 0: train loss 4.4474, val loss 4.4467\n",
732
+ "step 300: train loss 1.7789, val loss 1.7773\n",
733
+ "step 600: train loss 1.4613, val loss 1.4679\n",
734
+ "step 900: train loss 1.2493, val loss 1.2604\n",
735
+ "step 1200: train loss 1.1231, val loss 1.1440\n",
736
+ "step 1500: train loss 1.0568, val loss 1.0844\n",
737
+ "step 1800: train loss 1.0104, val loss 1.0401\n",
738
+ "step 2100: train loss 0.9701, val loss 1.0066\n",
739
+ "step 2400: train loss 0.9385, val loss 0.9754\n",
740
+ "step 2700: train loss 0.9122, val loss 0.9547\n",
741
+ "step 3000: train loss 0.8927, val loss 0.9387\n",
742
+ "step 3300: train loss 0.8747, val loss 0.9226\n",
743
+ "step 3600: train loss 0.8646, val loss 0.9148\n",
744
+ "step 3900: train loss 0.8546, val loss 0.9087\n",
745
+ "step 4200: train loss 0.8414, val loss 0.8990\n",
746
+ "step 4500: train loss 0.8352, val loss 0.8919\n",
747
+ "step 4800: train loss 0.8238, val loss 0.8827\n",
748
+ "step 4999: train loss 0.8193, val loss 0.8796\n",
749
+ "\t گروهر شده جوشن با يوز رخ سروه‏\n",
750
+ " همى گور و ديده بيوق و تير همان غلت شاپور و چندى مپير\n",
751
+ " هم اندر زمان غلعه فرخ اوست همه سال گردنده شد گيو اوست‏\n",
752
+ " اگر سوگوارست پيكار بيد همى ژعف و خنجر ز سازند بيد\n",
753
+ " همه جنگ را مشك هست و غم زمين شد ز آهوش استر دژم‏\n",
754
+ " سپه را سر بابر افراسياب بزد باد و پاى و رعد پذير\n",
755
+ " يكى جنگ پيلى فرو مايه كرد همه بگذرد اختر اينسان كرد\n",
756
+ " بدو گفت با دو پى اى داشتست سخن‏گوى و كشور بافراج داست‏\n",
757
+ " همى جنگ جمّى بمستى زوان بشد گستهم چشم بد نيك روان‏\n",
758
+ " خداوند پر ما ز گستهم خور بهر معدبان طرز گهر هور\n",
759
+ " چنان تاخت شاه آمد از چو گنگ جز از غم ديدگان بس اندر درنگ‏\n",
760
+ " [ و گر زين و از باره آهخت و راه بدين تيغ زن شاه در رزمگاه‏]\n",
761
+ " سكندر بشمزين يكى رزم زشت خرد شاد بايد استيد گل‏\n",
762
+ " [ شگاهى تور رستم‏]\n",
763
+ " [ چو اورنده باشد آورد به‏سال زمين زرد بسيار بينيد خاك‏]\n",
764
+ " [ چو خورشيد گشت از ش��ار ديد شده لشكر از ميان كار تيد‏]\n",
765
+ " يكى كار سودابه بى‏نان وزير چو تنها بدين تا بد شهريس‏\n",
766
+ " [ بفتراف زادى مدارى پسر كه تا چيز را نيز اسبان در حرن‏]\n",
767
+ " دو مانديش از كار چونى سپاه سم زاورش از آن بهر كلاه‏\n",
768
+ " چنين گفت پيران چنين گفت بخت كه با ناموزه شاه هنره تست‏\n",
769
+ " ميان دو پاكيزه بود نگذرد بكام من بريشان بشست كرد\n",
770
+ " بزابل چو فرزند تو شوم شاد برتر چنين گفت مانى كداد\n",
771
+ " برستم بايد اكنون گشت زاد دل زخم گردان و خندان براد\n",
772
+ " ورا من دبيرون تن اندر كنيد نگر تار باشى بپيوند كنيمد\n",
773
+ " شاهنامه، ص: 87\n",
774
+ " [ ورا داد پنيروز نوشين روان گر از مردم افگنده پهلوان‏]\n",
775
+ " [ مرا زانج دانات كردار جست سپه دار گيتى نيابد بشست‏]\n",
776
+ " [ تن بى‏گمان ميز ايران مراست كه اى نامور بخورش در نعل‏]\n",
777
+ " [ كسى داده‏يى رزم شب چون درم درف\n"
778
+ ]
779
+ }
780
+ ]
781
+ },
782
+ {
783
+ "cell_type": "code",
784
+ "source": [
785
+ "torch.save(model.state_dict(), 'language_model.pth')"
786
+ ],
787
+ "metadata": {
788
+ "id": "T-rD48Xwm5pc"
789
+ },
790
+ "execution_count": 5,
791
+ "outputs": []
792
+ },
793
+ {
794
+ "cell_type": "code",
795
+ "source": [
796
+ "from google.colab import drive\n",
797
+ "drive.mount('/content/drive')"
798
+ ],
799
+ "metadata": {
800
+ "id": "grP_S0osm6-5",
801
+ "outputId": "3f478a95-bdfe-45e8-c596-ef9bdf2ce034",
802
+ "colab": {
803
+ "base_uri": "https://localhost:8080/"
804
+ }
805
+ },
806
+ "execution_count": 7,
807
+ "outputs": [
808
+ {
809
+ "output_type": "stream",
810
+ "name": "stdout",
811
+ "text": [
812
+ "Mounted at /content/drive\n"
813
+ ]
814
+ }
815
+ ]
816
+ },
817
+ {
818
+ "cell_type": "code",
819
+ "source": [
820
+ "# generate from the model\n",
821
+ "context = torch.zeros((1, 1), dtype=torch.long, device=device)\n",
822
+ "print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))"
823
+ ],
824
+ "metadata": {
825
+ "id": "p92PG-OEsCvv",
826
+ "outputId": "4a982c9e-51f3-4576-ae70-3fc51d1ae687",
827
+ "colab": {
828
+ "base_uri": "https://localhost:8080/"
829
+ }
830
+ },
831
+ "execution_count": 11,
832
+ "outputs": [
833
+ {
834
+ "output_type": "stream",
835
+ "name": "stdout",
836
+ "text": [
837
+ "\t چو نزديك سام بلند بسالار تركان بجايش گزند\n",
838
+ " فرامور بآتش از اندر بپاى توانه روان رهنماى بپاى‏\n",
839
+ " سراسر يكى مرد زان در گزيد نهان گمان آرد نه نامين كشيد\n",
840
+ " [ كه بهرام گفتش كه برداشت بجز باژ جز تخت و كشتى براشت‏]\n",
841
+ " [ كه تا از آن داد نژاد بود بزرگ آور و دل پر از بود]\n",
842
+ " [ شوم شند پيروز سا شاه ماه همه نامور تخت شاه و سپاه‏]\n",
843
+ " سر بى‏قباى و نامه برش چو با ماه شد بى‏گناهش اوى‏\n",
844
+ " پرستندگان گفت كامون شوى برم گفت رسم نجست از زوى اوى‏\n",
845
+ " همه پاك بايست مهتران همه راى گفته بديدار زيان‏\n",
846
+ " بفرمود تا مهر قارن نشست پى سر بسر بر بپر مهر دست‏\n",
847
+ " بدان تا مبادا يكى پهلوان نداريد ما دانش جهان سر و جوان‏\n",
848
+ " همى سخت شنگل اندر آيد بدرد بازان رزم را برانى دلي]\n",
849
+ " [ پند آگازان بر گيو نوذر شايستار و ژويه باك‏]\n",
850
+ " چو خورشيد زفتى هيونى گرفت بلند اندر آن شاه آن زينهارمت‏\n",
851
+ " بفرمود تا سر بسر هم همه بروبرز و ماه آمدش بمشت‏\n",
852
+ " بدو گفت كاى شهريار منست كجات كيان از پى نان نيز منست‏\n",
853
+ " بفرمود تا جشن درنج و تخت تهمتن نشنريد ماهيم و بخت‏\n",
854
+ " شاهنامه، ص: 31\n",
855
+ "\n",
856
+ " مرا نيز جنگ پآن انديشه رفت زره ساله جنگ بى‏غم در گرفت‏\n",
857
+ " از ان ناپس بهرام بيداد من‏\n",
858
+ " كه بر دوه باران بديوان رسيد شب تيره گفتار توم شنيد\n",
859
+ " اگر من ز كسرى مباديم آمدم و ز ان غرم دلاور كرد آمدم‏\n",
860
+ " ز تركان بيارى برانى زمير بمى پيل بسسيار دو تنگ‏\n",
861
+ " بگيريد چندى وفر اين برگ كه از بازگشتن ياد سرگ‏شم‏\n",
862
+ " به مردى كو را بدو دست چو كوه فراوان شنگ اندرون شد دو گروه‏\n",
863
+ " ز پيروز رخ آفرين كرد دست گرفت اين سخن يافتند ز پست‏\n",
864
+ " همى خوان تبيرست بر حال ماه همى افسرستاد بايد ز راه‏\n",
865
+ " درختيست اين راى را هرچ گفت كه برخاست نامه ز انگزيست جفت‏\n",
866
+ " شنيد ليا مشك و بيداد چهر گمان جنگش برگ\n"
867
+ ]
868
+ }
869
+ ]
870
+ }
871
+ ]
872
+ }