## Building a GPT

Companion notebook to the [Zero To Hero](https://karpathy.ai/zero-to-hero.html) video on GPT.

In [None]:
!pip install -q python-docx


In [1]:
import docx
import re

# Replace 'your_file.docx' with your file path
doc_path = '/content/Shahname Ferdowsi.docx'

def read_docx(file_path):
 doc = docx.Document(file_path)
 text = []
 for para in doc.paragraphs:
 text.append(para.text)
 return '\n'.join(text)

# Read the .docx file
content = read_docx(doc_path)

# Remove English alphabets using regex
content_without_english = re.sub('[a-zA-Z]', '', content)

text = content_without_english


In [2]:
print("length of dataset in characters: ", len(text))

length of dataset in characters: 3867092


In [3]:
# let's look at the first 1000 characters
print(text[:1000])



آغاز كتاب‏
 بنام خداوند جان و خرد 	 	 كزين برتر انديشه بر نگذرد
 خداوند نام و خداوند جاى 		 خداوند روزى‏ده رهنماى‏
 خداوند كيوان و گردان سپهر 	 فروزنده ماه و ناهيد و مهر
 ز نام و نشان و گمان برترست 	 	 نگارنده برشده پيكرست‏
 به بينندگان آفريننده را 	 	 نبينى مرنجان دو بيننده را
 نيابد بدو نيز انديشه راه 		 كه او برتر از نام و از جايگاه‏
 سخن هر چه زين گوهران بگذرد 	 نيابد بدو راه جان و خرد
 خرد گر سخن برگزيند همى 	 همان را گزيند كه بيند همى‏
 ستودن نداند كس او را چو هست 	 ميان بندگى را ببايدت بست‏
 خرد را و جان را همى سنجد اوى در انديشۀ سخته كى گنجد اوى‏
 بدين آلت راى و جان و زبان 	 	 ستود آفريننده را كى توان‏
 به هستيش بايد كه خستو شوى 	 ز گفتار بى‏كار يك سو شوى‏
 پرستنده باشى و جوينده راه 	 بژرفى بفرمانش كردن نگاه‏
 توانا بود هر كه دانا بود 


In [4]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

	
 &()*-0123456789:[]،؟ءآأؤئابتثجحخدذرزسشصضطظعغفقكلمنهوىيَُِّْپچژکگۀی‏
70


In [5]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("سلااام چطوری"))
print(decode(encode("سلااام چطوری")))

[39, 50, 28, 28, 28, 51, 2, 63, 43, 54, 37, 68]
سلااام چطوری


In [None]:
# let's now encode the entire text dataset and store it into a torch.Tensor
import torch # we use PyTorch: https://pytorch.org
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # the 1000 characters we looked at earier will to the GPT look like this

In [8]:
# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [9]:
block_size = 8
train_data[:block_size+1]

tensor([ 1, 1, 24, 46, 28, 38, 2, 49, 30])

In [10]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
 context = x[:t+1]
 target = y[t]
 print(f"when input is {context} the target: {target}")

when input is tensor([1]) the target: 1
when input is tensor([1, 1]) the target: 24
when input is tensor([ 1, 1, 24]) the target: 46
when input is tensor([ 1, 1, 24, 46]) the target: 28
when input is tensor([ 1, 1, 24, 46, 28]) the target: 38
when input is tensor([ 1, 1, 24, 46, 28, 38]) the target: 2
when input is tensor([ 1, 1, 24, 46, 28, 38, 2]) the target: 49
when input is tensor([ 1, 1, 24, 46, 28, 38, 2, 49]) the target: 30


In [None]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
 # generate a small batch of data of inputs x and targets y
 data = train_data if split == 'train' else val_data
 ix = torch.randint(len(data) - block_size, (batch_size,))
 x = torch.stack([data[i:i+block_size] for i in ix])
 y = torch.stack([data[i+1:i+block_size+1] for i in ix])
 return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
 for t in range(block_size): # time dimension
 context = xb[b, :t+1]
 target = yb[b,t]
 print(f"when input is {context.tolist()} the target: {target}")

In [12]:
print(xb) # our input to the transformer

tensor([[30, 37, 28, 2, 29, 34, 30, 2],
 [51, 2, 40, 28, 62, 54, 37, 2],
 [ 2, 2, 2, 49, 53, 2, 37, 40],
 [35, 52, 35, 2, 66, 37, 35, 28]])


In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

 def __init__(self, vocab_size):
 super().__init__()
 # each token directly reads off the logits for the next token from a lookup table
 self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

 def forward(self, idx, targets=None):

 # idx and targets are both (B,T) tensor of integers
 logits = self.token_embedding_table(idx) # (B,T,C)

 if targets is None:
 loss = None
 else:
 B, T, C = logits.shape
 logits = logits.view(B*T, C)
 targets = targets.view(B*T)
 loss = F.cross_entropy(logits, targets)

 return logits, loss

 def generate(self, idx, max_new_tokens):
 # idx is (B, T) array of indices in the current context
 for _ in range(max_new_tokens):
 # get the predictions
 logits, loss = self(idx)
 # focus only on the last time step
 logits = logits[:, -1, :] # becomes (B, C)
 # apply softmax to get probabilities
 probs = F.softmax(logits, dim=-1) # (B, C)
 # sample from the distribution
 idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
 # append sampled index to the running sequence
 idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
 return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


In [14]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [15]:
batch_size = 32
for steps in range(100): # increase number of steps for good results...

 # sample a batch of data
 xb, yb = get_batch('train')

 # evaluate the loss
 logits, loss = m(xb, yb)
 optimizer.zero_grad(set_to_none=True)
 loss.backward()
 optimizer.step()

print(loss.item())


4.402019023895264


In [None]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))

### Full finished code, for reference

You may want to refer directly to the git repo instead though.

In [3]:
torch.cuda.is_available()

True

In [4]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 128 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 300
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 100
n_embd = 128 # Increase hidden size
n_head = 8 # Adjust number of attention heads
n_layer = 12 # Increase number of layers

dropout = 0.2
# ------------

torch.manual_seed(1337)


text = text

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
 # generate a small batch of data of inputs x and targets y
 data = train_data if split == 'train' else val_data
 ix = torch.randint(len(data) - block_size, (batch_size,))
 x = torch.stack([data[i:i+block_size] for i in ix])
 y = torch.stack([data[i+1:i+block_size+1] for i in ix])
 x, y = x.to(device), y.to(device)
 return x, y

@torch.no_grad()
def estimate_loss():
 out = {}
 model.eval()
 for split in ['train', 'val']:
 losses = torch.zeros(eval_iters)
 for k in range(eval_iters):
 X, Y = get_batch(split)
 logits, loss = model(X, Y)
 losses[k] = loss.item()
 out[split] = losses.mean()
 model.train()
 return out

class Head(nn.Module):
 """ one head of self-attention """

 def __init__(self, head_size):
 super().__init__()
 self.key = nn.Linear(n_embd, head_size, bias=False)
 self.query = nn.Linear(n_embd, head_size, bias=False)
 self.value = nn.Linear(n_embd, head_size, bias=False)
 self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

 self.dropout = nn.Dropout(dropout)

 def forward(self, x):
 B,T,C = x.shape
 k = self.key(x) # (B,T,C)
 q = self.query(x) # (B,T,C)
 # compute attention scores ("affinities")
 wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
 wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
 wei = F.softmax(wei, dim=-1) # (B, T, T)
 wei = self.dropout(wei)
 # perform the weighted aggregation of the values
 v = self.value(x) # (B,T,C)
 out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
 return out

class MultiHeadAttention(nn.Module):
 """ multiple heads of self-attention in parallel """

 def __init__(self, num_heads, head_size):
 super().__init__()
 self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
 self.proj = nn.Linear(n_embd, n_embd)
 self.dropout = nn.Dropout(dropout)

 def forward(self, x):
 out = torch.cat([h(x) for h in self.heads], dim=-1)
 out = self.dropout(self.proj(out))
 return out

class FeedFoward(nn.Module):
 """ a simple linear layer followed by a non-linearity """

 def __init__(self, n_embd):
 super().__init__()
 self.net = nn.Sequential(
 nn.Linear(n_embd, 4 * n_embd),
 nn.ReLU(),
 nn.Linear(4 * n_embd, n_embd),
 nn.Dropout(dropout),
 )

 def forward(self, x):
 return self.net(x)

class Block(nn.Module):
 """ Transformer block: communication followed by computation """

 def __init__(self, n_embd, n_head):
 # n_embd: embedding dimension, n_head: the number of heads we'd like
 super().__init__()
 head_size = n_embd // n_head
 self.sa = MultiHeadAttention(n_head, head_size)
 self.ffwd = FeedFoward(n_embd)
 self.ln1 = nn.LayerNorm(n_embd)
 self.ln2 = nn.LayerNorm(n_embd)

 def forward(self, x):
 x = x + self.sa(self.ln1(x))
 x = x + self.ffwd(self.ln2(x))
 return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

 def __init__(self):
 super().__init__()
 # each token directly reads off the logits for the next token from a lookup table
 self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
 self.position_embedding_table = nn.Embedding(block_size, n_embd)
 self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
 self.ln_f = nn.LayerNorm(n_embd) # final layer norm
 self.lm_head = nn.Linear(n_embd, vocab_size)

 def forward(self, idx, targets=None):
 B, T = idx.shape

 # idx and targets are both (B,T) tensor of integers
 tok_emb = self.token_embedding_table(idx) # (B,T,C)
 pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
 x = tok_emb + pos_emb # (B,T,C)
 x = self.blocks(x) # (B,T,C)
 x = self.ln_f(x) # (B,T,C)
 logits = self.lm_head(x) # (B,T,vocab_size)

 if targets is None:
 loss = None
 else:
 B, T, C = logits.shape
 logits = logits.view(B*T, C)
 targets = targets.view(B*T)
 loss = F.cross_entropy(logits, targets)

 return logits, loss

 def generate(self, idx, max_new_tokens):
 # idx is (B, T) array of indices in the current context
 for _ in range(max_new_tokens):
 # crop idx to the last block_size tokens
 idx_cond = idx[:, -block_size:]
 # get the predictions
 logits, loss = self(idx_cond)
 # focus only on the last time step
 logits = logits[:, -1, :] # becomes (B, C)
 # apply softmax to get probabilities
 probs = F.softmax(logits, dim=-1) # (B, C)
 # sample from the distribution
 idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
 # append sampled index to the running sequence
 idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
 return idx

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

 # every once in a while evaluate the loss on train and val sets
 if iter % eval_interval == 0 or iter == max_iters - 1:
 losses = estimate_loss()
 print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

 # sample a batch of data
 xb, yb = get_batch('train')

 # evaluate the loss
 logits, loss = model(xb, yb)
 optimizer.zero_grad(set_to_none=True)
 loss.backward()
 optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


2.42567 M parameters
step 0: train loss 4.4474, val loss 4.4467
step 300: train loss 1.7789, val loss 1.7773
step 600: train loss 1.4613, val loss 1.4679
step 900: train loss 1.2493, val loss 1.2604
step 1200: train loss 1.1231, val loss 1.1440
step 1500: train loss 1.0568, val loss 1.0844
step 1800: train loss 1.0104, val loss 1.0401
step 2100: train loss 0.9701, val loss 1.0066
step 2400: train loss 0.9385, val loss 0.9754
step 2700: train loss 0.9122, val loss 0.9547
step 3000: train loss 0.8927, val loss 0.9387
step 3300: train loss 0.8747, val loss 0.9226
step 3600: train loss 0.8646, val loss 0.9148
step 3900: train loss 0.8546, val loss 0.9087
step 4200: train loss 0.8414, val loss 0.8990
step 4500: train loss 0.8352, val loss 0.8919
step 4800: train loss 0.8238, val loss 0.8827
step 4999: train loss 0.8193, val loss 0.8796
	 گروهر شده جوشن با يوز رخ سروه‏
 همى گور و ديده بيوق و تير همان غلت شاپور و چندى مپير
 هم اندر زمان غلعه فرخ اوست همه سال گردنده شد گيو اوست‏
 اگر سوگوارست 

In [5]:
torch.save(model.state_dict(), 'language_model.pth')

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

	 چو نزديك سام بلند بسالار تركان بجايش گزند
 فرامور بآتش از اندر بپاى توانه روان رهنماى بپاى‏
 سراسر يكى مرد زان در گزيد نهان گمان آرد نه نامين كشيد
 [ كه بهرام گفتش كه برداشت بجز باژ جز تخت و كشتى براشت‏]
 [ كه تا از آن داد نژاد بود بزرگ آور و دل پر از بود]
 [ شوم شند پيروز سا شاه ماه همه نامور تخت شاه و سپاه‏]
 سر بى‏قباى و نامه برش چو با ماه شد بى‏گناهش اوى‏
 پرستندگان گفت كامون شوى برم گفت رسم نجست از زوى اوى‏
 همه پاك بايست مهتران همه راى گفته بديدار زيان‏
 بفرمود تا مهر قارن نشست پى سر بسر بر بپر مهر دست‏
 بدان تا مبادا يكى پهلوان نداريد ما دانش جهان سر و جوان‏
 همى سخت شنگل اندر آيد بدرد بازان رزم را برانى دلي]
 [ پند آگازان بر گيو نوذر شايستار و ژويه باك‏]
 چو خورشيد زفتى هيونى گرفت بلند اندر آن شاه آن زينهارمت‏
 بفرمود تا سر بسر هم همه بروبرز و ماه آمدش بمشت‏
 بدو گفت كاى شهريار منست كجات كيان از پى نان نيز منست‏
 بفرمود تا جشن درنج و تخت تهمتن نشنريد ماهيم و بخت‏
 شاهنامه، ص: 31

 مرا نيز جنگ پآن انديشه رفت زره ساله جنگ بى‏غم در گرفت‏
 از ان ناپس بهرام بيداد من‏
 كه بر دوه با