import math

import torch
import torch.nn as nn
from torch.nn import functional as F


class GPTModel(nn.Module):
    """Decoder-only transformer language model (GPT-style)."""

    def __init__(self, config, vocab_size):
        super().__init__()
        self.config = config
        # Token and learned absolute position embeddings
        self.token_embedding = nn.Embedding(vocab_size, config.n_embeds)
        self.position_embedding = nn.Embedding(config.block_size, config.n_embeds)
        # Stack of transformer blocks
        self.blocks = nn.ModuleList([
            TransformerBlock(config) for _ in range(config.n_layers)
        ])
        # Final layer norm and projection to vocabulary logits
        self.ln_f = nn.LayerNorm(config.n_embeds)
        self.lm_head = nn.Linear(config.n_embeds, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding(idx)                                    # (B, T, C)
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device))  # (T, C), broadcast over B
        x = tok_emb + pos_emb
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # Flatten batch and time dimensions for cross-entropy
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss


class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual connection."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embeds)
        self.ln2 = nn.LayerNorm(config.n_embeds)
        self.attn = MultiHeadAttention(config)
        self.mlp = FeedForward(config)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        # Self-attention with residual connection
        x = x + self.dropout(self.attn(self.ln1(x)))
        # FFN with residual connection
        x = x + self.dropout(self.mlp(self.ln2(x)))
        return x


class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention."""

    def __init__(self, config):
        super().__init__()
        self.n_heads = config.n_heads
        self.head_size = config.n_embeds // config.n_heads
        self.n_embeds = config.n_embeds
        # Single linear layer for Q, K, V projections
        self.c_attn = nn.Linear(config.n_embeds, 3 * config.n_embeds)
        self.c_proj = nn.Linear(config.n_embeds, config.n_embeds)
        self.dropout = nn.Dropout(config.dropout)
        # Causal mask to prevent attending to future tokens
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(config.block_size, config.block_size))
                 .view(1, 1, config.block_size, config.block_size),
        )

    def forward(self, x):
        B, T, C = x.shape
        # Calculate Q, K, V with a single linear projection
        q, k, v = self.c_attn(x).split(self.n_embeds, dim=2)
        # Reshape to (B, nh, T, hs)
        q = q.view(B, T, self.n_heads, self.head_size).transpose(1, 2)
        k = k.view(B, T, self.n_heads, self.head_size).transpose(1, 2)
        v = v.view(B, T, self.n_heads, self.head_size).transpose(1, 2)
        # Compute scaled dot-product attention scores
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_size))
        # Apply causal mask
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)
        # Apply attention to values
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        # Reshape and project back
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # (B, T, C)
        y = self.c_proj(y)
        return y


class FeedForward(nn.Module):
    """Position-wise MLP with a 4x hidden expansion and GELU activation."""

    def __init__(self, config):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(config.n_embeds, 4 * config.n_embeds),
            nn.GELU(),
            nn.Linear(4 * config.n_embeds, config.n_embeds),
            nn.Dropout(config.dropout),
        )

    def forward(self, x):
        return self.net(x)
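

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal smoke test that
# instantiates GPTModel with a small, hypothetical config and runs one forward
# pass. The field names (block_size, n_embeds, n_heads, n_layers, dropout)
# match what the classes above read from `config`; the concrete values and the
# GPTConfig dataclass itself are illustrative assumptions, not settings taken
# from the source.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from dataclasses import dataclass

    @dataclass
    class GPTConfig:
        block_size: int = 128   # maximum sequence length
        n_embeds: int = 64      # model width (must be divisible by n_heads)
        n_heads: int = 4        # attention heads
        n_layers: int = 2       # number of transformer blocks
        dropout: float = 0.1

    vocab_size = 100
    model = GPTModel(GPTConfig(), vocab_size)

    # Random token ids of shape (batch, time). In real training the targets
    # would be the inputs shifted by one position; random ids suffice to
    # exercise the loss computation here.
    idx = torch.randint(0, vocab_size, (2, 32))
    targets = torch.randint(0, vocab_size, (2, 32))
    logits, loss = model(idx, targets)
    print(logits.shape)  # torch.Size([2, 32, 100])
    print(loss.item())   # roughly ln(vocab_size) at initialization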