SeemG committed on
Commit ba8a2e1 · verified · 1 Parent(s): 48492cd

Update model.py

Files changed (1)
  1. model.py +449 -483
model.py CHANGED
# GPT-3 Paper
import os
import math
import time
import inspect
from dataclasses import dataclass
import torch
import torch.nn as nn
import tiktoken
from torch.nn import functional as F


class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()

        # the embedding dimension must be divisible by the number of heads (needed for the reshape below)
        assert config.n_embd % config.n_head == 0

        # key, query, value projections for all heads, but in a batch. Each vector has the same dimension (C) as the input embedding.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)

        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

        # regularization
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # causal mask, kept for the commented-out manual attention path below
        # (F.scaled_dot_product_attention builds its own mask when is_causal=True)
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        # x: (B, T, C) activations from the residual stream (token + position embeddings at the first block)
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
        # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # manual attention, kept for reference:
        # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        # att = F.softmax(att, dim=-1)
        # y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

        # fused kernel: combines the dot product, scaling, masking and softmax in a single op
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True) # flash attention

        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
        # output projection
        y = self.c_proj(y)
        return y
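

# Quick illustrative shape check (not part of the original file): a SimpleNamespace with
# assumed toy values stands in for GPTConfig, which is only defined further below.
from types import SimpleNamespace
_demo_attn = CausalSelfAttention(SimpleNamespace(n_embd=32, n_head=4, block_size=8))
print(_demo_attn(torch.randn(2, 8, 32)).shape)  # -> torch.Size([2, 8, 32])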


class MLP(nn.Module):
    ## A simple multi-layer perceptron (MLP) sub-module.
    ## It is used within each transformer block for a position-wise non-linear transformation.

    def __init__(self, config):
        # expand then squeeze: 4x up-projection, non-linearity, projection back down
        super().__init__()
        # c_fc: projects the input (x) to a dimension four times larger than the embedding dimension (n_embd).
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)

        # GELU (Gaussian Error Linear Unit) activation function for non-linearity.
        # Here, the tanh approximation is used.
        self.gelu = nn.GELU(approximate='tanh')

        # projects the output back to the original embedding dimension (n_embd).
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        # linear layer (c_fc), GELU activation, then the final projection (c_proj)
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    # Combines the CausalSelfAttention layer and the MLP layer into a single transformer block.
    # This is a pre-norm block: layer normalization is applied before the attention and before the MLP,
    # and each sub-layer output is added back to the input through a residual connection.

    def __init__(self, config):
        super().__init__()

        # ln_1: layer normalization applied before the causal self-attention.
        # attn: an instance of CausalSelfAttention (defined above).
        # mlp: an instance of MLP (defined above).

        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        # residual connection around the attention sub-layer (preceded by ln_1),
        # then a second residual connection around the MLP sub-layer (preceded by ln_2)
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


@dataclass
class GPTConfig:
    block_size: int = 1024 # max sequence length
    vocab_size: int = 50304 # GPT-2's 50,257 tokens (50,000 BPE merges + 256 byte tokens + 1 <|endoftext|>), padded up to a multiple of 128 for efficiency
    n_layer: int = 12 # number of layers
    n_head: int = 12 # number of heads
    n_embd: int = 768 # embedding dimension


class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config

        # The transformer module dictionary contains the key components:
        # wte: word token embedding layer (nn.Embedding). Maps each token index to its embedding vector.
        # wpe: positional embedding layer (nn.Embedding). Adds positional information to the token embeddings.
        # h: a module list of Block instances (defined above), the core processing units of the transformer.
        # ln_f: final layer normalization (nn.LayerNorm) applied to the output of the transformer blocks.
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))

        # The language modeling head (lm_head) is a linear layer that projects the final hidden state
        # from the transformer to the vocabulary size, predicting the next token in the sequence.
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Weight sharing between the word token embedding layer (wte) and the language modeling head (lm_head).
        # This reduces the number of parameters and encourages the model to learn a representation
        # that works for both embedding and prediction.
        self.transformer.wte.weight = self.lm_head.weight

        # weight initialization via the custom function _init_weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                # scale down the init of residual projections by 1/sqrt(2 * n_layer),
                # since every block contributes two residual additions
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        # idx is of shape (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # forward the token and position embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb
        # forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)
        # forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        """Loads pretrained GPT-2 model weights from huggingface"""
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear,
        # so we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model
247
+
248
+ def configure_optimizers(self, weight_decay, learning_rate, device_type):
249
+ # start with all of the candidate parameters (that require grad)
250
+ param_dict = {pn: p for pn, p in self.named_parameters()}
251
+ param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
252
+ # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
253
+ # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
254
+ decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
255
+ nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
256
+ optim_groups = [
257
+ {'params': decay_params, 'weight_decay': weight_decay},
258
+ {'params': nodecay_params, 'weight_decay': 0.0}
259
+ ]
260
+ num_decay_params = sum(p.numel() for p in decay_params)
261
+ num_nodecay_params = sum(p.numel() for p in nodecay_params)
262
+
263
+ print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
264
+ print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
265
+ # Create AdamW optimizer and use the fused version if it is available
266
+ fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
267
+ use_fused = fused_available and device_type == "cuda"
268
+
269
+ print(f"using fused AdamW: {use_fused}")
270
+ optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
271
+ return optimizer
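
# Illustrative smoke test (not part of the original file): a deliberately tiny, assumed
# config keeps this cheap; the output shapes follow GPT.forward above.
_tiny_gpt = GPT(GPTConfig(block_size=32, vocab_size=128, n_layer=2, n_head=2, n_embd=32))
_tiny_idx = torch.randint(0, 128, (2, 16))
_tiny_logits, _tiny_loss = _tiny_gpt(_tiny_idx, targets=_tiny_idx)
print(_tiny_logits.shape, _tiny_loss.item())  # torch.Size([2, 16, 128]) and a scalar cross-entropy loss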

# model = GPT.from_pretrained('gpt2')

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
print(f"using device: {device}")

# SEED
torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

# STOP
# num_return_sequences = 5
# max_length = 30



# tiktoken is already imported at the top of the file; the GPT-2 BPE tokenizer is used below

class DataLoaderLite:
    def __init__(self, B, T):
        self.B = B
        self.T = T

        # at init load tokens from disk and store them in memory
        with open('input.txt', 'r') as f:
            text = f.read()
        enc = tiktoken.get_encoding('gpt2')
        tokens = enc.encode(text)
        self.tokens = torch.tensor(tokens)
        print(f'loaded {len(self.tokens)} tokens')
        print(f'1 epoch = {len(self.tokens) // (B * T)} batches')

        # state
        self.current_position = 0

    def next_batch(self):
        B, T = self.B, self.T
        buf = self.tokens[self.current_position: self.current_position + B * T + 1]
        x = (buf[:-1]).view(B, T) # inputs
        y = (buf[1:]).view(B, T) # targets
        # advance the position in the tensor
        self.current_position += B * T
        # if loading the next batch would be out of bounds, reset
        if self.current_position + (B * T + 1) > len(self.tokens):
            self.current_position = 0
        return x, y

# CHANGES IN CURRENT CODE
torch.set_float32_matmul_precision('high')
model = GPT(GPTConfig())
model.to(device)
# model = torch.compile(model)

# CODE UPDATE HERE: learning-rate schedule (linear warmup followed by cosine decay)
max_lr = 6e-4
min_lr = max_lr * 0.1
# warmup_steps = 100
# max_steps = 50

def get_lr(it, warmup_steps, max_steps):
    # 1) linear warmup for warmup_steps steps
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps
    # 2) after max_steps, return the minimum learning rate
    if it > max_steps:
        return min_lr
    # 3) in between, cosine decay down to the minimum learning rate
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff starts at 1 and decays to 0
    return min_lr + coeff * (max_lr - min_lr)
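
# Illustrative only (not part of the original training code): print a few points of the
# warmup + cosine schedule. The warmup/max step values here are arbitrary example numbers.
for _step in (0, 4, 9, 25, 50, 60):
    print(f"example lr at step {_step}: {get_lr(_step, warmup_steps=10, max_steps=50):.2e}")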


# NEW CODE (time is already imported at the top of the file)
train_loader = DataLoaderLite(B=8, T=512)

# train_loader = DataLoaderLite(B = B, T = T)
# quick sanity check of one batch
x, y = train_loader.next_batch()
print(x.shape, y.shape)


def run_train(max_steps=50, warmup_steps=100, PATH="/content/drive/MyDrive/S21/gpt_124M.pth"):
    # optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4, betas=(0.9, 0.95), eps=1e-8)
    optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device_type=device)
    for step in range(max_steps):
        t0 = time.time()
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        # NEW CODE ADDED HERE: mixed-precision forward pass
        with torch.autocast(device_type=device, dtype=torch.bfloat16):
            logits, loss = model(x, y)
        loss.backward()
        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # NEW CODE: set the learning rate for this step from the schedule
        lr = get_lr(step, warmup_steps=warmup_steps, max_steps=max_steps)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        optimizer.step()
        if device == 'cuda':
            torch.cuda.synchronize()
        t1 = time.time()
        dt = (t1 - t0) * 1000
        tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)
        print(f'step {step} | loss: {loss.item()} | dt: {dt:.2f}ms | tok/sec: {tokens_per_sec:.2f} | norm: {norm:.2f}')
    print(loss)
    torch.save(model.state_dict(), PATH)
    return model

def load_fromsaved(PATH="/content/drive/MyDrive/S21/gpt_124M.pth"):

    # Create a new GPT model instance
    model = GPT(GPTConfig())

    # Load the saved weights into the model (map_location keeps this working on CPU-only machines)
    model.load_state_dict(torch.load(PATH, map_location=device))

    # Print confirmation message
    print("Loaded model weights from:", PATH)
    model.to(device)

    return model


def gen_text(model, start_tokens, max_length=100, num_return_sequences=10):
    """
    Generates text using the loaded GPT model.

    Args:
        model: The GPT model to use for generation.
        start_tokens: The prompt text (a string); it is encoded with the GPT-2 tokenizer below.
        max_length: The maximum length (in tokens) of each generated sequence.
        num_return_sequences: The number of text sequences to generate.

    Returns:
        The generated sequences, concatenated into a single string.
    """
    enc = tiktoken.get_encoding('gpt2')
    tokens = enc.encode(start_tokens)
    tokens = torch.tensor(tokens, dtype=torch.long) # (prompt_len,)
    tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) # (num_return_sequences, prompt_len)
    x = tokens.to(device)

    # Set random seeds for consistent generation across runs
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    generated_text = ""
    while x.size(1) < max_length:
        # forward the model to get the logits
        with torch.no_grad():
            logits = model(x)[0] # (B, T, vocab_size)
        # take the logits at the last position
        logits = logits[:, -1, :] # (B, vocab_size)
        # get the probabilities
        probs = F.softmax(logits, dim=-1)
        # do top-k sampling of 50 (huggingface pipeline default)
        # topk_probs and topk_indices have shape (B, 50)
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
        # select a token from the top-k probabilities
        # note: multinomial does not demand the input to sum to 1
        ix = torch.multinomial(topk_probs, 1) # (B, 1)
        # gather the corresponding indices
        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
        # append to the sequence
        x = torch.cat((x, xcol), dim=1)

    # print and collect the generated text
    for i in range(num_return_sequences):
        tokens = x[i, :max_length].tolist()
        decoded = enc.decode(tokens)
        print(">", decoded)
        generated_text += decoded
    return generated_text
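

# Example driver (illustrative, not from the original commit): the checkpoint path,
# prompt text and step counts below are assumptions for demonstration purposes.
if __name__ == "__main__":
    trained = run_train(max_steps=50, warmup_steps=10, PATH="gpt_124M.pth")
    restored = load_fromsaved(PATH="gpt_124M.pth")
    samples = gen_text(restored, start_tokens="Hello, I'm a language model,", max_length=64, num_return_sequences=4)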