Alexandre D-Julin committed
Commit aa60148 · 1 Parent(s): 1c4fb0f
Files changed (5)
  1. .gitattributes +1 -0
  2. README.md +2 -0
  3. bigram_model.py +307 -0
  4. gradio_app.py +38 -0
  5. lafontaine_gpt_v1.pth +3 -0
.gitattributes ADDED
@@ -0,0 +1 @@
+ *.pth filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,2 @@
+ # lafontaine-gpt
+ A rudimentary GPT model trained on the Fables de La Fontaine.
bigram_model.py ADDED
@@ -0,0 +1,307 @@
+ import os
+ import torch
+ import onnx
+ import torch.nn as nn
+ from torch.nn import functional as F
+ from datetime import datetime
+ torch.manual_seed(1337)  # for reproducibility
+
+ SEP = 50 * '-'
+
+ # hyperparameters ----------------------------------------------------------------------------------
+ batch_size = 64  # how many independent sequences we process in parallel
+ block_size = 256  # maximum context length for predictions
+ max_iters = 5000  # how many iterations to train for
+ eval_interval = 500  # how often to evaluate the model
+ learning_rate = 3e-4  # how fast we update the weights; lower the learning rate as the model gets bigger
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use the GPU if available
+ eval_iters = 200  # how many batches to average for evaluation
+ n_embd = 384  # number of embedding dimensions
+ n_head = 6  # number of self-attention heads
+ n_layer = 6  # number of transformer blocks
+ dropout = 0.2  # dropout rate
+
+ # dataset ------------------------------------------------------------------------------------------
+ dataset_path = 'dataset/tiny-lafontaine.txt'
+ with open(dataset_path, 'r', encoding='utf-8') as f:
+     text = f.read()
+
+ # here are all the unique characters that occur in this text
+ chars = sorted(list(set(text)))
+ vocab_size = len(chars)
+
+ # create a mapping from characters to integers
+ stoi = {ch: i for i, ch in enumerate(chars)}  # chars -> ints table
+ itos = {i: ch for i, ch in enumerate(chars)}  # ints -> chars table
+ encode = lambda s: [stoi[c] for c in s]  # encoder: takes a string, outputs a list of integers
+ decode = lambda l: ''.join([itos[i] for i in l])  # decoder: takes a list of integers, outputs a string
+
+ # train and test splits
+ data = torch.tensor(encode(text), dtype=torch.long)
+ n = int(0.9 * len(data))  # the first 90% of the data is the training set, the rest is the validation set
+ train_data = data[:n]
+ val_data = data[n:]
+
+
+ # data loading -------------------------------------------------------------------------------------
+ def get_batch(split):
+     # Generate a small batch of data of inputs x and targets y
+     data = train_data if split == 'train' else val_data  # choose the split
+     ix = torch.randint(len(data) - block_size, (batch_size,))  # sample random starting indices for the sequences
+     x = torch.stack([data[i: i + block_size] for i in ix])  # create a batch of context windows
+     y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])  # create a batch of targets, shifted one step forward
+     x, y = x.to(device), y.to(device)  # move the data to the device
+     return x, y
+
+
+ @torch.no_grad()  # reduces memory consumption: nothing in this block calls backward, so no gradients are tracked
+ def estimate_loss(model):
+     out = {}  # store the losses for the train and val splits
+     model.eval()  # switch to evaluation mode
+     for split in ['train', 'val']:  # iterate over both splits
+         losses = torch.zeros(eval_iters)  # store the loss for each batch
+         for k in range(eval_iters):  # iterate over the number of batches
+             X, Y = get_batch(split)  # get a batch of data
+             _, loss = model(X, Y)  # compute the loss
+             losses[k] = loss.item()  # store the loss
+         out[split] = losses.mean()  # store the average loss for the split
+     model.train()  # switch back to training mode
+     return out  # return the losses
+
+
+ # self-attention head ------------------------------------------------------------------------------
+ class Head(nn.Module):
+
+     def __init__(self, head_size):
+         super().__init__()
+         self.key = nn.Linear(n_embd, head_size, bias=False)  # key projection
+         self.query = nn.Linear(n_embd, head_size, bias=False)  # query projection
+         self.value = nn.Linear(n_embd, head_size, bias=False)  # value projection
+         self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))  # causal mask
+         self.dropout = nn.Dropout(dropout)  # dropout layer
+
+     def forward(self, x):
+         B, T, C = x.shape
+         k = self.key(x)  # (B, T, head_size)
+         q = self.query(x)  # (B, T, head_size)
+         # compute attention scores ("affinities")
+         wei = q @ k.transpose(-2, -1) * C**-0.5  # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
+         wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
+         wei = F.softmax(wei, dim=-1)  # (B, T, T)
+         wei = self.dropout(wei)  # apply dropout
+         # perform the weighted aggregation of the values
+         v = self.value(x)
+         out = wei @ v  # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
+         return out
+
+
+ # multi-head attention -----------------------------------------------------------------------------
+ class MultiHeadAttention(nn.Module):
+     """multiple heads of self-attention in parallel"""
+
+     def __init__(self, num_heads, head_size):
+         super().__init__()
+         self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])  # create num_heads heads
+         self.proj = nn.Linear(n_embd, n_embd)  # linear projection back to the original dimension
+
+     def forward(self, x):
+         out = torch.cat([h(x) for h in self.heads], dim=-1)  # concatenate the outputs of each head
+         out = self.proj(out)  # linear projection back to the original dimension
+         return out
+
+
+ # feedforward block --------------------------------------------------------------------------------
+ class FeedForward(nn.Module):
+     """a simple linear layer followed by a non-linearity"""
+
+     def __init__(self, n_embd):
+         super().__init__()  # call the constructor of the parent class
+         self.net = nn.Sequential(
+             nn.Linear(n_embd, 4 * n_embd),  # linear layer
+             nn.ReLU(),  # activation function
+             nn.Linear(4 * n_embd, n_embd),  # projection layer back to the original dimension
+             nn.Dropout(dropout),  # dropout layer
+         )
+
+     def forward(self, x):
+         return self.net(x)  # apply the feedforward block
+
+
+ # transformer block --------------------------------------------------------------------------------
+ class Block(nn.Module):
+     """Transformer block: communication followed by computation"""
+
+     def __init__(self, n_embd, n_head):
+         # n_embd: embedding dimension, n_head: number of heads we'd like
+         super().__init__()
+         head_size = n_embd // n_head  # size of each self-attention head
+         self.sa = MultiHeadAttention(n_head, head_size)  # self-attention layer
+         self.ffwd = FeedForward(n_embd)  # feedforward block
+         self.ln1 = nn.LayerNorm(n_embd)  # layer normalization
+         self.ln2 = nn.LayerNorm(n_embd)  # layer normalization
+
+     def forward(self, x):
+         x = x + self.sa(self.ln1(x))  # self-attention block with pre-layer normalization and a residual connection
+         x = x + self.ffwd(self.ln2(x))  # feedforward block with pre-layer normalization and a residual connection
+         return x
+
+
+ # simple bigram model ------------------------------------------------------------------------------
+ class BigramLanguageModel(nn.Module):
+
+     def __init__(self):
+         super().__init__()
+         # each token directly reads off the logits for the next token from a lookup table
+         self.token_embedding_table = nn.Embedding(vocab_size, n_embd)  # token embeddings
+         self.position_embedding_table = nn.Embedding(block_size, n_embd)  # positional embeddings
+         self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])  # stack of transformer blocks
+         self.ln_f = nn.LayerNorm(n_embd)  # final layer normalization
+         self.lm_head = nn.Linear(n_embd, vocab_size)  # output layer
+
+     def forward(self, idx, targets=None):
+         B, T = idx.shape
+
+         # idx and targets are both (B, T) tensors of integers
+         tok_emb = self.token_embedding_table(idx)  # (B, T, C) = Batch, Time (block_size), Channels (n_embd)
+         pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
+         x = tok_emb + pos_emb  # (B, T, C)
+         x = self.blocks(x)  # apply the transformer blocks, multiple layers of self-attention and feedforward, (B, T, C)
+         x = self.ln_f(x)  # apply the final layer normalization, (B, T, C)
+         logits = self.lm_head(x)  # decoder head (B, T, vocab_size)
+
+         if targets is None:  # if we don't have targets, we can't compute the loss
+             loss = None
+
+         else:
+             # reshape the logits to (B*T, C) and the targets to (B*T) so we can compute the loss
+             B, T, C = logits.shape  # unpack batch, time, channels
+             logits = logits.view(B * T, C)  # flatten the batch and time dimensions
+             targets = targets.view(B * T)  # flatten the batch and time dimensions
+
+             # compute the loss using cross entropy = quality of the logits with respect to the targets
+             loss = F.cross_entropy(logits, targets)
+
+         return logits, loss
+
+     def generate(self, idx, max_new_tokens):
+         # idx is a (B, T) array of indices in the current context
+         for _ in range(max_new_tokens):
+             # crop idx to the last block_size tokens
+             idx_cond = idx[:, -block_size:]  # (B, T)
+             # get the predictions
+             logits, loss = self(idx_cond)  # (B, T, C); calling the model invokes forward in PyTorch
+             # focus only on the last time step
+             logits = logits[:, -1, :]  # becomes (B, C)
+             # apply softmax to get probabilities
+             probs = F.softmax(logits, dim=-1)  # (B, C)
+             # sample from the distribution
+             idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
+             # append sampled index to the running sequence
+             idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
+
+         return idx
+
+
+ # train model --------------------------------------------------------------------------------------
+ def train_model():
+     # create the model and optimizer
+     model = BigramLanguageModel()
+     m = model.to(device)  # move the model to the device (cuda if available)
+
+     # create a PyTorch optimizer
+     optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)  # AdamW is a good optimizer for transformers
+
+     # training loop ----------------------------------------------------------------------------------
+     for iter in range(max_iters):
+
+         # every once in a while evaluate the loss on the train and val sets
+         if iter % eval_interval == 0:
+             losses = estimate_loss(m)
+             print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
+
+         # sample a batch of data
+         xb, yb = get_batch('train')
+
+         # evaluate the loss
+         _, loss = m(xb, yb)  # call the model with the inputs and the targets
+         optimizer.zero_grad(set_to_none=True)  # clear previous gradients
+         loss.backward()  # compute new gradients
+         optimizer.step()  # update the weights
+
+     # generate from the model
+     context = torch.zeros((1, 1), dtype=torch.long, device=device)  # initialize the context with a single token
+     print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))  # generate 500 new tokens
+
+     # save model
+     save_model(model)
+
+     return m
+
+
+ # save model ---------------------------------------------------------------------------------------
+ def save_model(model, save_path=None):
+     try:
+         if save_path is None:
+             filename = os.path.splitext(os.path.basename(__file__))[0]
+             timestamp = datetime.now().strftime('%y%m%d_%H%M')
+             save_path = f'{filename}_{timestamp}.pth'
+
+         torch.save(model.state_dict(), save_path)
+         print(f"Model saved to {save_path}.")
+         return save_path
+
+     except Exception as e:
+         print(f"Error saving the model: {e}")
+
+
+ # load model ---------------------------------------------------------------------------------------
+ def load_model(model_path):
+     try:
+         # Load the model
+         device = 'cuda' if torch.cuda.is_available() else 'cpu'
+         model = BigramLanguageModel().to(device)
+         model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
+         print(f"Model loaded from {model_path}.")
+         return model
+
+     except Exception as e:
+         print(f"Error loading the model: {e}")
+
+
+ # run inference ------------------------------------------------------------------------------------
+ def run_inference(model, max_tokens=500):
+     # Set to evaluation mode
+     model.eval()
+     # Define a starting context and run inference
+     context = torch.zeros((1, 1), dtype=torch.long, device=device)  # Initialize with a single token
+     generated_sequence = model.generate(context, max_tokens)  # Generate text
+     generated_text = decode(generated_sequence[0].tolist())  # Decode the generated indices to text
+     return generated_text
+
+
+ # export model to onnx format ----------------------------------------------------------------------
+ def export_onnx_model(pt_model, onnx_path):
+     try:
+         # Dummy input tensor of the same shape as the training input
+         dummy_input = torch.zeros((1, 256), dtype=torch.long).to(device)  # Example input shape
+
+         # Export the model to ONNX format
+         torch.onnx.export(
+             pt_model,  # the trained model
+             dummy_input,  # example input tensor
+             onnx_path,  # output file path
+             input_names=["input"],  # input layer names
+             output_names=["output"],  # output layer names
+             dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},  # dynamic axis support
+             opset_version=13  # target ONNX opset version
+         )
+
+         print(f"Model exported to {onnx_path}.")
+
+     except Exception as e:
+         print(f"Error exporting the onnx model: {e}")
+
+
+ if __name__ == '__main__':
+
+     # train model
+     model = train_model()
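
Note on reuse: the script only trains when it is run directly; its helpers can also be imported. A minimal sketch of loading the checkpoint added in this commit and sampling from it (run from the repo root so that dataset/tiny-lafontaine.txt resolves, and assuming the checkpoint matches the model definition above):

# sketch: load the committed checkpoint and generate text with the helpers above
from bigram_model import load_model, run_inference

model = load_model('lafontaine_gpt_v1.pth')  # checkpoint added in this commit
if model is not None:  # load_model returns None if loading fails
    print(run_inference(model, max_tokens=300))  # print roughly 300 newly generated characters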
gradio_app.py ADDED
@@ -0,0 +1,38 @@
+ import gradio as gr
+ import torch
+ from bigram_model import BigramLanguageModel, encode, decode
+
+
+ class GradioInterface:
+     def __init__(self, model_path=None):
+         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+         self.model = self.load_model(model_path)
+         self.model.eval()
+
+     def load_model(self, model_path):
+         model = BigramLanguageModel().to(self.device)
+         if model_path:
+             model.load_state_dict(torch.load(model_path, map_location=self.device))
+         return model
+
+     def generate_text(self, input_text, max_tokens=100):
+         context = torch.tensor([encode(input_text)], dtype=torch.long, device=self.device)
+         output = self.model.generate(context, max_new_tokens=int(max_tokens))  # cast, since the slider may pass a float
+         return decode(output[0].tolist())
+
+
+ # Load the model
+ model_path = "models/lafontaine_gpt_v8_241011_1307.pth"
+ model_interface = GradioInterface(model_path)
+
+ # Define Gradio interface
+ gr_interface = gr.Interface(
+     fn=model_interface.generate_text,
+     inputs=["text", gr.Slider(50, 500)],
+     outputs="text",
+     description="Bigram Language Model text generation. Enter some text, and the model will continue it.",
+     examples=[["Once upon a time", 100]]  # one value per input component
+ )
+
+ # Launch the interface
+ gr_interface.launch()
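
Note: once gradio_app.py is running, the interface can also be queried programmatically. A minimal sketch using the separate gradio_client package (an assumption; it is not part of this commit) — the URL and api_name below are Gradio's defaults for a locally launched Interface, and the prompt is illustrative:

# sketch: query the running Gradio app from another process
from gradio_client import Client

client = Client("http://127.0.0.1:7860")  # default local address used by launch()
result = client.predict("Maître Corbeau, sur un arbre perché,", 200, api_name="/predict")  # (input_text, max_tokens)
print(result)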
lafontaine_gpt_v1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2649c070cc2cff979d023004f9c52d97b49e09c3bc1c5634b6131cf4418db1f
+ size 52731146