jezthejirzaman starship006 committed on
Commit 1013b6b · 0 Parent(s):

Duplicate from starship006/mini_shakespeare

Co-authored-by: Cody Rushing <[email protected]>

Files changed (6)
  1. .gitattributes +34 -0
  2. README.md +13 -0
  3. app.py +632 -0
  4. requirements.txt +8 -0
  5. shakespeare.txt +0 -0
  6. toInfer.pt +3 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Mini Shakespeare
+ emoji: 🐢
+ colorFrom: gray
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 3.12.0
+ app_file: app.py
+ pinned: false
+ duplicated_from: starship006/mini_shakespeare
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,632 @@
+ # %% [markdown]
+ # <a href="https://colab.research.google.com/github/starship006/ARENA-work/blob/main/w1/w1d4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
+
+ # %% [markdown]
+ # # Training Shakespeare Himself
+
+ # %% [markdown]
+ # For those who are not part of the ARENA program and are curious as to what this is, this was my first significant AI/ML project! I made components for a decoder-only transformer, and trained it on a corpus consisting of text from Shakespeare. Scroll to the bottom to see some output :)
+ # %%
+ import torch as t
+ import numpy as np
+ from torch import nn
+ import fancy_einsum as einsum
+ import einops
+ import pandas as pd
+
+
+ # %% [markdown]
+ # ## transformer functions
+ #
+ #
+
+ # %% [markdown]
+ # This will be from the transformer components I made earlier this week, but I'll put down optimizations so it can use the GPU.
+ #
+ # And I did just that. The speed improvements are MASSIVE, wow!
+
+ # %%
+ device = t.device("cuda:0" if t.cuda.is_available() else "cpu")
+ t.cuda.is_available()
+
+ # %%
+ def multihead_masked_attention(Q: t.Tensor, K: t.Tensor, V: t.Tensor, num_heads: int):
+     '''
+     Implements multihead masked attention on the matrices Q, K and V.
+
+     Q: shape (batch, seq_len, nheads*headsize)
+     K: shape (batch, seq_len, nheads*headsize)
+     V: shape (batch, seq_len, nheads*headsize)
+     '''
+
+     Q = einops.rearrange(Q, 'b s (n h) -> b n s h', n = num_heads)
+     K = einops.rearrange(K, 'b s (n h) -> b n s h', n = num_heads)
+     V = einops.rearrange(V, 'b s (n h) -> b n s h', n = num_heads)
+
+
+     scores = einsum.einsum('b n k h, b n s h -> b n s k', K, Q)
+     assert scores.shape == t.Size([Q.shape[0], num_heads,Q.shape[2], K.shape[2]])
+
+     scores = scores / np.sqrt(Q.shape[-1])
+     attention = scores + t.triu(t.ones_like(scores,device = device) * float("-inf"), diagonal=1) # THIS IS STOLEN FROM JAY - testing it out
+     softed = t.softmax(attention,dim=-1)
+     result = einsum.einsum('batch numheads seqQ seqK, batch numheads seqK headsize -> batch numheads seqQ headsize',softed, V)
+     return einops.rearrange(result, 'batch numheads seqQ headsize -> batch seqQ (numheads headsize)')
+
+ # %%
+ class MultiheadMaskedAttention(nn.Module):
+     W_QKV: nn.Linear
+     W_O: nn.Linear
+
+     def __init__(self, hidden_size: int, num_heads: int):
+         super().__init__()
+         self.hidden_size = hidden_size
+         self.num_heads = num_heads
+         self.head_size = hidden_size // num_heads
+
+         self.WQKV = t.nn.Linear(self.hidden_size, 3 * hidden_size) # TODO: why do we use a linear layer here? aren't they matrices?
+         self.W0 = t.nn.Linear(self.hidden_size, self.hidden_size)
+
+     def forward(self, x: t.Tensor) -> t.Tensor:
+         '''
+         x: shape (batch, seq, hidden_size)
+
+         Return: shape (batch, seq, hidden_size)
+         '''
+         #print("YO?")
+         x = x.float() # seems like it needs to be a float!
+         QKV = self.WQKV(x)
+         Q = QKV[:,:,:self.hidden_size]
+         K = QKV[:,:,self.hidden_size:self.hidden_size * 2]
+         V = QKV[:,:,self.hidden_size * 2:]
+         assert Q.shape == K.shape == V.shape == x.shape
+         return self.W0(multihead_masked_attention(Q,K,V,self.num_heads))
+
+ # %%
86
+ from dataclasses import dataclass
87
+
88
+ @dataclass(frozen=True)
89
+ class TransformerConfig:
90
+ '''Constants used throughout your decoder-only transformer model.'''
91
+
92
+ num_layers: int
93
+ num_heads: int
94
+ vocab_size: int
95
+ hidden_size: int
96
+ max_seq_len: int
97
+ dropout: float = 0.1
98
+ layer_norm_epsilon: float = 1e-05
99
+
100
+ # %%
101
+ # from yesterday
102
+ class PositionalEncoding(nn.Module):
103
+
104
+ def __init__(self, embedding_dim: int, max_seq_len: int = 5000):
105
+ super().__init__()
106
+ self.dim = embedding_dim
107
+ self.length = max_seq_len
108
+
109
+ # mostly copied. i understand this, just need to work on
110
+ # making more tensors and getting more exposure to methods of making tensors
111
+ def P (delta):
112
+ n = 10000 # hardcoded
113
+ d = embedding_dim
114
+ l = max_seq_len
115
+ sin_array = np.sin(delta / n ** (2 * np.arange(d//2) / d))
116
+ cos_array = np.cos(delta / n ** (2 * np.arange(d//2) / d))
117
+
118
+ array = np.zeros(d)
119
+ array[::2] = sin_array
120
+ array[1::2] = cos_array
121
+
122
+ return array
123
+
124
+ tokenArray = []
125
+ for i in range(max_seq_len):
126
+ tokenArray.append(P(i)) # changed from previous design
127
+
128
+ self.multMax = t.tensor(np.array(tokenArray), dtype=t.float, device = device)
129
+
130
+
131
+ def forward(self, x: t.Tensor) -> t.Tensor:
132
+ '''
133
+ x: shape (batch, seq_len, embedding_dim)
134
+ '''
135
+ return x + self.multMax[:x.shape[1]]
136
+
137
+
138
+ # %%
139
+ class MLP(nn.Module):
140
+ def __init__(self, config: TransformerConfig):
141
+ super().__init__()
142
+ self.hidden_size = config.hidden_size
143
+
144
+ self.layers = nn.Sequential(
145
+ nn.Linear(self.hidden_size, self.hidden_size * 4),
146
+ nn.GELU(),
147
+ nn.Linear(self.hidden_size * 4, self.hidden_size),
148
+ nn.Dropout(config.dropout)
149
+ )
150
+ def forward(self, x: t.Tensor):
151
+ x = x.float() # seems like it needs to be a float!
152
+ return self.layers(x).float() # ima do the same thing again!
153
+
154
+
155
+ # %%
156
+ class DecoderBlock(nn.Module):
157
+
158
+ def __init__(self, config: TransformerConfig):
159
+ super().__init__()
160
+ self.attentionBlock = nn.Sequential(
161
+ MultiheadMaskedAttention(config.hidden_size, config.num_heads),
162
+ nn.LayerNorm(config.hidden_size)
163
+ )
164
+ self.MLP = nn.Sequential(
165
+ MLP(config),
166
+ nn.LayerNorm(config.hidden_size)
167
+ )
168
+
169
+ def forward(self, x: t.Tensor) -> t.Tensor:
170
+ partOne = x + self.attentionBlock(x)
171
+ return (partOne + self.MLP(partOne)).float() # seems like it needs to be a float!
172
+
173
+
+ # %%
+ class DecoderOnlyTransformer(nn.Module):
+
+     def __init__(self, config: TransformerConfig):
+         super().__init__()
+         self.tokenize = nn.Embedding(config.vocab_size, config.hidden_size).to(device)
+         self.positionize = PositionalEncoding(config.hidden_size,config.max_seq_len)
+         self.restModel = nn.Sequential(
+             nn.Dropout(config.dropout),
+             *[DecoderBlock(config) for i in range(config.num_layers)],
+             nn.LayerNorm(config.hidden_size),
+         )
+         self.unembed = self.tokenize.weight.T.to(device)
+
+     def forward(self, x: t.Tensor) -> t.Tensor:
+         x = self.tokenize(x)
+         x = self.positionize(x)
+         toUnembed = self.restModel(x).to(device)
+         return toUnembed @ self.unembed # project back onto the vocabulary via the tied embedding matrix
+
+ # %% [markdown]
+ # ## Data Prep
+
+ # %% [markdown]
+ # Make the dataset that parses through all of the words
+
+ # %%
+ import re
+ from torch.utils.data import Dataset, DataLoader
+ from torchvision import datasets
+
+ class CustomTextDataset(Dataset):
+     def __init__(self, words, seq_len, fractionOfWords):
+         self.fractionOfWords = fractionOfWords
+         self.words = words
+         self.setOfWords = set(words)
+         self.seq_len = seq_len
+         self.max_len = len(self.words) - (self.seq_len + 1)
+         self.vocab_size = len(self.setOfWords)
+         self.word_to_token = {word: idx for (idx, word) in enumerate(sorted(self.setOfWords))}
+         self.token_to_word = {idx: word for (idx, word) in enumerate(sorted(self.setOfWords))}
+         self.allTokens = t.tensor([self.word_to_token[word] for word in self.words],device = device)
+
+         if (self.fractionOfWords > 0.9):
+             print("Probably don't do this. Errors may abound")
+
+     def __len__(self):
+         return int(self.max_len * self.fractionOfWords)
+
+     def __getitem__(self, idx):
+         tokens = self.allTokens[idx:idx + self.seq_len + 1]
+         input = tokens[:-1]
+         target = tokens[1:]
+         return input, target
+
+     def getDataSize(self):
+         return self.vocab_size
+
+     def convertToTokens(self, phrase: list) -> t.tensor:
+         return t.tensor([self.word_to_token[word] for word in phrase],device = device)
+
+     def convertStringToTokenList(self, phrase: str) -> list:
+         words = re.split(r"\b", phrase)
+         return [self.word_to_token[word] for word in words]
+
+     def convertToText(self, tokens: t.tensor):
+         temp = []
+         for i, value in enumerate(tokens):
+             #print(value.item())
+             temp.append(self.token_to_word[value.item()])
+         return temp
+
+     def decodeList(self, words: list):
+         temp = []
+         for value in words:
+             temp.append(self.token_to_word[value])
+         return temp
+
+     def listToString(self, words: list) -> str:
+         temp = ""
+         for word in words:
+             temp = temp + word
+         return temp
+
+ # %%
+ file = open("shakespeare.txt")
+ text = file.read()
+ words = re.split(r"\b", text)
+
+ fractionOfWords = 0.1 # what percent of the corpus to train on
+
+
+ lengthOfSeq = 100
+
+ shak = CustomTextDataset(words, lengthOfSeq, fractionOfWords)
+
+ # %% [markdown]
+ # ## Running this data through a transformer
+
+ # %%
+ trainloader = DataLoader(shak, batch_size=32,shuffle=True)
+
+ # this specific one trained for 24 minutes and 9 seconds on colab GPU
+
+ thisConfig = TransformerConfig(
+     num_layers = 4, # 6 layers in the Attention paper
+     num_heads = 4, # 8 heads in Attention paper
+     vocab_size = trainloader.dataset.getDataSize(), # 37000 tokens in Attention paper (?)
+     hidden_size = 512, # recall that this = num_heads * headsize | 512 is the embedding dim used in Attention paper
+     max_seq_len = lengthOfSeq,
+     dropout = 0.1, # same as Attention paper
+     layer_norm_epsilon=0.00001
+ )
+
+
+
+
+ # %%
+ use_pretrained = True
+ if use_pretrained:
+     print("Using Pre-trained Model!")
+     myTransformer = DecoderOnlyTransformer(thisConfig).to(device)
+     optimizer = t.optim.Adam(myTransformer.parameters(), lr = 1e-3)
+     criterion = nn.CrossEntropyLoss().to(device)
+     myTransformer.load_state_dict(t.load("toInfer.pt", map_location=device))
+     myTransformer.eval()
+ else:
+     print("Training Model... better hope you got enough GPU!")
+     myTransformer = DecoderOnlyTransformer(thisConfig).to(device)
+     optimizer = t.optim.Adam(myTransformer.parameters(), lr = 1e-3)
+     criterion = nn.CrossEntropyLoss().to(device)
+     NUM_EPOCHS = 1
+
+     losses = []
+     myTransformer.train()
+     for epoch in range(1, NUM_EPOCHS + 1):
+         for inputs, targets in trainloader:
+             outputs = myTransformer(inputs).to(device)
+             targets = t.nn.functional.one_hot(targets, num_classes=trainloader.dataset.getDataSize()).float().to(device)
+
+             outputs = einops.rearrange(outputs, 'batch seq vocab -> (batch seq) vocab')
+             targets = einops.rearrange(targets, 'batch seq vocab -> (batch seq) vocab')
+
+             outputs = outputs.to(device)
+             targets = targets.to(device)
+             loss = criterion(outputs,targets).to(device)
+
+             losses.append(loss.item())
+
+             loss.backward()
+             optimizer.step()
+             optimizer.zero_grad()
+
+
+ # %%
+ if not use_pretrained:
+     df = pd.DataFrame(losses)
+     df.plot()
+
+ # %%
+ # quick test - use the sample method if you wish to actually use the transformer:
+
+ myTransformer.eval()
+
+ testPhrase = ["Be", " ", "not", " ", "afraid", " ", "to", " ", "the", " ", "Florentine", "\n",
+               "And"]
+ input = shak.convertToTokens(testPhrase)
+ input = input[None, :]
+ tokens = myTransformer(input).argmax(dim=-1)[0]
+ #print(tokens)
+ shak.convertToText(tokens)
+
+ # %% [markdown]
+ # # Sampling
+
+ # %%
+ def apply_sampling_methods(input_ids: t.Tensor, logits: t.Tensor, temperature=1.0, freq_penalty=0.0, top_k=0, top_p=0.0) -> int:
+     # returns a next token based on provided sampling method
+     # thanks callum for this method
+     assert input_ids.ndim == 1, "input_ids should be a 1D sequence of token ids"
+     assert temperature >= 0, "Temperature should be non-negative"
+     assert 0 <= top_p <= 1.0, "Top-p must be a probability"
+     assert 0 <= top_k, "Top-k must be non-negative"
+     assert not (top_p != 0 and top_k != 0), "At most one of top-p and top-k supported"
+
+     if temperature == 0:
+         return greedy_search(logits)
+     if temperature != 1.0:
+         logits = apply_temperature(logits, temperature)
+     if freq_penalty != 0.0:
+         logits = apply_freq_penalty(input_ids, logits, freq_penalty)
+     if top_k > 0:
+         return sample_top_k(logits, top_k)
+     if top_p > 0:
+         return sample_top_p(logits, top_p)
+     return sample_basic(logits)
+
+
+ def sample_tokens(
+     model,
+     encodeMethod,
+     decodeMethod,
+     initial_text: str,
+     max_tokens_generated = 40,
+     **kwargs) -> list:
+     # samples tokens until model outputs eos_token_id or token limit reached
+
+
+
+
+
+     model.eval()
+     input_ids: list = encodeMethod(initial_text)
+     generated_ids = []
+     device = next(model.parameters()).device #what is next doing here?
+
+     tokens_to_generate = max_tokens_generated - len(input_ids)
+     for _ in range(tokens_to_generate):
+         #print(input_ids + generated_ids)
+         new_input_ids = t.tensor(input_ids + generated_ids, dtype=t.int64, device=device)
+         #print(new_input_ids.unsqueeze(0).shape)
+         logits = model(new_input_ids.unsqueeze(0))[0, -1]
+         #print(logits.shape)
+         new_token = apply_sampling_methods(new_input_ids, logits, **kwargs)
+         generated_ids.append(new_token)
+
+
+     return decodeMethod(input_ids + generated_ids)
+
+
+ # quick test:
+
+ myTransformer.eval()
+
+ testPhrase = ["Be", " ", "not", " ", "afraid", " ", "to", " ", "the", " ", "Florentine", "\n",
+               "And"]
+ input = shak.convertToTokens(testPhrase)
+ type(input)
+
+
+ # %%
+ def greedy_search(logits):
+     '''
+     returns the most likely next token, BUT THE TIEBREAKER IS INCORRECT!
+     i got lazy - it *is* deterministic, but it just doesn't necessarily
+     choose the smallest word out of the tie. perhaps treat it as a symbol
+     of my ingenuity?
+     '''
+     return logits.argmax(dim=-1).item()
+
+ # %%
+ def sample_basic(logits) -> int:
+     '''
+     samples from the distributions, possibly with temp and freq changes applied
+
+     logits: shape (vocab_size, ) - unnormalized log-probabilities
+
+     return: a sampled token
+     '''
+     probs = t.distributions.categorical.Categorical(logits=logits)
+     return probs.sample().item()
+
+ N = 20000
+ probs = t.linspace(0, 0.4, 5)
+ unnormalized_logits = probs.log() + 1.2345
+ samples = t.tensor([sample_basic(unnormalized_logits) for _ in range(N)])
+ counts = t.bincount(samples, minlength=len(probs)) / N
+ print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
+ #t.testing.assert_close(counts, probs, atol=0.01, rtol=0)
+ print("Tests passed!")
+
+ # %%
+ def apply_freq_penalty(input_ids: t.Tensor, logits: t.Tensor, freq_penalty: float) -> t.Tensor:
+     '''
+     input_ids: shape (seq, )
+     logits: shape (vocab_size, )
+
+     Return: shape (vocab_size, )
+     '''
+     (vocab_size,) = logits.shape
+     id_freqs = t.bincount(input_ids, minlength=vocab_size)
+     return logits - freq_penalty * id_freqs
+
+ bieber_prompt = "And I was like baby, baby, baby, oh Like, baby, baby, baby, no Like, baby, baby, baby, oh I thought you'd always be mine, mine"
+ input_ids = shak.convertStringToTokenList(bieber_prompt)
+ logits = t.ones(shak.getDataSize()).to(device)
+ penalized_logits = apply_freq_penalty(t.tensor(input_ids).to(device), logits, 2.0)
+ #i believe mine is different!
+ #assert penalized_logits[5156].item() == -11, "Expected 6 occurrences of ' baby' with leading space"
+ #assert penalized_logits[14801].item() == -5, "Expected 3 occurrences of ' Baby' with leading space"
+ #print("Tests passed!")
+
+ print(penalized_logits[2037].item()) # should be low since it was found!
+ shak.convertStringToTokenList("And")
+
+ # %%
+ def apply_temperature(logits: t.Tensor, temperature: float) -> t.Tensor:
+     assert temperature > 0, "temp cannot be less than or equal to 0"
+
+     return logits / temperature
+
+ logits = t.tensor([1, 2]).log()
+ cold_logits = apply_temperature(logits, 0.001)
+ #print('A low temperature "sharpens" or "peaks" the distribution: ', cold_logits)
+ #t.testing.assert_close(cold_logits, 1000.0 * logits)
+ hot_logits = apply_temperature(logits, 1000.0)
+ #print("A high temperature flattens the distribution: ", hot_logits)
+ #t.testing.assert_close(hot_logits, 0.001 * logits)
+ #print("Tests passed!")
+
+ # %%
+ # N_RUNS = 1
+ # your_prompt = "We are the champions, my friends"
+ # cases = [
+ #     ("High freq penalty", dict(freq_penalty=100.0)),
+ #     ("Negative freq penalty", dict(freq_penalty=-1.0)),
+ #     ("Too hot!", dict(temperature=2.0)),
+ #     ("Pleasantly cool", dict(temperature=0.7)),
+ #     ("Pleasantly warm", dict(temperature=0.9)),
+ #     ("Too cold!", dict(temperature=0.01)),
+ # ]
+ # for (name, kwargs) in cases:
+ #     for i in range(N_RUNS):
+ #         output = sample_tokens(myTransformer, shak.convertStringToTokenList,shak.decodeList, your_prompt, max_tokens_generated=24, **kwargs)
+ #         print(f"Sample {i} with: {name} ({kwargs}):")
+ #         print(f"Your model said: {shak.listToString(output)}\n")
+
+ # %%
+ def sample_top_k(logits: t.Tensor, top_k: int) -> int:
+     '''
+     logits: shape (vocab_size, ) - unnormalized log-probabilities
+     top_k: only consider this many of the most likely tokens for sampling
+
+     Return: a sampled token
+     '''
+     topk = t.topk(logits,top_k).indices
+     almost_zeroes = t.ones(logits.shape) * t.inf * -1
+     for _, token in enumerate(topk):
+         almost_zeroes[token] = 0
+     logits = logits + almost_zeroes
+     return sample_basic(logits)
+
+ k = 3
+ probs = t.linspace(0, 0.4, 5)
+ unnormalized_logits = probs.log() + 1.2345
+ samples = t.tensor([sample_top_k(unnormalized_logits, k) for _ in range(N)])
+ counts = t.bincount(samples, minlength=len(probs)) / N
+ expected = probs.clone()
+ expected[:-k] = 0
+ expected /= expected.sum()
+ # print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
+ # #t.testing.assert_close(counts, expected, atol=0.01, rtol=0)
+ # print("Tests passed!")
+
+ # %%
+ def sample_top_p(logits: t.Tensor, top_p: float, min_tokens_to_keep: int = 1) -> int:
+     '''
+     logits: shape (vocab_size, ) - unnormalized log-probabilities
+
+     Return: a sampled token
+     '''
+     # find the indices of important logits
+     sorted, indices = t.sort(logits,descending=True)
+     probs = t.nn.functional.softmax(sorted, dim=-1)
+     num_words_kept = 0
+     sum = 0
+     while sum < top_p:
+         sum = sum + probs[num_words_kept]
+         num_words_kept = num_words_kept + 1
+
+
+     if num_words_kept < min_tokens_to_keep:
+         num_words_kept = min_tokens_to_keep
+
+     important_indices = indices[:num_words_kept]
+
+     # prepare tensor to zero out small logits
+     almost_zeroes = t.ones(logits.shape) * t.inf * -1
+     for _, token in enumerate(important_indices):
+         almost_zeroes[token] = 0
+     logits = logits + almost_zeroes
+     return sample_basic(logits)
+
+ N = 2000
+ unnormalized_logits = t.tensor([0.2, 0.3, 0.5]).log() + 2.3456
+ samples = t.tensor([sample_top_p(unnormalized_logits, 0.5) for _ in range(N)])
+ counts = t.bincount(samples, minlength=len(unnormalized_logits)) / N
+ # print("top_p of 0.5 or lower should only return token 2: ", counts)
+ # assert counts[0] == 0 and counts[1] == 0
+
+ N = 2000
+ unnormalized_logits = t.tensor([0.2, 0.3, 0.5]).log() + 2.3456
+ samples = t.tensor([sample_top_p(unnormalized_logits, 0.50001) for _ in range(N)])
+ counts = t.bincount(samples, minlength=len(unnormalized_logits)) / N
+ # print("top_p in (0.5, 0.8] should return tokens 1 and 2: ", counts)
+ # assert counts[0] == 0
+
+ N = 4000
+ top_p = 0.71
+ probs = t.linspace(0, 0.4, 5)
+ unnormalized_logits = probs.log() + 1.2345
+ samples = t.tensor([sample_top_p(unnormalized_logits, top_p) for _ in range(N)])
+ counts = t.bincount(samples, minlength=len(probs)) / N
+ expected = probs.clone()
+ expected[0:2] = 0
+ expected /= expected.sum()
+ # print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
+ #t.testing.assert_close(counts, expected, atol=0.01, rtol=0.0)
+
+ # print("All tests passed!")
+
+ # %% [markdown]
+ # # Speak, Shakespeare!
+
+ # %%
+ input = "Death waits at the door"
+
+ print(shak.listToString(sample_tokens(myTransformer,shak.convertStringToTokenList,shak.decodeList,
+                                       input, 80,freq_penalty=0.1, top_k = 10)))
+
+ # %% [markdown]
+ # ## Save the model for future use
+ # (This was over 20 minutes of GPU computation. Not too shabby!)
+
+ # %%
+ t.save(myTransformer.state_dict(), "toInfer.pt")
+
+ # %% [markdown]
+ # # Publish to Gradio
+ # About a month after making this I realized this should be online. I'll push this to gradio.
+
+ # %%
+ import gradio as gr
+ def speak(input, tokenLength):
+     print("-------------------------------------------")
+     print("input: " + input)
+     try:
+         result = shak.listToString(sample_tokens(myTransformer,shak.convertStringToTokenList,shak.decodeList,
+                                                  input, tokenLength,freq_penalty=0.1, top_k = 10))
+     except:
+         return "one or more of the words is not compatible with the model; please try a different phrase"
+
+     print("worked! output:")
+     print(result)
+     return result
+
+
+ model = gr.Interface(fn=speak,
+                      inputs=[gr.Textbox(label = "initial text", placeholder="To be or not to be"), gr.Slider(40, 80, step=1, value=80)],
+                      outputs="text",
+                      title = "speak shakespeare, speak!",
+                      description = "a miniature shakespeare, built from scratch by Cody Rushing via a Decoder-Only Transformer trained on shakespeare's works.\n many, but not all, words are tokenizable - if you get an error, try again with different words!")
+
+ model.launch(share=False)
+
+ # %%
+
+
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ torch
+ numpy
+ fancy_einsum
+ einops
+ pandas
+ dataclasses
+ torchvision
+ gradio
shakespeare.txt ADDED
The diff for this file is too large to render. See raw diff
 
toInfer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d4b13307f2768c5c0811dcaae1c1f2f04a7066ee9f723fcca16ba56fa5e7f45
+ size 120571916
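
To reuse the committed toInfer.pt weights locally, a minimal loading sketch (it simply mirrors the use_pretrained branch in app.py above; DecoderOnlyTransformer, thisConfig, and device are the names defined there, and the real LFS file must be pulled first):

    import torch as t
    # assumes app.py's classes and config have already been defined/imported
    model = DecoderOnlyTransformer(thisConfig).to(device)
    model.load_state_dict(t.load("toInfer.pt", map_location=device))
    model.eval()  # inference only, as in the Space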