Commit 1013b6b
Parent(s): Duplicate from starship006/mini_shakespeare
Co-authored-by: Cody Rushing <[email protected]>
- .gitattributes +34 -0
- README.md +13 -0
- app.py +632 -0
- requirements.txt +8 -0
- shakespeare.txt +0 -0
- toInfer.pt +3 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,13 @@
---
title: Mini Shakespeare
emoji: 🐢
colorFrom: gray
colorTo: purple
sdk: gradio
sdk_version: 3.12.0
app_file: app.py
pinned: false
duplicated_from: starship006/mini_shakespeare
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,632 @@
# %% [markdown]
# <a href="https://colab.research.google.com/github/starship006/ARENA-work/blob/main/w1/w1d4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# %% [markdown]
# # Training Shakespeare Himself

# %% [markdown]
# For those who are not part of the ARENA program and are curious what this is: this was my first significant AI/ML project! I built the components for a decoder-only transformer and trained it on a corpus of Shakespeare's text. Scroll to the bottom to see some output :)
# %%
import torch as t
import numpy as np
from torch import nn
import fancy_einsum as einsum
import einops
import pandas as pd


# %% [markdown]
# ## transformer functions
#
#

# %% [markdown]
# This reuses the transformer components I made earlier this week, with optimizations added so it can use the GPU.
#
# And I did just that. The speed improvements are MASSIVE, wow!

# %%
device = t.device("cuda:0" if t.cuda.is_available() else "cpu")
t.cuda.is_available()

# %%
def multihead_masked_attention(Q: t.Tensor, K: t.Tensor, V: t.Tensor, num_heads: int):
    '''
    Implements multihead masked attention on the matrices Q, K and V.

    Q: shape (batch, seq_len, nheads*headsize)
    K: shape (batch, seq_len, nheads*headsize)
    V: shape (batch, seq_len, nheads*headsize)
    '''

    Q = einops.rearrange(Q, 'b s (n h) -> b n s h', n = num_heads)
    K = einops.rearrange(K, 'b s (n h) -> b n s h', n = num_heads)
    V = einops.rearrange(V, 'b s (n h) -> b n s h', n = num_heads)

    scores = einsum.einsum('b n k h, b n s h -> b n s k', K, Q)
    assert scores.shape == t.Size([Q.shape[0], num_heads, Q.shape[2], K.shape[2]])

    scores = scores / np.sqrt(Q.shape[-1])
    # mask out future positions with -inf above the diagonal (adapted from Jay - testing it out)
    attention = scores + t.triu(t.ones_like(scores, device = device) * float("-inf"), diagonal=1)
    softed = t.softmax(attention, dim=-1)
    result = einsum.einsum('batch numheads seqQ seqK, batch numheads seqK headsize -> batch numheads seqQ headsize', softed, V)
    return einops.rearrange(result, 'batch numheads seqQ headsize -> batch seqQ (numheads headsize)')
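
# %%
# A quick illustrative sanity check on the function above (the batch/seq/head
# sizes here are arbitrary): output shape is preserved, and masking means that
# perturbing the last position must not change the outputs at earlier positions.
_batch, _seq, _nheads, _headsize = 2, 5, 4, 8
_x = t.randn(_batch, _seq, _nheads * _headsize, device=device)
_out = multihead_masked_attention(_x, _x, _x, _nheads)
assert _out.shape == (_batch, _seq, _nheads * _headsize)
_x2 = _x.clone()
_x2[:, -1, :] += 1.0
_out2 = multihead_masked_attention(_x2, _x2, _x2, _nheads)
assert t.allclose(_out[:, :-1], _out2[:, :-1], atol=1e-5)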

# %%
class MultiheadMaskedAttention(nn.Module):
    WQKV: nn.Linear
    W0: nn.Linear

    def __init__(self, hidden_size: int, num_heads: int):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_size = hidden_size // num_heads

        # an nn.Linear *is* a learned matrix (plus a bias), wrapped so its
        # parameters get registered and trained automatically
        self.WQKV = t.nn.Linear(self.hidden_size, 3 * hidden_size)
        self.W0 = t.nn.Linear(self.hidden_size, self.hidden_size)

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (batch, seq, hidden_size)

        Return: shape (batch, seq, hidden_size)
        '''
        x = x.float() # seems like it needs to be a float!
        QKV = self.WQKV(x)
        Q = QKV[:, :, :self.hidden_size]
        K = QKV[:, :, self.hidden_size:self.hidden_size * 2]
        V = QKV[:, :, self.hidden_size * 2:]
        assert Q.shape == K.shape == V.shape == x.shape
        return self.W0(multihead_masked_attention(Q, K, V, self.num_heads))

# %%
from dataclasses import dataclass

@dataclass(frozen=True)
class TransformerConfig:
    '''Constants used throughout your decoder-only transformer model.'''

    num_layers: int
    num_heads: int
    vocab_size: int
    hidden_size: int
    max_seq_len: int
    dropout: float = 0.1
    layer_norm_epsilon: float = 1e-05

# %%
# from yesterday
class PositionalEncoding(nn.Module):

    def __init__(self, embedding_dim: int, max_seq_len: int = 5000):
        super().__init__()
        self.dim = embedding_dim
        self.length = max_seq_len

        # mostly copied. i understand this, just need to work on
        # making more tensors and getting more exposure to methods of making tensors
        def P(delta):
            n = 10000 # hardcoded
            d = embedding_dim
            sin_array = np.sin(delta / n ** (2 * np.arange(d//2) / d))
            cos_array = np.cos(delta / n ** (2 * np.arange(d//2) / d))

            array = np.zeros(d)
            array[::2] = sin_array
            array[1::2] = cos_array

            return array

        tokenArray = []
        for i in range(max_seq_len):
            tokenArray.append(P(i)) # changed from previous design

        self.multMax = t.tensor(np.array(tokenArray), dtype=t.float, device = device)

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (batch, seq_len, embedding_dim)
        '''
        return x + self.multMax[:x.shape[1]]
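
# %%
# Quick illustrative check of the encoding table (small arbitrary sizes):
# at position 0 the sinusoidal formula gives [sin(0), cos(0), ...] = [0, 1, 0, 1, ...].
_pe = PositionalEncoding(embedding_dim=8, max_seq_len=16)
assert t.allclose(_pe.multMax[0, 0::2], t.zeros(4, device=device))
assert t.allclose(_pe.multMax[0, 1::2], t.ones(4, device=device))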

# %%
class MLP(nn.Module):
    def __init__(self, config: TransformerConfig):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.layers = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size * 4),
            nn.GELU(),
            nn.Linear(self.hidden_size * 4, self.hidden_size),
            nn.Dropout(config.dropout)
        )
    def forward(self, x: t.Tensor):
        x = x.float() # seems like it needs to be a float!
        return self.layers(x).float() # ima do the same thing again!


# %%
class DecoderBlock(nn.Module):

    def __init__(self, config: TransformerConfig):
        super().__init__()
        self.attentionBlock = nn.Sequential(
            MultiheadMaskedAttention(config.hidden_size, config.num_heads),
            nn.LayerNorm(config.hidden_size)
        )
        self.MLP = nn.Sequential(
            MLP(config),
            nn.LayerNorm(config.hidden_size)
        )

    def forward(self, x: t.Tensor) -> t.Tensor:
        partOne = x + self.attentionBlock(x)
        return (partOne + self.MLP(partOne)).float() # seems like it needs to be a float!


# %%
class DecoderOnlyTransformer(nn.Module):

    def __init__(self, config: TransformerConfig):
        super().__init__()
        self.tokenize = nn.Embedding(config.vocab_size, config.hidden_size).to(device)
        self.positionize = PositionalEncoding(config.hidden_size, config.max_seq_len)
        self.restModel = nn.Sequential(
            nn.Dropout(config.dropout),
            *[DecoderBlock(config) for i in range(config.num_layers)],
            nn.LayerNorm(config.hidden_size),
        )
        # weight tying: unembed with the transpose of the embedding matrix
        self.unembed = self.tokenize.weight.T.to(device)

    def forward(self, x: t.Tensor) -> t.Tensor:
        x = self.tokenize(x)
        x = self.positionize(x)
        toUnembed = self.restModel(x).to(device)
        return toUnembed @ self.unembed
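
# %%
# Smoke test with a tiny, arbitrary config (illustrative only): the logits
# should come out with shape (batch, seq, vocab_size).
_tinyConfig = TransformerConfig(num_layers=1, num_heads=2, vocab_size=50, hidden_size=16, max_seq_len=12)
_tinyModel = DecoderOnlyTransformer(_tinyConfig).to(device)
_tinyTokens = t.randint(0, _tinyConfig.vocab_size, (2, 12), device=device)
assert _tinyModel(_tinyTokens).shape == (2, 12, _tinyConfig.vocab_size)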

# %% [markdown]
# ## Data Prep

# %% [markdown]
# Make the dataset to parse through all of the words

# %%
import re
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets

class CustomTextDataset(Dataset):
    def __init__(self, words, seq_len, fractionOfWords):
        self.fractionOfWords = fractionOfWords
        self.words = words
        self.setOfWords = set(words)
        self.seq_len = seq_len
        self.max_len = len(self.words) - (self.seq_len + 1)
        self.vocab_size = len(self.setOfWords)
        self.word_to_token = {word: idx for (idx, word) in enumerate(sorted(self.setOfWords))}
        self.token_to_word = {idx: word for (idx, word) in enumerate(sorted(self.setOfWords))}
        self.allTokens = t.tensor([self.word_to_token[word] for word in self.words], device = device)

        if (self.fractionOfWords > 0.9):
            print("Probably don't do this. Errors may abound")

    def __len__(self):
        return int(self.max_len * self.fractionOfWords)

    def __getitem__(self, idx):
        tokens = self.allTokens[idx:idx + self.seq_len + 1]
        input = tokens[:-1]
        target = tokens[1:]
        return input, target

    def getDataSize(self):
        return self.vocab_size

    def convertToTokens(self, phrase: list) -> t.tensor:
        return t.tensor([self.word_to_token[word] for word in phrase], device = device)

    def convertStringToTokenList(self, phrase: str) -> list:
        words = re.split(r"\b", phrase)
        return [self.word_to_token[word] for word in words]

    def convertToText(self, tokens: t.tensor):
        temp = []
        for i, value in enumerate(tokens):
            temp.append(self.token_to_word[value.item()])
        return temp

    def decodeList(self, words: list):
        temp = []
        for value in words:
            temp.append(self.token_to_word[value])
        return temp

    def listToString(self, words: list) -> str:
        temp = ""
        for word in words:
            temp = temp + word
        return temp

# %%
file = open("shakespeare.txt")
text = file.read()
words = re.split(r"\b", text)

fractionOfWords = 0.1 # what fraction of the corpus to train on

lengthOfSeq = 100

shak = CustomTextDataset(words, lengthOfSeq, fractionOfWords)
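
# %%
# One sample from the dataset, to make the next-token objective concrete
# (illustrative): the target is just the input shifted left by one token.
_inp, _tgt = shak[0]
assert t.equal(_inp[1:], _tgt[:-1])
print(shak.listToString(shak.convertToText(_inp[:10])))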

# %% [markdown]
# ## Running this data through a transformer

# %%
trainloader = DataLoader(shak, batch_size=32, shuffle=True)

# this specific one trained for 24 minutes and 9 seconds on a Colab GPU

thisConfig = TransformerConfig(
    num_layers = 4, # 6 layers in the Attention paper
    num_heads = 4, # 8 heads in Attention paper
    vocab_size = trainloader.dataset.getDataSize(), # 37000 tokens in Attention paper (?)
    hidden_size = 512, # recall that this = num_heads * headsize | 512 is the embedding dim used in Attention paper
    max_seq_len = lengthOfSeq,
    dropout = 0.1, # same as Attention paper
    layer_norm_epsilon=0.00001
)

# %%
use_pretrained = True
if use_pretrained:
    print("Using Pre-trained Model!")
    myTransformer = DecoderOnlyTransformer(thisConfig).to(device)
    optimizer = t.optim.Adam(myTransformer.parameters(), lr = 1e-3)
    criterion = nn.CrossEntropyLoss().to(device)
    myTransformer.load_state_dict(t.load("toInfer.pt", map_location=device))
    myTransformer.eval()
else:
    print("Training Model... better hope you got enough GPU!")
    myTransformer = DecoderOnlyTransformer(thisConfig).to(device)
    optimizer = t.optim.Adam(myTransformer.parameters(), lr = 1e-3)
    criterion = nn.CrossEntropyLoss().to(device)
    NUM_EPOCHS = 1

    losses = []
    myTransformer.train()
    for epoch in range(1, NUM_EPOCHS + 1):
        for inputs, targets in trainloader:
            outputs = myTransformer(inputs).to(device)
            targets = t.nn.functional.one_hot(targets, num_classes=trainloader.dataset.getDataSize()).float().to(device)

            outputs = einops.rearrange(outputs, 'batch seq vocab -> (batch seq) vocab')
            targets = einops.rearrange(targets, 'batch seq vocab -> (batch seq) vocab')

            outputs = outputs.to(device)
            targets = targets.to(device)
            loss = criterion(outputs, targets).to(device)

            losses.append(loss.item())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()


# %%
if not use_pretrained:
    df = pd.DataFrame(losses)
    df.plot()

# %%
# quick test - use the sample method if you wish to actually use the transformer:

myTransformer.eval()

testPhrase = ["Be", " ", "not", " ", "afraid", " ", "to", " ", "the", " ", "Florentine", "\n",
              "And"]
input = shak.convertToTokens(testPhrase)
input = input[None, :]
tokens = myTransformer(input).argmax(dim=-1)[0]
shak.convertToText(tokens)

# %% [markdown]
# # Sampling

# %%
def apply_sampling_methods(input_ids: t.Tensor, logits: t.Tensor, temperature=1.0, freq_penalty=0.0, top_k=0, top_p=0.0) -> int:
    # returns a next token based on the provided sampling method
    # thanks Callum for this method
    assert input_ids.ndim == 1, "input_ids should be a 1D sequence of token ids"
    assert temperature >= 0, "Temperature should be non-negative"
    assert 0 <= top_p <= 1.0, "Top-p must be a probability"
    assert 0 <= top_k, "Top-k must be non-negative"
    assert not (top_p != 0 and top_k != 0), "At most one of top-p and top-k supported"

    if temperature == 0:
        return greedy_search(logits)
    if temperature != 1.0:
        logits = apply_temperature(logits, temperature)
    if freq_penalty != 0.0:
        logits = apply_freq_penalty(input_ids, logits, freq_penalty)
    if top_k > 0:
        return sample_top_k(logits, top_k)
    if top_p > 0:
        return sample_top_p(logits, top_p)
    return sample_basic(logits)


def sample_tokens(
    model,
    encodeMethod,
    decodeMethod,
    initial_text: str,
    max_tokens_generated = 40,
    **kwargs) -> list:
    # samples tokens autoregressively until the token limit is reached

    model.eval()
    input_ids: list = encodeMethod(initial_text)
    generated_ids = []
    # next() grabs the first parameter, which tells us the device the model lives on
    device = next(model.parameters()).device

    tokens_to_generate = max_tokens_generated - len(input_ids)
    for _ in range(tokens_to_generate):
        new_input_ids = t.tensor(input_ids + generated_ids, dtype=t.int64, device=device)
        logits = model(new_input_ids.unsqueeze(0))[0, -1]
        new_token = apply_sampling_methods(new_input_ids, logits, **kwargs)
        generated_ids.append(new_token)

    return decodeMethod(input_ids + generated_ids)


# quick test:

myTransformer.eval()

testPhrase = ["Be", " ", "not", " ", "afraid", " ", "to", " ", "the", " ", "Florentine", "\n",
              "And"]
input = shak.convertToTokens(testPhrase)
type(input)


# %%
def greedy_search(logits):
    '''
    returns the most likely next token, BUT THE TIEBREAKER IS INCORRECT!
    i got lazy - it *is* deterministic, but it just doesn't necessarily
    choose the smallest word out of the tie. perhaps treat it as a symbol
    of my ingenuity?
    '''
    return logits.argmax(dim=-1).item()

# %%
def sample_basic(logits) -> int:
    '''
    samples from the distribution, possibly with temp and freq changes applied

    logits: shape (vocab_size, ) - unnormalized log-probabilities

    return: a sampled token
    '''
    probs = t.distributions.categorical.Categorical(logits=logits)
    return probs.sample().item()

N = 20000
probs = t.linspace(0, 0.4, 5)
unnormalized_logits = probs.log() + 1.2345
samples = t.tensor([sample_basic(unnormalized_logits) for _ in range(N)])
counts = t.bincount(samples, minlength=len(probs)) / N
print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
#t.testing.assert_close(counts, probs, atol=0.01, rtol=0)
print("Tests passed!")

# %%
def apply_freq_penalty(input_ids: t.Tensor, logits: t.Tensor, freq_penalty: float) -> t.Tensor:
    '''
    input_ids: shape (seq, )
    logits: shape (vocab_size, )

    Return: shape (vocab_size, )
    '''
    (vocab_size,) = logits.shape
    id_freqs = t.bincount(input_ids, minlength=vocab_size)
    return logits - freq_penalty * id_freqs

bieber_prompt = "And I was like baby, baby, baby, oh Like, baby, baby, baby, no Like, baby, baby, baby, oh I thought you'd always be mine, mine"
input_ids = shak.convertStringToTokenList(bieber_prompt)
logits = t.ones(shak.getDataSize()).to(device)
penalized_logits = apply_freq_penalty(t.tensor(input_ids).to(device), logits, 2.0)
# i believe mine is different!
#assert penalized_logits[5156].item() == -11, "Expected 6 occurrences of ' baby' with leading space"
#assert penalized_logits[14801].item() == -5, "Expected 3 occurrences of ' Baby' with leading space"
#print("Tests passed!")

print(penalized_logits[2037].item()) # should be low since it was found!
shak.convertStringToTokenList("And")

# %%
def apply_temperature(logits: t.Tensor, temperature: float) -> t.Tensor:
    assert temperature > 0, "temp cannot be less than or equal to 0"

    return logits / temperature

logits = t.tensor([1, 2]).log()
cold_logits = apply_temperature(logits, 0.001)
#print('A low temperature "sharpens" or "peaks" the distribution: ', cold_logits)
#t.testing.assert_close(cold_logits, 1000.0 * logits)
hot_logits = apply_temperature(logits, 1000.0)
#print("A high temperature flattens the distribution: ", hot_logits)
#t.testing.assert_close(hot_logits, 0.001 * logits)
#print("Tests passed!")

# %%
# N_RUNS = 1
# your_prompt = "We are the champions, my friends"
# cases = [
#     ("High freq penalty", dict(freq_penalty=100.0)),
#     ("Negative freq penalty", dict(freq_penalty=-1.0)),
#     ("Too hot!", dict(temperature=2.0)),
#     ("Pleasantly cool", dict(temperature=0.7)),
#     ("Pleasantly warm", dict(temperature=0.9)),
#     ("Too cold!", dict(temperature=0.01)),
# ]
# for (name, kwargs) in cases:
#     for i in range(N_RUNS):
#         output = sample_tokens(myTransformer, shak.convertStringToTokenList, shak.decodeList, your_prompt, max_tokens_generated=24, **kwargs)
#         print(f"Sample {i} with: {name} ({kwargs}):")
#         print(f"Your model said: {shak.listToString(output)}\n")

# %%
def sample_top_k(logits: t.Tensor, top_k: int) -> int:
    '''
    logits: shape (vocab_size, ) - unnormalized log-probabilities
    top_k: only consider this many of the most likely tokens for sampling

    Return: a sampled token
    '''
    topk = t.topk(logits, top_k).indices
    # -inf everywhere except the top-k positions (full_like keeps device/dtype)
    almost_zeroes = t.full_like(logits, float("-inf"))
    almost_zeroes[topk] = 0
    logits = logits + almost_zeroes
    return sample_basic(logits)

k = 3
probs = t.linspace(0, 0.4, 5)
unnormalized_logits = probs.log() + 1.2345
samples = t.tensor([sample_top_k(unnormalized_logits, k) for _ in range(N)])
counts = t.bincount(samples, minlength=len(probs)) / N
expected = probs.clone()
expected[:-k] = 0
expected /= expected.sum()
# print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
# #t.testing.assert_close(counts, expected, atol=0.01, rtol=0)
# print("Tests passed!")

# %%
def sample_top_p(logits: t.Tensor, top_p: float, min_tokens_to_keep: int = 1) -> int:
    '''
    logits: shape (vocab_size, ) - unnormalized log-probabilities

    Return: a sampled token
    '''
    # find the indices of important logits
    sorted_logits, indices = t.sort(logits, descending=True)
    probs = t.nn.functional.softmax(sorted_logits, dim=-1)
    num_words_kept = 0
    total = 0
    while total < top_p:
        total = total + probs[num_words_kept]
        num_words_kept = num_words_kept + 1

    if num_words_kept < min_tokens_to_keep:
        num_words_kept = min_tokens_to_keep

    important_indices = indices[:num_words_kept]

    # -inf everywhere except the kept positions, to zero out small logits
    almost_zeroes = t.full_like(logits, float("-inf"))
    almost_zeroes[important_indices] = 0
    logits = logits + almost_zeroes
    return sample_basic(logits)

N = 2000
unnormalized_logits = t.tensor([0.2, 0.3, 0.5]).log() + 2.3456
samples = t.tensor([sample_top_p(unnormalized_logits, 0.5) for _ in range(N)])
counts = t.bincount(samples, minlength=len(unnormalized_logits)) / N
# print("top_p of 0.5 or lower should only return token 2: ", counts)
# assert counts[0] == 0 and counts[1] == 0

N = 2000
unnormalized_logits = t.tensor([0.2, 0.3, 0.5]).log() + 2.3456
samples = t.tensor([sample_top_p(unnormalized_logits, 0.50001) for _ in range(N)])
counts = t.bincount(samples, minlength=len(unnormalized_logits)) / N
# print("top_p in (0.5, 0.8] should return tokens 1 and 2: ", counts)
# assert counts[0] == 0

N = 4000
top_p = 0.71
probs = t.linspace(0, 0.4, 5)
unnormalized_logits = probs.log() + 1.2345
samples = t.tensor([sample_top_p(unnormalized_logits, top_p) for _ in range(N)])
counts = t.bincount(samples, minlength=len(probs)) / N
expected = probs.clone()
expected[0:2] = 0
expected /= expected.sum()
# print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
#t.testing.assert_close(counts, expected, atol=0.01, rtol=0.0)

# print("All tests passed!")

# %% [markdown]
# # Speak, Shakespeare!

# %%
input = "Death waits at the door"

print(shak.listToString(sample_tokens(myTransformer, shak.convertStringToTokenList, shak.decodeList,
                                      input, 80, freq_penalty=0.1, top_k = 10)))

# %% [markdown]
# ## Save the model for future use
# (This was over 20 minutes of GPU computation. Not too shabby!)

# %%
t.save(myTransformer.state_dict(), "toInfer.pt")

# %% [markdown]
# # Publish to Gradio
# About a month after making this I realized it should be online, so I'll push it to Gradio.

# %%
import gradio as gr
def speak(input, tokenLength):
    print("-------------------------------------------")
    print("input: " + input)
    try:
        result = shak.listToString(sample_tokens(myTransformer, shak.convertStringToTokenList, shak.decodeList,
                                                 input, tokenLength, freq_penalty=0.1, top_k = 10))
    except Exception: # most likely a KeyError from an out-of-vocabulary word
        return "one or more of the words is not compatible with the model; please try a different phrase"

    print("worked! output:")
    print(result)
    return result


model = gr.Interface(fn=speak,
                     inputs=[gr.Textbox(label = "initial text", placeholder="To be or not to be"), gr.Slider(40, 80, step=1, value=80)],
                     outputs="text",
                     title = "speak shakespeare, speak!",
                     description = "a miniature shakespeare, built from scratch by Cody Rushing via a Decoder-Only Transformer trained on shakespeare's works.\n many, but not all, words are tokenizable - if you get an error, try again with different words!")

model.launch(share=False)

# %%
requirements.txt
ADDED
@@ -0,0 +1,8 @@
torch
numpy
fancy_einsum
einops
pandas
dataclasses
torchvision
gradio
shakespeare.txt
ADDED
The diff for this file is too large to render.
toInfer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6d4b13307f2768c5c0811dcaae1c1f2f04a7066ee9f723fcca16ba56fa5e7f45
size 120571916
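
Since toInfer.pt is tracked by Git LFS, a clone without LFS installed leaves you with this three-line pointer file instead of the ~120 MB checkpoint. One quick way to confirm the real weights were fetched is to hash the file against the oid above (an illustrative snippet, not part of the committed code):

import hashlib
with open("toInfer.pt", "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()
print(digest == "6d4b13307f2768c5c0811dcaae1c1f2f04a7066ee9f723fcca16ba56fa5e7f45")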