# %% [markdown]
# <a href="https://colab.research.google.com/github/starship006/ARENA-work/blob/main/w1/w1d4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
# %% [markdown]
# # Training Shakespeare Himself
# %% [markdown]
# For those who are not part of the ARENA program and are curious as to what this is: this was my first significant AI/ML project! I made the components for a decoder-only transformer and trained it on a corpus of Shakespeare's text. Scroll to the bottom to see some output :)
# %%
import torch as t
import numpy as np
from torch import nn
import fancy_einsum as einsum
import einops
import pandas as pd
# %% [markdown]
# ## transformer functions
# %% [markdown]
# These are the transformer components I made earlier this week, with optimizations added so they can run on the GPU.
#
# And I did just that. The speed improvements are MASSIVE, wow!
# %%
device = t.device("cuda:0" if t.cuda.is_available() else "cpu")
t.cuda.is_available()
# %%
def multihead_masked_attention(Q: t.Tensor, K: t.Tensor, V: t.Tensor, num_heads: int):
    '''
    Implements multihead masked attention on the matrices Q, K and V.

    Q: shape (batch, seq_len, nheads*headsize)
    K: shape (batch, seq_len, nheads*headsize)
    V: shape (batch, seq_len, nheads*headsize)
    '''
    # split the heads out into their own dimension
    Q = einops.rearrange(Q, 'b s (n h) -> b n s h', n=num_heads)
    K = einops.rearrange(K, 'b s (n h) -> b n s h', n=num_heads)
    V = einops.rearrange(V, 'b s (n h) -> b n s h', n=num_heads)
    scores = einsum.einsum('b n k h, b n s h -> b n s k', K, Q)
    assert scores.shape == t.Size([Q.shape[0], num_heads, Q.shape[2], K.shape[2]])
    scores = scores / np.sqrt(Q.shape[-1])  # scale by sqrt(headsize)
    # causal mask: -inf above the diagonal, so each position only attends to earlier positions
    # (mask construction borrowed from Jay - testing it out)
    attention = scores + t.triu(t.ones_like(scores, device=device) * float("-inf"), diagonal=1)
    softed = t.softmax(attention, dim=-1)
    result = einsum.einsum('batch numheads seqQ seqK, batch numheads seqK headsize -> batch numheads seqQ headsize', softed, V)
    return einops.rearrange(result, 'batch numheads seqQ headsize -> batch seqQ (numheads headsize)')
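# %% [markdown]
# A quick sanity check I'm adding here (not part of the original exercises): run the attention function on random tensors and confirm the output shape matches the input shape. The batch/seq/head sizes below are arbitrary.
# %%
fake_Q = t.rand(2, 5, 8, device=device)  # batch=2, seq_len=5, nheads*headsize=8
fake_K = t.rand(2, 5, 8, device=device)
fake_V = t.rand(2, 5, 8, device=device)
out = multihead_masked_attention(fake_Q, fake_K, fake_V, num_heads=2)
assert out.shape == fake_Q.shape  # attention preserves the (batch, seq, hidden) shape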
# %%
class MultiheadMaskedAttention(nn.Module):
    WQKV: nn.Linear
    W0: nn.Linear

    def __init__(self, hidden_size: int, num_heads: int):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_size = hidden_size // num_heads
        # nn.Linear *is* a learned matrix (plus a bias) - here it's one fused projection for Q, K and V
        self.WQKV = t.nn.Linear(self.hidden_size, 3 * hidden_size)
        self.W0 = t.nn.Linear(self.hidden_size, self.hidden_size)

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (batch, seq, hidden_size)
        Return: shape (batch, seq, hidden_size)
        '''
        x = x.float()  # seems like it needs to be a float!
        QKV = self.WQKV(x)
        # slice the fused projection back into Q, K and V
        Q = QKV[:, :, :self.hidden_size]
        K = QKV[:, :, self.hidden_size:self.hidden_size * 2]
        V = QKV[:, :, self.hidden_size * 2:]
        assert Q.shape == K.shape == V.shape == x.shape
        return self.W0(multihead_masked_attention(Q, K, V, self.num_heads))
# %%
from dataclasses import dataclass

@dataclass
class TransformerConfig:
    '''Constants used throughout your decoder-only transformer model.'''
    num_layers: int
    num_heads: int
    vocab_size: int
    hidden_size: int
    max_seq_len: int
    dropout: float = 0.1
    layer_norm_epsilon: float = 1e-05
# %%
# from yesterday
class PositionalEncoding(nn.Module):
    def __init__(self, embedding_dim: int, max_seq_len: int = 5000):
        super().__init__()
        self.dim = embedding_dim
        self.length = max_seq_len

        # mostly copied. i understand this, just need to work on
        # making more tensors and getting more exposure to methods of making tensors
        def P(delta):
            # standard sinusoidal encoding: sin on even dims, cos on odd dims
            n = 10000  # hardcoded
            d = embedding_dim
            sin_array = np.sin(delta / n ** (2 * np.arange(d // 2) / d))
            cos_array = np.cos(delta / n ** (2 * np.arange(d // 2) / d))
            array = np.zeros(d)
            array[::2] = sin_array
            array[1::2] = cos_array
            return array

        tokenArray = []
        for i in range(max_seq_len):
            tokenArray.append(P(i))  # changed from previous design
        self.multMax = t.tensor(np.array(tokenArray), dtype=t.float, device=device)

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (batch, seq_len, embedding_dim)
        '''
        return x + self.multMax[:x.shape[1]]
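# %% [markdown]
# Another quick check of my own: the encoding should broadcast-add across the batch, and on a zero input every value should stay in [-1, 1] since it's built purely from sines and cosines.
# %%
pe = PositionalEncoding(embedding_dim=16, max_seq_len=50)
fake_batch = t.zeros(3, 20, 16, device=device)  # batch=3, seq_len=20
encoded = pe(fake_batch)
assert encoded.shape == fake_batch.shape  # broadcasting keeps the input shape
assert encoded.abs().max() <= 1.0  # pure sin/cos values on a zero input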
# %%
class MLP(nn.Module):
    def __init__(self, config: TransformerConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.layers = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size * 4),
            nn.GELU(),
            nn.Linear(self.hidden_size * 4, self.hidden_size),
            nn.Dropout(config.dropout)
        )

    def forward(self, x: t.Tensor):
        x = x.float()  # seems like it needs to be a float!
        return self.layers(x).float()  # ima do the same thing again!
# %%
class DecoderBlock(nn.Module):
    def __init__(self, config: TransformerConfig):
        super().__init__()
        self.attentionBlock = nn.Sequential(
            MultiheadMaskedAttention(config.hidden_size, config.num_heads),
            nn.LayerNorm(config.hidden_size)
        )
        self.MLP = nn.Sequential(
            MLP(config),
            nn.LayerNorm(config.hidden_size)
        )

    def forward(self, x: t.Tensor) -> t.Tensor:
        # residual connections around both the attention block and the MLP
        partOne = x + self.attentionBlock(x)
        return (partOne + self.MLP(partOne)).float()  # seems like it needs to be a float!
# %%
class DecoderOnlyTransformer(nn.Module):
    def __init__(self, config: TransformerConfig):
        super().__init__()
        self.tokenize = nn.Embedding(config.vocab_size, config.hidden_size).to(device)
        self.positionize = PositionalEncoding(config.hidden_size, config.max_seq_len)
        self.restModel = nn.Sequential(
            nn.Dropout(config.dropout),
            *[DecoderBlock(config) for i in range(config.num_layers)],
            nn.LayerNorm(config.hidden_size),
        )

    def forward(self, x: t.Tensor) -> t.Tensor:
        x = self.tokenize(x)
        x = self.positionize(x)
        toUnembed = self.restModel(x).to(device)
        # unembed by reusing the (tied) embedding matrix
        return toUnembed @ self.tokenize.weight.T
# %% [markdown]
# ## Data Prep
# %% [markdown]
# Build the dataset that parses through all of the words
# %%
import re
from torch.utils.data import Dataset, DataLoader

class CustomTextDataset(Dataset):
    def __init__(self, words, seq_len, fractionOfWords):
        self.fractionOfWords = fractionOfWords
        self.words = words
        self.setOfWords = set(words)
        self.seq_len = seq_len
        self.max_len = len(self.words) - (self.seq_len + 1)
        self.vocab_size = len(self.setOfWords)
        # map each distinct word to an integer token (and back)
        self.word_to_token = {word: idx for (idx, word) in enumerate(sorted(self.setOfWords))}
        self.token_to_word = {idx: word for (idx, word) in enumerate(sorted(self.setOfWords))}
        self.allTokens = t.tensor([self.word_to_token[word] for word in self.words], device=device)
        if self.fractionOfWords > 0.9:
            print("Probably don't do this. Errors may abound")

    def __len__(self):
        return int(self.max_len * self.fractionOfWords)

    def __getitem__(self, idx):
        # the target is the input shifted one token to the right
        tokens = self.allTokens[idx:idx + self.seq_len + 1]
        input = tokens[:-1]
        target = tokens[1:]
        return input, target

    def getDataSize(self):
        return self.vocab_size

    def convertToTokens(self, phrase: list) -> t.Tensor:
        return t.tensor([self.word_to_token[word] for word in phrase], device=device)

    def convertStringToTokenList(self, phrase: str) -> list:
        words = re.split(r"\b", phrase)
        return [self.word_to_token[word] for word in words]

    def convertToText(self, tokens: t.Tensor):
        temp = []
        for i, value in enumerate(tokens):
            temp.append(self.token_to_word[value.item()])
        return temp

    def decodeList(self, words: list):
        temp = []
        for value in words:
            temp.append(self.token_to_word[value])
        return temp

    def listToString(self, words: list) -> str:
        temp = ""
        for word in words:
            temp = temp + word
        return temp
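# %% [markdown]
# A toy example of my own to show the tokenization round trip on a tiny corpus (the real corpus gets loaded in the next cell):
# %%
toy = CustomTextDataset(re.split(r"\b", "to be or not to be"), seq_len=2, fractionOfWords=0.5)
toy_tokens = toy.convertToTokens(["to", " ", "be"])
print(toy.listToString(toy.convertToText(toy_tokens)))  # should print "to be"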
# %%
with open("shakespeare.txt") as file:
    text = file.read()
words = re.split(r"\b", text)  # split on word boundaries, keeping whitespace and punctuation
fractionOfWords = 0.1  # what fraction of the corpus to train on
lengthOfSeq = 100
shak = CustomTextDataset(words, lengthOfSeq, fractionOfWords)
# %% [markdown]
# ## Running this data through a transformer
# %%
trainloader = DataLoader(shak, batch_size=32, shuffle=True)

# this specific one trained for 24 minutes and 9 seconds on a Colab GPU
thisConfig = TransformerConfig(
    num_layers = 4,   # 6 layers in the Attention paper
    num_heads = 4,    # 8 heads in the Attention paper
    vocab_size = trainloader.dataset.getDataSize(),  # 37000 tokens in the Attention paper (?)
    hidden_size = 512,  # recall that this = num_heads * headsize | 512 is the embedding dim used in the Attention paper
    max_seq_len = lengthOfSeq,
    dropout = 0.1,    # same as the Attention paper
    layer_norm_epsilon = 1e-05
)
# %%
use_pretrained = True

if use_pretrained:
    print("Using Pre-trained Model!")
    myTransformer = DecoderOnlyTransformer(thisConfig).to(device)
    optimizer = t.optim.Adam(myTransformer.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss().to(device)
    myTransformer.load_state_dict(t.load("toInfer.pt", map_location=device))
    myTransformer.eval()
else:
    print("Training Model... better hope you got enough GPU!")
    myTransformer = DecoderOnlyTransformer(thisConfig).to(device)
    optimizer = t.optim.Adam(myTransformer.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss().to(device)
    NUM_EPOCHS = 1
    losses = []
    myTransformer.train()
    for epoch in range(1, NUM_EPOCHS + 1):
        for inputs, targets in trainloader:
            outputs = myTransformer(inputs)
            targets = t.nn.functional.one_hot(targets, num_classes=trainloader.dataset.getDataSize()).float()
            # flatten (batch, seq) so CrossEntropyLoss sees one prediction per position
            outputs = einops.rearrange(outputs, 'batch seq vocab -> (batch seq) vocab')
            targets = einops.rearrange(targets, 'batch seq vocab -> (batch seq) vocab')
            loss = criterion(outputs, targets)
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
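# %% [markdown]
# My own quick check on model size - nothing in the original notebook depends on this, it's just nice to know what those 24 minutes of GPU time were spent training:
# %%
num_params = sum(p.numel() for p in myTransformer.parameters())
print(f"{num_params:,} parameters")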
# %%
if not use_pretrained:
    df = pd.DataFrame(losses)
    df.plot()
# %%
# quick test - use the sample methods below if you wish to actually use the transformer:
myTransformer.eval()
testPhrase = ["Be", " ", "not", " ", "afraid", " ", "to", " ", "the", " ", "Florentine", "\n",
              "And"]
input = shak.convertToTokens(testPhrase)
input = input[None, :]  # add a batch dimension
tokens = myTransformer(input).argmax(dim=-1)[0]
shak.convertToText(tokens)
# %% [markdown]
# # Sampling
# %%
def apply_sampling_methods(input_ids: t.Tensor, logits: t.Tensor, temperature=1.0, freq_penalty=0.0, top_k=0, top_p=0.0) -> int:
    # returns a next token based on the provided sampling method
    # thanks callum for this method
    assert input_ids.ndim == 1, "input_ids should be a 1D sequence of token ids"
    assert temperature >= 0, "Temperature should be non-negative"
    assert 0 <= top_p <= 1.0, "Top-p must be a probability"
    assert 0 <= top_k, "Top-k must be non-negative"
    assert not (top_p != 0 and top_k != 0), "At most one of top-p and top-k supported"

    if temperature == 0:
        return greedy_search(logits)
    if temperature != 1.0:
        logits = apply_temperature(logits, temperature)
    if freq_penalty != 0.0:
        logits = apply_freq_penalty(input_ids, logits, freq_penalty)
    if top_k > 0:
        return sample_top_k(logits, top_k)
    if top_p > 0:
        return sample_top_p(logits, top_p)
    return sample_basic(logits)

def sample_tokens(
    model,
    encodeMethod,
    decodeMethod,
    initial_text: str,
    max_tokens_generated = 40,
    **kwargs) -> list:
    # samples tokens until the model outputs eos_token_id or the token limit is reached
    model.eval()
    input_ids: list = encodeMethod(initial_text)
    generated_ids = []
    # next() grabs the first parameter, which tells us which device the model lives on
    device = next(model.parameters()).device
    tokens_to_generate = max_tokens_generated - len(input_ids)
    for _ in range(tokens_to_generate):
        new_input_ids = t.tensor(input_ids + generated_ids, dtype=t.int64, device=device)
        # feed the whole sequence back in and only keep the logits for the last position
        logits = model(new_input_ids.unsqueeze(0))[0, -1]
        new_token = apply_sampling_methods(new_input_ids, logits, **kwargs)
        generated_ids.append(new_token)
    return decodeMethod(input_ids + generated_ids)

# quick test:
myTransformer.eval()
testPhrase = ["Be", " ", "not", " ", "afraid", " ", "to", " ", "the", " ", "Florentine", "\n",
              "And"]
input = shak.convertToTokens(testPhrase)
type(input)
# %%
def greedy_search(logits):
    '''
    returns the most likely next token, BUT THE TIEBREAKER IS INCORRECT!
    i got lazy - it *is* deterministic, but it doesn't necessarily
    choose the smallest token out of a tie. perhaps treat it as a symbol
    of my ingenuity?
    '''
    return logits.argmax(dim=-1).item()
# %%
def sample_basic(logits) -> int:
    '''
    samples from the distribution, possibly with temp and freq changes applied

    logits: shape (vocab_size, ) - unnormalized log-probabilities
    return: a sampled token
    '''
    probs = t.distributions.categorical.Categorical(logits=logits)
    return probs.sample().item()

N = 20000
probs = t.linspace(0, 0.4, 5)
unnormalized_logits = probs.log() + 1.2345
samples = t.tensor([sample_basic(unnormalized_logits) for _ in range(N)])
counts = t.bincount(samples, minlength=len(probs)) / N
print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
#t.testing.assert_close(counts, probs, atol=0.01, rtol=0)
print("Tests passed!")
# %%
def apply_freq_penalty(input_ids: t.Tensor, logits: t.Tensor, freq_penalty: float) -> t.Tensor:
    '''
    input_ids: shape (seq, )
    logits: shape (vocab_size, )
    Return: shape (vocab_size, )
    '''
    (vocab_size,) = logits.shape
    # count how often each token already appears, and penalize it proportionally
    id_freqs = t.bincount(input_ids, minlength=vocab_size)
    return logits - freq_penalty * id_freqs

bieber_prompt = "And I was like baby, baby, baby, oh Like, baby, baby, baby, no Like, baby, baby, baby, oh I thought you'd always be mine, mine"
input_ids = shak.convertStringToTokenList(bieber_prompt)
logits = t.ones(shak.getDataSize()).to(device)
penalized_logits = apply_freq_penalty(t.tensor(input_ids).to(device), logits, 2.0)
# i believe my tokenization is different, so the original test ids don't apply!
#assert penalized_logits[5156].item() == -11, "Expected 6 occurrences of ' baby' with leading space"
#assert penalized_logits[14801].item() == -5, "Expected 3 occurrences of ' Baby' with leading space"
#print("Tests passed!")
print(penalized_logits[2037].item())  # should be low since it was found!
shak.convertStringToTokenList("And")
# %%
def apply_temperature(logits: t.Tensor, temperature: float) -> t.Tensor:
    assert temperature > 0, "temp cannot be less than or equal to 0"
    return logits / temperature

logits = t.tensor([1.0, 2.0]).log()
cold_logits = apply_temperature(logits, 0.001)
#print('A low temperature "sharpens" or "peaks" the distribution: ', cold_logits)
#t.testing.assert_close(cold_logits, 1000.0 * logits)
hot_logits = apply_temperature(logits, 1000.0)
#print("A high temperature flattens the distribution: ", hot_logits)
#t.testing.assert_close(hot_logits, 0.001 * logits)
#print("Tests passed!")
# %%
# N_RUNS = 1
# your_prompt = "We are the champions, my friends"
# cases = [
#     ("High freq penalty", dict(freq_penalty=100.0)),
#     ("Negative freq penalty", dict(freq_penalty=-1.0)),
#     ("Too hot!", dict(temperature=2.0)),
#     ("Pleasantly cool", dict(temperature=0.7)),
#     ("Pleasantly warm", dict(temperature=0.9)),
#     ("Too cold!", dict(temperature=0.01)),
# ]
# for (name, kwargs) in cases:
#     for i in range(N_RUNS):
#         output = sample_tokens(myTransformer, shak.convertStringToTokenList, shak.decodeList, your_prompt, max_tokens_generated=24, **kwargs)
#         print(f"Sample {i} with: {name} ({kwargs}):")
#         print(f"Your model said: {shak.listToString(output)}\n")
# %%
def sample_top_k(logits: t.Tensor, top_k: int) -> int:
    '''
    logits: shape (vocab_size, ) - unnormalized log-probabilities
    top_k: only consider this many of the most likely tokens for sampling
    Return: a sampled token
    '''
    topk = t.topk(logits, top_k).indices
    # additive mask: 0 for the top-k tokens, -inf everywhere else
    mask = t.ones_like(logits) * float("-inf")
    mask[topk] = 0
    return sample_basic(logits + mask)

k = 3
probs = t.linspace(0, 0.4, 5)
unnormalized_logits = probs.log() + 1.2345
samples = t.tensor([sample_top_k(unnormalized_logits, k) for _ in range(N)])
counts = t.bincount(samples, minlength=len(probs)) / N
expected = probs.clone()
expected[:-k] = 0
expected /= expected.sum()
# print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
# #t.testing.assert_close(counts, expected, atol=0.01, rtol=0)
# print("Tests passed!")
# %%
def sample_top_p(logits: t.Tensor, top_p: float, min_tokens_to_keep: int = 1) -> int:
    '''
    logits: shape (vocab_size, ) - unnormalized log-probabilities
    Return: a sampled token
    '''
    # find the indices of the important logits
    sorted_logits, indices = t.sort(logits, descending=True)
    probs = t.nn.functional.softmax(sorted_logits, dim=-1)
    # keep the smallest set of tokens whose cumulative probability reaches top_p
    num_words_kept = 0
    cum_prob = 0
    while cum_prob < top_p:
        cum_prob = cum_prob + probs[num_words_kept]
        num_words_kept = num_words_kept + 1
    if num_words_kept < min_tokens_to_keep:
        num_words_kept = min_tokens_to_keep
    important_indices = indices[:num_words_kept]
    # additive mask to zero out the small logits
    mask = t.ones_like(logits) * float("-inf")
    mask[important_indices] = 0
    return sample_basic(logits + mask)
N = 2000
unnormalized_logits = t.tensor([0.2, 0.3, 0.5]).log() + 2.3456
samples = t.tensor([sample_top_p(unnormalized_logits, 0.5) for _ in range(N)])
counts = t.bincount(samples, minlength=len(unnormalized_logits)) / N
# print("top_p of 0.5 or lower should only return token 2: ", counts)
# assert counts[0] == 0 and counts[1] == 0

N = 2000
unnormalized_logits = t.tensor([0.2, 0.3, 0.5]).log() + 2.3456
samples = t.tensor([sample_top_p(unnormalized_logits, 0.50001) for _ in range(N)])
counts = t.bincount(samples, minlength=len(unnormalized_logits)) / N
# print("top_p in (0.5, 0.8] should return tokens 1 and 2: ", counts)
# assert counts[0] == 0

N = 4000
top_p = 0.71
probs = t.linspace(0, 0.4, 5)
unnormalized_logits = probs.log() + 1.2345
samples = t.tensor([sample_top_p(unnormalized_logits, top_p) for _ in range(N)])
counts = t.bincount(samples, minlength=len(probs)) / N
expected = probs.clone()
expected[0:2] = 0
expected /= expected.sum()
# print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
#t.testing.assert_close(counts, expected, atol=0.01, rtol=0.0)
# print("All tests passed!")
# %% [markdown]
# # Speak, Shakespeare!
# %%
input = "Death waits at the door"
print(shak.listToString(sample_tokens(myTransformer, shak.convertStringToTokenList, shak.decodeList,
                                      input, 80, freq_penalty=0.1, top_k=10)))
# %% [markdown]
# ## Save the model for future use
# (This was over 20 minutes of GPU computation. Not too shabby!)
# %%
t.save(myTransformer.state_dict(), "toInfer.pt")
# %% [markdown]
# # Publish to Gradio
# About a month after making this, I realized it should be online. I'll push this to Gradio.
# %%
import gradio as gr

def speak(input, tokenLength):
    print("-------------------------------------------")
    print("input: " + input)
    try:
        result = shak.listToString(sample_tokens(myTransformer, shak.convertStringToTokenList, shak.decodeList,
                                                 input, tokenLength, freq_penalty=0.1, top_k=10))
    except KeyError:
        # a word in the prompt isn't in the training vocabulary
        return "one or more of the words is not compatible with the model; please try a different phrase"
    print("worked! output:")
    print(result)
    return result

model = gr.Interface(fn=speak,
                     css="custom_theme.css",
                     inputs=[gr.Textbox(label="initial text", placeholder="To be or not to be"), gr.Slider(20, 40, step=1, value=40)],
                     outputs="text",
                     title="speak shakespeare, speak!",
                     description="a miniature shakespeare, built from scratch. Decoder-Only Transformer trained on shakespeare's works.\n many, but not all, words are tokenizable - if you get an error, try again with different words!")
model.launch(share=False)
# %%