# %% [markdown]
# Open In Colab

# %% [markdown]
# # Training Shakespeare Himself

# %% [markdown]
# For those who are not part of the ARENA program and are curious as to what this is: this was my first significant AI/ML project! I built the components for a decoder-only transformer and trained it on a corpus of Shakespeare's text. Scroll to the bottom to see some output :)

# %%
import torch as t
import numpy as np
from torch import nn
import fancy_einsum as einsum
import einops
import pandas as pd

# %% [markdown]
# ## transformer functions

# %% [markdown]
# This reuses the transformer components I made earlier this week, with optimizations added so it can use the GPU.
#
# And I did just that. The speed improvements are MASSIVE, wow!

# %%
device = t.device("cuda:0" if t.cuda.is_available() else "cpu")
t.cuda.is_available()

# %%
def multihead_masked_attention(Q: t.Tensor, K: t.Tensor, V: t.Tensor, num_heads: int):
    '''
    Implements multihead masked attention on the matrices Q, K and V.

    Q: shape (batch, seq_len, nheads*headsize)
    K: shape (batch, seq_len, nheads*headsize)
    V: shape (batch, seq_len, nheads*headsize)
    '''
    Q = einops.rearrange(Q, 'b s (n h) -> b n s h', n=num_heads)
    K = einops.rearrange(K, 'b s (n h) -> b n s h', n=num_heads)
    V = einops.rearrange(V, 'b s (n h) -> b n s h', n=num_heads)
    scores = einsum.einsum('b n k h, b n s h -> b n s k', K, Q)
    assert scores.shape == t.Size([Q.shape[0], num_heads, Q.shape[2], K.shape[2]])
    scores = scores / np.sqrt(Q.shape[-1])
    # causal mask: -inf above the diagonal, so each position can only attend to itself and earlier positions
    attention = scores + t.triu(t.ones_like(scores, device=device) * float("-inf"), diagonal=1) # THIS IS STOLEN FROM JAY - testing it out
    softed = t.softmax(attention, dim=-1)
    result = einsum.einsum('batch numheads seqQ seqK, batch numheads seqK headsize -> batch numheads seqQ headsize', softed, V)
    return einops.rearrange(result, 'batch numheads seqQ headsize -> batch seqQ (numheads headsize)')
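# %% [markdown]
# A quick sanity check that the mask really is causal: perturb only the last token and make sure the outputs at earlier positions don't move. (The sizes here are arbitrary, made up for the test.)

# %%
batch, seq, heads, headsize = 2, 8, 4, 16
x = t.randn(batch, seq, heads * headsize, device=device)
y = x.clone()
y[:, -1] += 1.0 # perturb only the final position
out_x = multihead_masked_attention(x, x, x, heads)
out_y = multihead_masked_attention(y, y, y, heads)
# every position except the last should be unaffected by the perturbation
assert t.allclose(out_x[:, :-1], out_y[:, :-1], atol=1e-5)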
# %%
class MultiheadMaskedAttention(nn.Module):
    W_QKV: nn.Linear
    W_O: nn.Linear

    def __init__(self, hidden_size: int, num_heads: int):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_size = hidden_size // num_heads
        # why a linear layer? a Linear *is* a learned matrix (plus a bias) - this is W_Q, W_K and W_V stacked into one
        self.WQKV = t.nn.Linear(self.hidden_size, 3 * hidden_size)
        self.W0 = t.nn.Linear(self.hidden_size, self.hidden_size)

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (batch, seq, hidden_size)

        Return: shape (batch, seq, hidden_size)
        '''
        x = x.float() # seems like it needs to be a float!
        QKV = self.WQKV(x)
        Q = QKV[:, :, :self.hidden_size]
        K = QKV[:, :, self.hidden_size:self.hidden_size * 2]
        V = QKV[:, :, self.hidden_size * 2:]
        assert Q.shape == K.shape == V.shape == x.shape
        return self.W0(multihead_masked_attention(Q, K, V, self.num_heads))

# %%
from dataclasses import dataclass

@dataclass(frozen=True)
class TransformerConfig:
    '''Constants used throughout your decoder-only transformer model.'''
    num_layers: int
    num_heads: int
    vocab_size: int
    hidden_size: int
    max_seq_len: int
    dropout: float = 0.1
    layer_norm_epsilon: float = 1e-05

# %%
# from yesterday
class PositionalEncoding(nn.Module):

    def __init__(self, embedding_dim: int, max_seq_len: int = 5000):
        super().__init__()
        self.dim = embedding_dim
        self.length = max_seq_len

        # mostly copied. i understand this, just need to work on making more tensors and getting more exposure to methods of making tensors
        def P(delta):
            # interleave sin/cos waves of geometrically decreasing frequency, as in the Attention paper
            n = 10000 # hardcoded
            d = embedding_dim
            sin_array = np.sin(delta / n ** (2 * np.arange(d // 2) / d))
            cos_array = np.cos(delta / n ** (2 * np.arange(d // 2) / d))
            array = np.zeros(d)
            array[::2] = sin_array
            array[1::2] = cos_array
            return array

        tokenArray = []
        for i in range(max_seq_len):
            tokenArray.append(P(i))

        # changed from previous design
        self.multMax = t.tensor(np.array(tokenArray), dtype=t.float, device=device)

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (batch, seq_len, embedding_dim)
        '''
        return x + self.multMax[:x.shape[1]]

# %%
class MLP(nn.Module):

    def __init__(self, config: TransformerConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.layers = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size * 4),
            nn.GELU(),
            nn.Linear(self.hidden_size * 4, self.hidden_size),
            nn.Dropout(config.dropout)
        )

    def forward(self, x: t.Tensor):
        x = x.float() # seems like it needs to be a float!
        return self.layers(x).float() # ima do the same thing again!

# %%
class DecoderBlock(nn.Module):

    def __init__(self, config: TransformerConfig):
        super().__init__()
        self.attentionBlock = nn.Sequential(
            MultiheadMaskedAttention(config.hidden_size, config.num_heads),
            nn.LayerNorm(config.hidden_size)
        )
        self.MLP = nn.Sequential(
            MLP(config),
            nn.LayerNorm(config.hidden_size)
        )

    def forward(self, x: t.Tensor) -> t.Tensor:
        partOne = x + self.attentionBlock(x)
        return (partOne + self.MLP(partOne)).float() # seems like it needs to be a float!

# %%
class DecoderOnlyTransformer(nn.Module):

    def __init__(self, config: TransformerConfig):
        super().__init__()
        self.tokenize = nn.Embedding(config.vocab_size, config.hidden_size).to(device)
        self.positionize = PositionalEncoding(config.hidden_size, config.max_seq_len)
        self.restModel = nn.Sequential(
            nn.Dropout(config.dropout),
            *[DecoderBlock(config) for i in range(config.num_layers)],
            nn.LayerNorm(config.hidden_size),
        )
        # weight-tied unembedding: reuse the embedding matrix (transposed) to map back to vocab logits
        self.unembed = self.tokenize.weight.T.to(device)

    def forward(self, x: t.Tensor) -> t.Tensor:
        x = self.tokenize(x)
        x = self.positionize(x)
        toUnembed = self.restModel(x).to(device)
        return toUnembed @ self.unembed
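# %% [markdown]
# A tiny smoke test with a made-up config, just to confirm the shapes work out: token ids of shape (batch, seq) should come out as logits of shape (batch, seq, vocab_size). (The config values here are arbitrary, not the ones used for training.)

# %%
tinyConfig = TransformerConfig(num_layers=2, num_heads=2, vocab_size=50, hidden_size=32, max_seq_len=16)
tinyModel = DecoderOnlyTransformer(tinyConfig).to(device)
dummy_tokens = t.randint(0, tinyConfig.vocab_size, (3, tinyConfig.max_seq_len), device=device)
logits = tinyModel(dummy_tokens)
assert logits.shape == (3, tinyConfig.max_seq_len, tinyConfig.vocab_size)
print(sum(p.numel() for p in tinyModel.parameters()), "parameters")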
# %% [markdown]
# ## Data Prep

# %% [markdown]
# Make the dataset to parse through all of the words

# %%
import re
from torch.utils.data import Dataset, DataLoader

class CustomTextDataset(Dataset):

    def __init__(self, words, seq_len, fractionOfWords):
        self.fractionOfWords = fractionOfWords
        self.words = words
        self.setOfWords = set(words)
        self.seq_len = seq_len
        self.max_len = len(self.words) - (self.seq_len + 1)
        self.vocab_size = len(self.setOfWords)
        self.word_to_token = {word: idx for (idx, word) in enumerate(sorted(self.setOfWords))}
        self.token_to_word = {idx: word for (idx, word) in enumerate(sorted(self.setOfWords))}
        self.allTokens = t.tensor([self.word_to_token[word] for word in self.words], device=device)
        if self.fractionOfWords > 0.9:
            print("Probably don't do this. Errors may abound")

    def __len__(self):
        return int(self.max_len * self.fractionOfWords)

    def __getitem__(self, idx):
        # each example is a window of seq_len + 1 tokens: the input drops the last token,
        # and the target is the same window shifted one token to the right
        tokens = self.allTokens[idx:idx + self.seq_len + 1]
        input = tokens[:-1]
        target = tokens[1:]
        return input, target

    def getDataSize(self):
        return self.vocab_size

    def convertToTokens(self, phrase: list) -> t.Tensor:
        return t.tensor([self.word_to_token[word] for word in phrase], device=device)

    def convertStringToTokenList(self, phrase: str) -> list:
        words = re.split(r"\b", phrase)
        return [self.word_to_token[word] for word in words]

    def convertToText(self, tokens: t.Tensor):
        temp = []
        for value in tokens:
            temp.append(self.token_to_word[value.item()])
        return temp

    def decodeList(self, words: list):
        temp = []
        for value in words:
            temp.append(self.token_to_word[value])
        return temp

    def listToString(self, words: list) -> str:
        temp = ""
        for word in words:
            temp = temp + word
        return temp

# %%
with open("shakespeare.txt") as file:
    text = file.read()
words = re.split(r"\b", text)

fractionOfWords = 0.1 # what fraction of the corpus to train on
lengthOfSeq = 100
shak = CustomTextDataset(words, lengthOfSeq, fractionOfWords)
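# %% [markdown]
# To see what one training example actually looks like, here's a toy corpus (any short sentence works) - note the one-token shift between input and target:

# %%
toy = CustomTextDataset(re.split(r"\b", "to be or not to be, that is the question"), seq_len=3, fractionOfWords=0.5)
inp, tgt = toy[0]
print(toy.convertToText(inp)) # e.g. ['', 'to', ' ']
print(toy.convertToText(tgt)) # the same window shifted right by one token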
# %% [markdown]
# ## Running this data through a transformer

# %%
trainloader = DataLoader(shak, batch_size=32, shuffle=True)

# this specific one trained for 24 minutes and 9 seconds on a Colab GPU
thisConfig = TransformerConfig(
    num_layers = 4, # 6 layers in the Attention paper
    num_heads = 4, # 8 heads in the Attention paper
    vocab_size = trainloader.dataset.getDataSize(), # 37000 tokens in the Attention paper (?)
    hidden_size = 512, # recall that this = num_heads * headsize | 512 is the embedding dim used in the Attention paper
    max_seq_len = lengthOfSeq,
    dropout = 0.1, # same as the Attention paper
    layer_norm_epsilon = 0.00001
)

# %%
use_pretrained = True

if use_pretrained:
    print("Using Pre-trained Model!")
    myTransformer = DecoderOnlyTransformer(thisConfig).to(device)
    optimizer = t.optim.Adam(myTransformer.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss().to(device)
    myTransformer.load_state_dict(t.load("toInfer.pt", map_location=device))
    myTransformer.eval()
else:
    print("Training Model... better hope you got enough GPU!")
    myTransformer = DecoderOnlyTransformer(thisConfig).to(device)
    optimizer = t.optim.Adam(myTransformer.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss().to(device)

    NUM_EPOCHS = 1
    losses = []
    myTransformer.train()
    for epoch in range(1, NUM_EPOCHS + 1):
        for inputs, targets in trainloader:
            outputs = myTransformer(inputs)
            targets = t.nn.functional.one_hot(targets, num_classes=trainloader.dataset.getDataSize()).float()
            # flatten (batch, seq) into one axis so CrossEntropyLoss sees a plain batch of predictions
            outputs = einops.rearrange(outputs, 'batch seq vocab -> (batch seq) vocab')
            targets = einops.rearrange(targets, 'batch seq vocab -> (batch seq) vocab')
            loss = criterion(outputs, targets)
            losses.append(loss.item())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
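# %% [markdown]
# Side note: the one-hot step above isn't strictly needed - `nn.CrossEntropyLoss` also accepts integer class indices directly, which skips materializing a (batch*seq, vocab) target tensor. A minimal sketch on dummy data:

# %%
# equivalent loss computation with integer targets (dummy shapes, just for illustration)
dummy_logits = t.randn(6, thisConfig.vocab_size, device=device) # (batch*seq, vocab)
dummy_targets = t.randint(0, thisConfig.vocab_size, (6,), device=device) # (batch*seq,)
print(nn.CrossEntropyLoss()(dummy_logits, dummy_targets))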
# %%
if not use_pretrained:
    df = pd.DataFrame(losses)
    df.plot()

# %%
# quick test - use the sample method if you wish to actually use the transformer:
myTransformer.eval()
testPhrase = ["Be", " ", "not", " ", "afraid", " ", "to", " ", "the", " ", "Florentine", "\n", "And"]
input = shak.convertToTokens(testPhrase)
input = input[None, :]
tokens = myTransformer(input).argmax(dim=-1)[0]
shak.convertToText(tokens)

# %% [markdown]
# # Sampling

# %%
def apply_sampling_methods(input_ids: t.Tensor, logits: t.Tensor, temperature=1.0, freq_penalty=0.0, top_k=0, top_p=0.0) -> int:
    # returns a next token based on the provided sampling method
    # thanks callum for this method
    assert input_ids.ndim == 1, "input_ids should be a 1D sequence of token ids"
    assert temperature >= 0, "Temperature should be non-negative"
    assert 0 <= top_p <= 1.0, "Top-p must be a probability"
    assert 0 <= top_k, "Top-k must be non-negative"
    assert not (top_p != 0 and top_k != 0), "At most one of top-p and top-k supported"

    if temperature == 0:
        return greedy_search(logits)
    if temperature != 1.0:
        logits = apply_temperature(logits, temperature)
    if freq_penalty != 0.0:
        logits = apply_freq_penalty(input_ids, logits, freq_penalty)
    if top_k > 0:
        return sample_top_k(logits, top_k)
    if top_p > 0:
        return sample_top_p(logits, top_p)
    return sample_basic(logits)

def sample_tokens(model, encodeMethod, decodeMethod, initial_text: str, max_tokens_generated=40, **kwargs) -> list:
    # samples tokens until the model outputs eos_token_id or the token limit is reached
    model.eval()
    input_ids: list = encodeMethod(initial_text)
    generated_ids = []
    device = next(model.parameters()).device # next() grabs the first parameter; its .device tells us where the model lives
    tokens_to_generate = max_tokens_generated - len(input_ids)
    for _ in range(tokens_to_generate):
        new_input_ids = t.tensor(input_ids + generated_ids, dtype=t.int64, device=device)
        logits = model(new_input_ids.unsqueeze(0))[0, -1]
        new_token = apply_sampling_methods(new_input_ids, logits, **kwargs)
        generated_ids.append(new_token)
    return decodeMethod(input_ids + generated_ids)

# quick test:
myTransformer.eval()
testPhrase = ["Be", " ", "not", " ", "afraid", " ", "to", " ", "the", " ", "Florentine", "\n", "And"]
input = shak.convertToTokens(testPhrase)
type(input)

# %%
def greedy_search(logits):
    '''
    returns the most likely next token, BUT THE TIEBREAKER IS INCORRECT! i got lazy - it *is*
    deterministic, but it just doesn't necessarily choose the smallest word out of the tie.
    perhaps treat it as a symbol of my ingenuity?
    '''
    return logits.argmax(dim=-1).item()

# %%
def sample_basic(logits) -> int:
    '''
    samples from the distribution, possibly with temp and freq changes applied

    logits: shape (vocab_size, ) - unnormalized log-probabilities

    return: a sampled token
    '''
    probs = t.distributions.categorical.Categorical(logits=logits)
    return probs.sample().item()

N = 20000
probs = t.linspace(0, 0.4, 5)
unnormalized_logits = probs.log() + 1.2345
samples = t.tensor([sample_basic(unnormalized_logits) for _ in range(N)])
counts = t.bincount(samples, minlength=len(probs)) / N
print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
#t.testing.assert_close(counts, probs, atol=0.01, rtol=0)
print("Tests passed!")

# %%
def apply_freq_penalty(input_ids: t.Tensor, logits: t.Tensor, freq_penalty: float) -> t.Tensor:
    '''
    input_ids: shape (seq, )
    logits: shape (vocab_size, )

    Return: shape (vocab_size, )
    '''
    (vocab_size,) = logits.shape
    # subtract freq_penalty once per prior occurrence of each token
    id_freqs = t.bincount(input_ids, minlength=vocab_size)
    return logits - freq_penalty * id_freqs

bieber_prompt = "And I was like baby, baby, baby, oh Like, baby, baby, baby, no Like, baby, baby, baby, oh I thought you'd always be mine, mine"
input_ids = shak.convertStringToTokenList(bieber_prompt)
logits = t.ones(shak.getDataSize()).to(device)
penalized_logits = apply_freq_penalty(t.tensor(input_ids).to(device), logits, 2.0)
# i believe mine is different!
#assert penalized_logits[5156].item() == -11, "Expected 6 occurrences of ' baby' with leading space"
#assert penalized_logits[14801].item() == -5, "Expected 3 occurrences of ' Baby' with leading space"
#print("Tests passed!")
print(penalized_logits[2037].item()) # should be low since it was found!
shak.convertStringToTokenList("And")

# %%
def apply_temperature(logits: t.Tensor, temperature: float) -> t.Tensor:
    assert temperature > 0, "temp cannot be less than or equal to 0"
    return logits / temperature

logits = t.tensor([1, 2]).log()
cold_logits = apply_temperature(logits, 0.001)
#print('A low temperature "sharpens" or "peaks" the distribution: ', cold_logits)
#t.testing.assert_close(cold_logits, 1000.0 * logits)
hot_logits = apply_temperature(logits, 1000.0)
#print("A high temperature flattens the distribution: ", hot_logits)
#t.testing.assert_close(hot_logits, 0.001 * logits)
#print("Tests passed!")
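# %% [markdown]
# Why dividing by the temperature does what the (commented-out) prints claim: sampling uses
#
# $$p_i = \frac{e^{l_i / T}}{\sum_j e^{l_j / T}},$$
#
# so $T \to 0$ exaggerates the gaps between logits (approaching greedy argmax), while $T \to \infty$ shrinks them, flattening the distribution toward uniform.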
# %%
# N_RUNS = 1
# your_prompt = "We are the champions, my friends"
# cases = [
#     ("High freq penalty", dict(freq_penalty=100.0)),
#     ("Negative freq penalty", dict(freq_penalty=-1.0)),
#     ("Too hot!", dict(temperature=2.0)),
#     ("Pleasantly cool", dict(temperature=0.7)),
#     ("Pleasantly warm", dict(temperature=0.9)),
#     ("Too cold!", dict(temperature=0.01)),
# ]
# for (name, kwargs) in cases:
#     for i in range(N_RUNS):
#         output = sample_tokens(myTransformer, shak.convertStringToTokenList, shak.decodeList, your_prompt, max_tokens_generated=24, **kwargs)
#         print(f"Sample {i} with: {name} ({kwargs}):")
#         print(f"Your model said: {shak.listToString(output)}\n")

# %%
def sample_top_k(logits: t.Tensor, top_k: int) -> int:
    '''
    logits: shape (vocab_size, ) - unnormalized log-probabilities
    top_k: only consider this many of the most likely tokens for sampling

    Return: a sampled token
    '''
    topk = t.topk(logits, top_k).indices
    # -inf everywhere except the top-k entries, so everything else gets zero probability
    mask = t.full_like(logits, float("-inf"))
    mask[topk] = 0
    return sample_basic(logits + mask)

k = 3
probs = t.linspace(0, 0.4, 5)
unnormalized_logits = probs.log() + 1.2345
samples = t.tensor([sample_top_k(unnormalized_logits, k) for _ in range(N)])
counts = t.bincount(samples, minlength=len(probs)) / N
expected = probs.clone()
expected[:-k] = 0
expected /= expected.sum()
# print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
# t.testing.assert_close(counts, expected, atol=0.01, rtol=0)
# print("Tests passed!")

# %%
def sample_top_p(logits: t.Tensor, top_p: float, min_tokens_to_keep: int = 1) -> int:
    '''
    logits: shape (vocab_size, ) - unnormalized log-probabilities

    Return: a sampled token
    '''
    # find the indices of the important logits
    sorted_logits, indices = t.sort(logits, descending=True)
    probs = t.nn.functional.softmax(sorted_logits, dim=-1)
    num_words_kept = 0
    cumulative = 0
    while cumulative < top_p:
        cumulative = cumulative + probs[num_words_kept]
        num_words_kept = num_words_kept + 1
    if num_words_kept < min_tokens_to_keep:
        num_words_kept = min_tokens_to_keep
    important_indices = indices[:num_words_kept]

    # -inf everywhere except the kept entries, so everything else gets zero probability
    mask = t.full_like(logits, float("-inf"))
    mask[important_indices] = 0
    return sample_basic(logits + mask)

N = 2000
unnormalized_logits = t.tensor([0.2, 0.3, 0.5]).log() + 2.3456
samples = t.tensor([sample_top_p(unnormalized_logits, 0.5) for _ in range(N)])
counts = t.bincount(samples, minlength=len(unnormalized_logits)) / N
# print("top_p of 0.5 or lower should only return token 2: ", counts)
# assert counts[0] == 0 and counts[1] == 0

N = 2000
unnormalized_logits = t.tensor([0.2, 0.3, 0.5]).log() + 2.3456
samples = t.tensor([sample_top_p(unnormalized_logits, 0.50001) for _ in range(N)])
counts = t.bincount(samples, minlength=len(unnormalized_logits)) / N
# print("top_p in (0.5, 0.8] should return tokens 1 and 2: ", counts)
# assert counts[0] == 0

N = 4000
top_p = 0.71
probs = t.linspace(0, 0.4, 5)
unnormalized_logits = probs.log() + 1.2345
samples = t.tensor([sample_top_p(unnormalized_logits, top_p) for _ in range(N)])
counts = t.bincount(samples, minlength=len(probs)) / N
expected = probs.clone()
expected[0:2] = 0
expected /= expected.sum()
# print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
#t.testing.assert_close(counts, expected, atol=0.01, rtol=0.0)
# print("All tests passed!")
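# %% [markdown]
# The while loop above scans probabilities one at a time; the same thing can be vectorized with a cumulative sum. A sketch of an equivalent alternative (the loop version is what the tests above actually ran):

# %%
def sample_top_p_vectorized(logits: t.Tensor, top_p: float, min_tokens_to_keep: int = 1) -> int:
    sorted_logits, indices = t.sort(logits, descending=True)
    cumulative = t.cumsum(t.softmax(sorted_logits, dim=-1), dim=-1)
    # count entries strictly below top_p, then keep one more to cross the threshold
    num_kept = max(int((cumulative < top_p).sum().item()) + 1, min_tokens_to_keep)
    mask = t.full_like(logits, float("-inf"))
    mask[indices[:num_kept]] = 0
    return sample_basic(logits + mask)

# spot-check against the loop version on the same logits as the last test
print(t.tensor([sample_top_p_vectorized(unnormalized_logits, 0.71) for _ in range(1000)]).bincount(minlength=5) / 1000)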
# %% [markdown]
# # Speak, Shakespeare!

# %%
input = "Death waits at the door"
print(shak.listToString(sample_tokens(myTransformer, shak.convertStringToTokenList, shak.decodeList, input, 80, freq_penalty=0.1, top_k=10)))

# %% [markdown]
# ## Save the model for future use
# (This was over 20 minutes of GPU computation. Not too shabby!)

# %%
t.save(myTransformer.state_dict(), "toInfer.pt")

# %% [markdown]
# # Publish to Gradio
# About a month after making this I realized it should be online, so I'll push it to Gradio.

# %%
import gradio as gr

def speak(input, tokenLength):
    print("-------------------------------------------")
    print("input: " + input)
    try:
        result = shak.listToString(sample_tokens(myTransformer, shak.convertStringToTokenList, shak.decodeList, input, tokenLength, freq_penalty=0.1, top_k=10))
    except KeyError:
        # convertStringToTokenList raises KeyError for any word not in the training vocabulary
        return "one or more of the words is not compatible with the model; please try a different phrase"
    print("worked! output:")
    print(result)
    return result

model = gr.Interface(
    fn=speak,
    css="custom_theme.css",
    inputs=[
        gr.Textbox(label="initial text", placeholder="To be or not to be"),
        gr.Slider(20, 40, step=1, value=40)
    ],
    outputs="text",
    title="speak shakespeare, speak!",
    description="a miniature shakespeare, built from scratch. Decoder-Only Transformer trained on shakespeare's works.\nmany, but not all, words are tokenizable - if you get an error, try again with different words!"
)

model.launch(share=False)
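# %% [markdown]
# One caveat worth flagging: `toInfer.pt` only stores the weights, so inference elsewhere also needs the exact `word_to_token` mapping the model was trained with (here it's rebuilt deterministically from shakespeare.txt). If the corpus isn't available at load time, the mapping could be saved alongside the weights - a hypothetical sketch:

# %%
# torch.save can pickle arbitrary Python objects, including dicts
t.save(shak.word_to_token, "vocab.pt") # hypothetical companion file, not part of the original project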