Commit 1013b6b
Parent(s): Duplicate from starship006/mini_shakespeare
Co-authored-by: Cody Rushing <[email protected]>
- .gitattributes +34 -0
- README.md +13 -0
- app.py +632 -0
- requirements.txt +8 -0
- shakespeare.txt +0 -0
- toInfer.pt +3 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,13 @@
---
title: Mini Shakespeare
emoji: 🐢
colorFrom: gray
colorTo: purple
sdk: gradio
sdk_version: 3.12.0
app_file: app.py
pinned: false
duplicated_from: starship006/mini_shakespeare
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,632 @@
# %% [markdown]
# <a href="https://colab.research.google.com/github/starship006/ARENA-work/blob/main/w1/w1d4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# %% [markdown]
# # Training Shakespeare Himself

# %% [markdown]
# For those who are not part of the ARENA program and are curious what this is: this was my first significant AI/ML project! I built the components for a decoder-only transformer and trained it on a corpus of Shakespeare's text. Scroll to the bottom to see some output :)
# %%
import torch as t
import numpy as np
from torch import nn
import fancy_einsum as einsum
import einops
import pandas as pd


# %% [markdown]
# ## transformer functions
#
#

# %% [markdown]
# This reuses the transformer components I made earlier this week, with optimizations added so it can use the GPU.
#
# And I did just that. The speed improvements are MASSIVE, wow!

# %%
device = t.device("cuda:0" if t.cuda.is_available() else "cpu")
t.cuda.is_available()

# %%
def multihead_masked_attention(Q: t.Tensor, K: t.Tensor, V: t.Tensor, num_heads: int):
    '''
    Implements multihead masked attention on the matrices Q, K and V.

    Q: shape (batch, seq_len, nheads*headsize)
    K: shape (batch, seq_len, nheads*headsize)
    V: shape (batch, seq_len, nheads*headsize)
    '''

    Q = einops.rearrange(Q, 'b s (n h) -> b n s h', n = num_heads)
    K = einops.rearrange(K, 'b s (n h) -> b n s h', n = num_heads)
    V = einops.rearrange(V, 'b s (n h) -> b n s h', n = num_heads)

    scores = einsum.einsum('b n k h, b n s h -> b n s k', K, Q)
    assert scores.shape == t.Size([Q.shape[0], num_heads, Q.shape[2], K.shape[2]])

    scores = scores / np.sqrt(Q.shape[-1])
    # mask out future positions with -inf above the diagonal (adapted from Jay - testing it out)
    attention = scores + t.triu(t.ones_like(scores, device = device) * float("-inf"), diagonal=1)
    softed = t.softmax(attention, dim=-1)
    result = einsum.einsum('batch numheads seqQ seqK, batch numheads seqK headsize -> batch numheads seqQ headsize', softed, V)
    return einops.rearrange(result, 'batch numheads seqQ headsize -> batch seqQ (numheads headsize)')
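
# %%
# A quick illustrative sanity check on the function above (the batch/seq/head
# sizes here are arbitrary): output shape is preserved, and masking means that
# perturbing the last position must not change the outputs at earlier positions.
_batch, _seq, _nheads, _headsize = 2, 5, 4, 8
_x = t.randn(_batch, _seq, _nheads * _headsize, device=device)
_out = multihead_masked_attention(_x, _x, _x, _nheads)
assert _out.shape == (_batch, _seq, _nheads * _headsize)
_x2 = _x.clone()
_x2[:, -1, :] += 1.0
_out2 = multihead_masked_attention(_x2, _x2, _x2, _nheads)
assert t.allclose(_out[:, :-1], _out2[:, :-1], atol=1e-5)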

# %%
class MultiheadMaskedAttention(nn.Module):
    WQKV: nn.Linear
    W0: nn.Linear

    def __init__(self, hidden_size: int, num_heads: int):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_size = hidden_size // num_heads

        # an nn.Linear *is* a learned matrix (plus a bias), wrapped so its
        # parameters get registered and trained automatically
        self.WQKV = t.nn.Linear(self.hidden_size, 3 * hidden_size)
        self.W0 = t.nn.Linear(self.hidden_size, self.hidden_size)

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (batch, seq, hidden_size)

        Return: shape (batch, seq, hidden_size)
        '''
        x = x.float() # seems like it needs to be a float!
        QKV = self.WQKV(x)
        Q = QKV[:, :, :self.hidden_size]
        K = QKV[:, :, self.hidden_size:self.hidden_size * 2]
        V = QKV[:, :, self.hidden_size * 2:]
        assert Q.shape == K.shape == V.shape == x.shape
        return self.W0(multihead_masked_attention(Q, K, V, self.num_heads))

# %%
from dataclasses import dataclass

@dataclass(frozen=True)
class TransformerConfig:
    '''Constants used throughout your decoder-only transformer model.'''

    num_layers: int
    num_heads: int
    vocab_size: int
    hidden_size: int
    max_seq_len: int
    dropout: float = 0.1
    layer_norm_epsilon: float = 1e-05

# %%
# from yesterday
class PositionalEncoding(nn.Module):

    def __init__(self, embedding_dim: int, max_seq_len: int = 5000):
        super().__init__()
        self.dim = embedding_dim
        self.length = max_seq_len

        # mostly copied. i understand this, just need to work on
        # making more tensors and getting more exposure to methods of making tensors
        def P(delta):
            n = 10000 # hardcoded
            d = embedding_dim
            sin_array = np.sin(delta / n ** (2 * np.arange(d//2) / d))
            cos_array = np.cos(delta / n ** (2 * np.arange(d//2) / d))

            array = np.zeros(d)
            array[::2] = sin_array
            array[1::2] = cos_array

            return array

        tokenArray = []
        for i in range(max_seq_len):
            tokenArray.append(P(i)) # changed from previous design

        self.multMax = t.tensor(np.array(tokenArray), dtype=t.float, device = device)

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (batch, seq_len, embedding_dim)
        '''
        return x + self.multMax[:x.shape[1]]
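
# %%
# Quick illustrative check of the encoding table (small arbitrary sizes):
# at position 0 the sinusoidal formula gives [sin(0), cos(0), ...] = [0, 1, 0, 1, ...].
_pe = PositionalEncoding(embedding_dim=8, max_seq_len=16)
assert t.allclose(_pe.multMax[0, 0::2], t.zeros(4, device=device))
assert t.allclose(_pe.multMax[0, 1::2], t.ones(4, device=device))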

# %%
class MLP(nn.Module):
    def __init__(self, config: TransformerConfig):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.layers = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size * 4),
            nn.GELU(),
            nn.Linear(self.hidden_size * 4, self.hidden_size),
            nn.Dropout(config.dropout)
        )
    def forward(self, x: t.Tensor):
        x = x.float() # seems like it needs to be a float!
        return self.layers(x).float() # ima do the same thing again!


# %%
class DecoderBlock(nn.Module):

    def __init__(self, config: TransformerConfig):
        super().__init__()
        self.attentionBlock = nn.Sequential(
            MultiheadMaskedAttention(config.hidden_size, config.num_heads),
            nn.LayerNorm(config.hidden_size)
        )
        self.MLP = nn.Sequential(
            MLP(config),
            nn.LayerNorm(config.hidden_size)
        )

    def forward(self, x: t.Tensor) -> t.Tensor:
        partOne = x + self.attentionBlock(x)
        return (partOne + self.MLP(partOne)).float() # seems like it needs to be a float!


# %%
class DecoderOnlyTransformer(nn.Module):

    def __init__(self, config: TransformerConfig):
        super().__init__()
        self.tokenize = nn.Embedding(config.vocab_size, config.hidden_size).to(device)
        self.positionize = PositionalEncoding(config.hidden_size, config.max_seq_len)
        self.restModel = nn.Sequential(
            nn.Dropout(config.dropout),
            *[DecoderBlock(config) for i in range(config.num_layers)],
            nn.LayerNorm(config.hidden_size),
        )
        # weight tying: unembed with the transpose of the embedding matrix
        self.unembed = self.tokenize.weight.T.to(device)

    def forward(self, x: t.Tensor) -> t.Tensor:
        x = self.tokenize(x)
        x = self.positionize(x)
        toUnembed = self.restModel(x).to(device)
        return toUnembed @ self.unembed
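
# %%
# Smoke test with a tiny, arbitrary config (illustrative only): the logits
# should come out with shape (batch, seq, vocab_size).
_tinyConfig = TransformerConfig(num_layers=1, num_heads=2, vocab_size=50, hidden_size=16, max_seq_len=12)
_tinyModel = DecoderOnlyTransformer(_tinyConfig).to(device)
_tinyTokens = t.randint(0, _tinyConfig.vocab_size, (2, 12), device=device)
assert _tinyModel(_tinyTokens).shape == (2, 12, _tinyConfig.vocab_size)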

# %% [markdown]
# ## Data Prep

# %% [markdown]
# Make the dataset to parse through all of the words

# %%
import re
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets

class CustomTextDataset(Dataset):
    def __init__(self, words, seq_len, fractionOfWords):
        self.fractionOfWords = fractionOfWords
        self.words = words
        self.setOfWords = set(words)
        self.seq_len = seq_len
        self.max_len = len(self.words) - (self.seq_len + 1)
        self.vocab_size = len(self.setOfWords)
        self.word_to_token = {word: idx for (idx, word) in enumerate(sorted(self.setOfWords))}
        self.token_to_word = {idx: word for (idx, word) in enumerate(sorted(self.setOfWords))}
        self.allTokens = t.tensor([self.word_to_token[word] for word in self.words], device = device)

        if (self.fractionOfWords > 0.9):
            print("Probably don't do this. Errors may abound")

    def __len__(self):
        return int(self.max_len * self.fractionOfWords)

    def __getitem__(self, idx):
        tokens = self.allTokens[idx:idx + self.seq_len + 1]
        input = tokens[:-1]
        target = tokens[1:]
        return input, target

    def getDataSize(self):
        return self.vocab_size

    def convertToTokens(self, phrase: list) -> t.tensor:
        return t.tensor([self.word_to_token[word] for word in phrase], device = device)

    def convertStringToTokenList(self, phrase: str) -> list:
        words = re.split(r"\b", phrase)
        return [self.word_to_token[word] for word in words]

    def convertToText(self, tokens: t.tensor):
        temp = []
        for i, value in enumerate(tokens):
            temp.append(self.token_to_word[value.item()])
        return temp

    def decodeList(self, words: list):
        temp = []
        for value in words:
            temp.append(self.token_to_word[value])
        return temp

    def listToString(self, words: list) -> str:
        temp = ""
        for word in words:
            temp = temp + word
        return temp

# %%
file = open("shakespeare.txt")
text = file.read()
words = re.split(r"\b", text)

fractionOfWords = 0.1 # what fraction of the corpus to train on

lengthOfSeq = 100

shak = CustomTextDataset(words, lengthOfSeq, fractionOfWords)
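
# %%
# One sample from the dataset, to make the next-token objective concrete
# (illustrative): the target is just the input shifted left by one token.
_inp, _tgt = shak[0]
assert t.equal(_inp[1:], _tgt[:-1])
print(shak.listToString(shak.convertToText(_inp[:10])))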

# %% [markdown]
# ## Running this data through a transformer

# %%
trainloader = DataLoader(shak, batch_size=32, shuffle=True)

# this specific one trained for 24 minutes and 9 seconds on a Colab GPU

thisConfig = TransformerConfig(
    num_layers = 4, # 6 layers in the Attention paper
    num_heads = 4, # 8 heads in Attention paper
    vocab_size = trainloader.dataset.getDataSize(), # 37000 tokens in Attention paper (?)
    hidden_size = 512, # recall that this = num_heads * headsize | 512 is the embedding dim used in Attention paper
    max_seq_len = lengthOfSeq,
    dropout = 0.1, # same as Attention paper
    layer_norm_epsilon=0.00001
)

# %%
use_pretrained = True
if use_pretrained:
    print("Using Pre-trained Model!")
    myTransformer = DecoderOnlyTransformer(thisConfig).to(device)
    optimizer = t.optim.Adam(myTransformer.parameters(), lr = 1e-3)
    criterion = nn.CrossEntropyLoss().to(device)
    myTransformer.load_state_dict(t.load("toInfer.pt", map_location=device))
    myTransformer.eval()
else:
    print("Training Model... better hope you got enough GPU!")
    myTransformer = DecoderOnlyTransformer(thisConfig).to(device)
    optimizer = t.optim.Adam(myTransformer.parameters(), lr = 1e-3)
    criterion = nn.CrossEntropyLoss().to(device)
    NUM_EPOCHS = 1

    losses = []
    myTransformer.train()
    for epoch in range(1, NUM_EPOCHS + 1):
        for inputs, targets in trainloader:
            outputs = myTransformer(inputs).to(device)
            targets = t.nn.functional.one_hot(targets, num_classes=trainloader.dataset.getDataSize()).float().to(device)

            outputs = einops.rearrange(outputs, 'batch seq vocab -> (batch seq) vocab')
            targets = einops.rearrange(targets, 'batch seq vocab -> (batch seq) vocab')

            outputs = outputs.to(device)
            targets = targets.to(device)
            loss = criterion(outputs, targets).to(device)

            losses.append(loss.item())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()


# %%
if not use_pretrained:
    df = pd.DataFrame(losses)
    df.plot()

# %%
# quick test - use the sample method if you wish to actually use the transformer:

myTransformer.eval()

testPhrase = ["Be", " ", "not", " ", "afraid", " ", "to", " ", "the", " ", "Florentine", "\n",
              "And"]
input = shak.convertToTokens(testPhrase)
input = input[None, :]
tokens = myTransformer(input).argmax(dim=-1)[0]
shak.convertToText(tokens)

# %% [markdown]
# # Sampling

# %%
def apply_sampling_methods(input_ids: t.Tensor, logits: t.Tensor, temperature=1.0, freq_penalty=0.0, top_k=0, top_p=0.0) -> int:
    # returns a next token based on the provided sampling method
    # thanks Callum for this method
    assert input_ids.ndim == 1, "input_ids should be a 1D sequence of token ids"
    assert temperature >= 0, "Temperature should be non-negative"
    assert 0 <= top_p <= 1.0, "Top-p must be a probability"
    assert 0 <= top_k, "Top-k must be non-negative"
    assert not (top_p != 0 and top_k != 0), "At most one of top-p and top-k supported"

    if temperature == 0:
        return greedy_search(logits)
    if temperature != 1.0:
        logits = apply_temperature(logits, temperature)
    if freq_penalty != 0.0:
        logits = apply_freq_penalty(input_ids, logits, freq_penalty)
    if top_k > 0:
        return sample_top_k(logits, top_k)
    if top_p > 0:
        return sample_top_p(logits, top_p)
    return sample_basic(logits)


def sample_tokens(
    model,
    encodeMethod,
    decodeMethod,
    initial_text: str,
    max_tokens_generated = 40,
    **kwargs) -> list:
    # samples tokens autoregressively until the token limit is reached

    model.eval()
    input_ids: list = encodeMethod(initial_text)
    generated_ids = []
    # next() grabs the first parameter, which tells us the device the model lives on
    device = next(model.parameters()).device

    tokens_to_generate = max_tokens_generated - len(input_ids)
    for _ in range(tokens_to_generate):
        new_input_ids = t.tensor(input_ids + generated_ids, dtype=t.int64, device=device)
        logits = model(new_input_ids.unsqueeze(0))[0, -1]
        new_token = apply_sampling_methods(new_input_ids, logits, **kwargs)
        generated_ids.append(new_token)

    return decodeMethod(input_ids + generated_ids)


# quick test:

myTransformer.eval()

testPhrase = ["Be", " ", "not", " ", "afraid", " ", "to", " ", "the", " ", "Florentine", "\n",
              "And"]
input = shak.convertToTokens(testPhrase)
type(input)


# %%
def greedy_search(logits):
    '''
    returns the most likely next token, BUT THE TIEBREAKER IS INCORRECT!
    i got lazy - it *is* deterministic, but it just doesn't necessarily
    choose the smallest word out of the tie. perhaps treat it as a symbol
    of my ingenuity?
    '''
    return logits.argmax(dim=-1).item()

# %%
def sample_basic(logits) -> int:
    '''
    samples from the distribution, possibly with temp and freq changes applied

    logits: shape (vocab_size, ) - unnormalized log-probabilities

    return: a sampled token
    '''
    probs = t.distributions.categorical.Categorical(logits=logits)
    return probs.sample().item()

N = 20000
probs = t.linspace(0, 0.4, 5)
unnormalized_logits = probs.log() + 1.2345
samples = t.tensor([sample_basic(unnormalized_logits) for _ in range(N)])
counts = t.bincount(samples, minlength=len(probs)) / N
print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
#t.testing.assert_close(counts, probs, atol=0.01, rtol=0)
print("Tests passed!")

# %%
def apply_freq_penalty(input_ids: t.Tensor, logits: t.Tensor, freq_penalty: float) -> t.Tensor:
    '''
    input_ids: shape (seq, )
    logits: shape (vocab_size, )

    Return: shape (vocab_size, )
    '''
    (vocab_size,) = logits.shape
    id_freqs = t.bincount(input_ids, minlength=vocab_size)
    return logits - freq_penalty * id_freqs

bieber_prompt = "And I was like baby, baby, baby, oh Like, baby, baby, baby, no Like, baby, baby, baby, oh I thought you'd always be mine, mine"
input_ids = shak.convertStringToTokenList(bieber_prompt)
logits = t.ones(shak.getDataSize()).to(device)
penalized_logits = apply_freq_penalty(t.tensor(input_ids).to(device), logits, 2.0)
# i believe mine is different!
#assert penalized_logits[5156].item() == -11, "Expected 6 occurrences of ' baby' with leading space"
#assert penalized_logits[14801].item() == -5, "Expected 3 occurrences of ' Baby' with leading space"
#print("Tests passed!")

print(penalized_logits[2037].item()) # should be low since it was found!
shak.convertStringToTokenList("And")

# %%
def apply_temperature(logits: t.Tensor, temperature: float) -> t.Tensor:
    assert temperature > 0, "temp cannot be less than or equal to 0"

    return logits / temperature

logits = t.tensor([1, 2]).log()
cold_logits = apply_temperature(logits, 0.001)
#print('A low temperature "sharpens" or "peaks" the distribution: ', cold_logits)
#t.testing.assert_close(cold_logits, 1000.0 * logits)
hot_logits = apply_temperature(logits, 1000.0)
#print("A high temperature flattens the distribution: ", hot_logits)
#t.testing.assert_close(hot_logits, 0.001 * logits)
#print("Tests passed!")

# %%
# N_RUNS = 1
# your_prompt = "We are the champions, my friends"
# cases = [
#     ("High freq penalty", dict(freq_penalty=100.0)),
#     ("Negative freq penalty", dict(freq_penalty=-1.0)),
#     ("Too hot!", dict(temperature=2.0)),
#     ("Pleasantly cool", dict(temperature=0.7)),
#     ("Pleasantly warm", dict(temperature=0.9)),
#     ("Too cold!", dict(temperature=0.01)),
# ]
# for (name, kwargs) in cases:
#     for i in range(N_RUNS):
#         output = sample_tokens(myTransformer, shak.convertStringToTokenList, shak.decodeList, your_prompt, max_tokens_generated=24, **kwargs)
#         print(f"Sample {i} with: {name} ({kwargs}):")
#         print(f"Your model said: {shak.listToString(output)}\n")

# %%
def sample_top_k(logits: t.Tensor, top_k: int) -> int:
    '''
    logits: shape (vocab_size, ) - unnormalized log-probabilities
    top_k: only consider this many of the most likely tokens for sampling

    Return: a sampled token
    '''
    topk = t.topk(logits, top_k).indices
    # -inf everywhere except the top-k positions (full_like keeps device/dtype)
    almost_zeroes = t.full_like(logits, float("-inf"))
    almost_zeroes[topk] = 0
    logits = logits + almost_zeroes
    return sample_basic(logits)

k = 3
probs = t.linspace(0, 0.4, 5)
unnormalized_logits = probs.log() + 1.2345
samples = t.tensor([sample_top_k(unnormalized_logits, k) for _ in range(N)])
counts = t.bincount(samples, minlength=len(probs)) / N
expected = probs.clone()
expected[:-k] = 0
expected /= expected.sum()
# print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
# #t.testing.assert_close(counts, expected, atol=0.01, rtol=0)
# print("Tests passed!")

# %%
def sample_top_p(logits: t.Tensor, top_p: float, min_tokens_to_keep: int = 1) -> int:
    '''
    logits: shape (vocab_size, ) - unnormalized log-probabilities

    Return: a sampled token
    '''
    # find the indices of important logits
    sorted_logits, indices = t.sort(logits, descending=True)
    probs = t.nn.functional.softmax(sorted_logits, dim=-1)
    num_words_kept = 0
    total = 0
    while total < top_p:
        total = total + probs[num_words_kept]
        num_words_kept = num_words_kept + 1

    if num_words_kept < min_tokens_to_keep:
        num_words_kept = min_tokens_to_keep

    important_indices = indices[:num_words_kept]

    # -inf everywhere except the kept positions, to zero out small logits
    almost_zeroes = t.full_like(logits, float("-inf"))
    almost_zeroes[important_indices] = 0
    logits = logits + almost_zeroes
    return sample_basic(logits)

N = 2000
unnormalized_logits = t.tensor([0.2, 0.3, 0.5]).log() + 2.3456
samples = t.tensor([sample_top_p(unnormalized_logits, 0.5) for _ in range(N)])
counts = t.bincount(samples, minlength=len(unnormalized_logits)) / N
# print("top_p of 0.5 or lower should only return token 2: ", counts)
# assert counts[0] == 0 and counts[1] == 0

N = 2000
unnormalized_logits = t.tensor([0.2, 0.3, 0.5]).log() + 2.3456
samples = t.tensor([sample_top_p(unnormalized_logits, 0.50001) for _ in range(N)])
counts = t.bincount(samples, minlength=len(unnormalized_logits)) / N
# print("top_p in (0.5, 0.8] should return tokens 1 and 2: ", counts)
# assert counts[0] == 0

N = 4000
top_p = 0.71
probs = t.linspace(0, 0.4, 5)
unnormalized_logits = probs.log() + 1.2345
samples = t.tensor([sample_top_p(unnormalized_logits, top_p) for _ in range(N)])
counts = t.bincount(samples, minlength=len(probs)) / N
expected = probs.clone()
expected[0:2] = 0
expected /= expected.sum()
# print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
#t.testing.assert_close(counts, expected, atol=0.01, rtol=0.0)

# print("All tests passed!")

# %% [markdown]
# # Speak, Shakespeare!

# %%
input = "Death waits at the door"

print(shak.listToString(sample_tokens(myTransformer, shak.convertStringToTokenList, shak.decodeList,
                                      input, 80, freq_penalty=0.1, top_k = 10)))

# %% [markdown]
# ## Save the model for future use
# (This was over 20 minutes of GPU computation. Not too shabby!)

# %%
t.save(myTransformer.state_dict(), "toInfer.pt")

# %% [markdown]
# # Publish to Gradio
# About a month after making this I realized it should be online, so I'll push it to Gradio.

# %%
import gradio as gr
def speak(input, tokenLength):
    print("-------------------------------------------")
    print("input: " + input)
    try:
        result = shak.listToString(sample_tokens(myTransformer, shak.convertStringToTokenList, shak.decodeList,
                                                 input, tokenLength, freq_penalty=0.1, top_k = 10))
    except Exception: # most likely a KeyError from an out-of-vocabulary word
        return "one or more of the words is not compatible with the model; please try a different phrase"

    print("worked! output:")
    print(result)
    return result


model = gr.Interface(fn=speak,
                     inputs=[gr.Textbox(label = "initial text", placeholder="To be or not to be"), gr.Slider(40, 80, step=1, value=80)],
                     outputs="text",
                     title = "speak shakespeare, speak!",
                     description = "a miniature shakespeare, built from scratch by Cody Rushing via a Decoder-Only Transformer trained on shakespeare's works.\n many, but not all, words are tokenizable - if you get an error, try again with different words!")

model.launch(share=False)

# %%
requirements.txt
ADDED
@@ -0,0 +1,8 @@
torch
numpy
fancy_einsum
einops
pandas
dataclasses
torchvision
gradio
shakespeare.txt
ADDED
The diff for this file is too large to render.
toInfer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6d4b13307f2768c5c0811dcaae1c1f2f04a7066ee9f723fcca16ba56fa5e7f45
size 120571916
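
Since toInfer.pt is tracked by Git LFS, a clone without LFS installed leaves you with this three-line pointer file instead of the ~120 MB checkpoint. One quick way to confirm the real weights were fetched is to hash the file against the oid above (an illustrative snippet, not part of the committed code):

import hashlib
with open("toInfer.pt", "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()
print(digest == "6d4b13307f2768c5c0811dcaae1c1f2f04a7066ee9f723fcca16ba56fa5e7f45")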