Spaces:

raja5259
/

EraV2S19_Raj

Sleeping

App Files Files Community

raja5259 commited on Jun 14, 2024

Commit

1030703

verified ·

1 Parent(s): a5bb12c

Upload 3 files

Browse files

Files changed (3) hide show

input.txt +0 -0
s19_cpu_gradio_2.py +181 -0
wt_25k.pth +3 -0

input.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

s19_cpu_gradio_2.py ADDED Viewed

	@@ -0,0 +1,181 @@

+# -*- coding: utf-8 -*-
+"""2_S19_cpu.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1laKI3fOJgsqMey3iQ5u3r8mhekJg-inM
+## Building a GPT
+Companion notebook to the [Zero To Hero](https://karpathy.ai/zero-to-hero.html) video on GPT.
+"""
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+def GPT(txt, max_new_tokens=100):
+  # hyperparameters
+  batch_size = 16 # how many independent sequences will we process in parallel?
+  block_size = 32 # what is the maximum context length for predictions?
+  max_iters = 5000
+  eval_interval = 100
+  learning_rate = 1e-3
+  device = 'cuda' if torch.cuda.is_available() else 'cpu'
+  eval_iters = 200
+  n_embd = 64
+  n_head = 4
+  n_layer = 4
+  dropout = 0.0
+  # ------------
+  torch.manual_seed(1337)
+  with open('input.txt', 'r', encoding='utf-8') as f:
+      text = f.read()
+  # here are all the unique characters that occur in this text
+  chars = sorted(list(set(text)))
+  vocab_size = len(chars)
+  # create a mapping from characters to integers
+  stoi = { ch:i for i,ch in enumerate(chars) }
+  itos = { i:ch for i,ch in enumerate(chars) }
+  encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
+  decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
+  class Head(nn.Module):
+      """ one head of self-attention """
+      def __init__(self, head_size):
+          super().__init__()
+          self.key = nn.Linear(n_embd, head_size, bias=False)
+          self.query = nn.Linear(n_embd, head_size, bias=False)
+          self.value = nn.Linear(n_embd, head_size, bias=False)
+          self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
+          self.dropout = nn.Dropout(dropout)
+      def forward(self, x):
+          B,T,C = x.shape
+          k = self.key(x)   # (B,T,C)
+          q = self.query(x) # (B,T,C)
+          # compute attention scores ("affinities")
+          wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
+          wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
+          wei = F.softmax(wei, dim=-1) # (B, T, T)
+          wei = self.dropout(wei)
+          # perform the weighted aggregation of the values
+          v = self.value(x) # (B,T,C)
+          out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
+          return out
+  class MultiHeadAttention(nn.Module):
+      """ multiple heads of self-attention in parallel """
+      def __init__(self, num_heads, head_size):
+          super().__init__()
+          self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
+          self.proj = nn.Linear(n_embd, n_embd)
+          self.dropout = nn.Dropout(dropout)
+      def forward(self, x):
+          out = torch.cat([h(x) for h in self.heads], dim=-1)
+          out = self.dropout(self.proj(out))
+          return out
+  class FeedFoward(nn.Module):
+      """ a simple linear layer followed by a non-linearity """
+      def __init__(self, n_embd):
+          super().__init__()
+          self.net = nn.Sequential(
+              nn.Linear(n_embd, 4 * n_embd),
+              nn.ReLU(),
+              nn.Linear(4 * n_embd, n_embd),
+              nn.Dropout(dropout),
+          )
+      def forward(self, x):
+          return self.net(x)
+  class Block(nn.Module):
+      """ Transformer block: communication followed by computation """
+      def __init__(self, n_embd, n_head):
+          # n_embd: embedding dimension, n_head: the number of heads we'd like
+          super().__init__()
+          head_size = n_embd // n_head
+          self.sa = MultiHeadAttention(n_head, head_size)
+          self.ffwd = FeedFoward(n_embd)
+          self.ln1 = nn.LayerNorm(n_embd)
+          self.ln2 = nn.LayerNorm(n_embd)
+      def forward(self, x):
+          x = x + self.sa(self.ln1(x))
+          x = x + self.ffwd(self.ln2(x))
+          return x
+  # super simple bigram model
+  class BigramLanguageModel(nn.Module):
+      def __init__(self):
+          super().__init__()
+          # each token directly reads off the logits for the next token from a lookup table
+          self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
+          self.position_embedding_table = nn.Embedding(block_size, n_embd)
+          self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
+          self.ln_f = nn.LayerNorm(n_embd) # final layer norm
+          self.lm_head = nn.Linear(n_embd, vocab_size)
+      def forward(self, idx, targets=None):
+          B, T = idx.shape
+          # idx and targets are both (B,T) tensor of integers
+          tok_emb = self.token_embedding_table(idx) # (B,T,C)
+          pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
+          x = tok_emb + pos_emb # (B,T,C)
+          x = self.blocks(x) # (B,T,C)
+          x = self.ln_f(x) # (B,T,C)
+          logits = self.lm_head(x) # (B,T,vocab_size)
+          if targets is None:
+              loss = None
+          else:
+              B, T, C = logits.shape
+              logits = logits.view(B*T, C)
+              targets = targets.view(B*T)
+              loss = F.cross_entropy(logits, targets)
+          return logits, loss
+      def generate(self, idx, max_new_tokens):
+          # idx is (B, T) array of indices in the current context
+          for _ in range(max_new_tokens):
+              # crop idx to the last block_size tokens
+              idx_cond = idx[:, -block_size:]
+              # get the predictions
+              logits, loss = self(idx_cond)
+              # focus only on the last time step
+              logits = logits[:, -1, :] # becomes (B, C)
+              # apply softmax to get probabilities
+              probs = F.softmax(logits, dim=-1) # (B, C)
+              # sample from the distribution
+              idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
+              # append sampled index to the running sequence
+              idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
+          return idx
+  model = BigramLanguageModel()
+  m = model.to(device)
+  # print the number of parameters in the model
+  print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')
+  # create a PyTorch optimizer
+  optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
+  m.load_state_dict(torch.load('wt_25k.pth', map_location=torch.device(device)))
+  context = torch.tensor(encode(txt), dtype=torch.long)
+  context = context.reshape(1, -1)
+  ret_val = decode(m.generate(context, max_new_tokens=max_new_tokens)[0].tolist())
+  return ret_val

wt_25k.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f31a4015218f7c3c099dec1ba88aa33b926d915ecbbe82d016172515f565825e
+size 938098