nullHawk committed
Commit 9fe7c42 · verified · 1 Parent(s): c3e30f9
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ output/
app.py ADDED
@@ -0,0 +1,45 @@
+ import torch
+ import gradio as gr
+ from model import CharacterLevelTokenizer, PotterGPT, Config
+
+ class GradioApp:
+     def __init__(self):
+         # Set up configuration and data
+         self.model_path = 'potterGPT/potterGPT.pth'
+         with open('data/harry_potter_data', 'r', encoding='utf-8') as f:
+             data = f.read()
+
+         self.tokenizer = CharacterLevelTokenizer(data)
+         self.lm = PotterGPT(Config)
+         state_dict = torch.load(self.model_path, map_location='cpu')
+         self.lm.load_state_dict(state_dict)
+
+     def launch(self):
+         # Define Gradio interface without a clear button
+         with gr.Blocks() as demo:
+             gr.Markdown("# potterGPT v0")
+             gr.Markdown("Click the button to generate a text sample from the potterGPT model.")
+
+             generate_button = gr.Button("Generate")
+             output_text = gr.Textbox(label="Generated Text")
+
+             generate_button.click(self.generate_text, inputs=None, outputs=output_text)
+
+         demo.launch()
+
+     def generate_text(self, input=None):
+         """Generate text using the trained model."""
+         generated_texts = []
+         for length in [1000]:
+             generated = self.lm.generate(
+                 torch.full((1, 1), 61, dtype=torch.long, device='cpu'),  # initial context: token 61 ('\n')
+                 total=length
+             )
+             generated = self.tokenizer.decode(generated[0].cpu().numpy())
+             text = f'generated ({length} tokens)\n{"="*50}\n{generated}\n{"="*50}\n\n'
+             generated_texts.append(text)
+         return generated_texts[0]
+
+ if __name__ == '__main__':
+     app = GradioApp()
+     app.launch()
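With the checkpoint and corpus in place, running python app.py launches the demo locally: the Generate button calls generate_text, which samples 1000 tokens starting from a newline token and fills the textbox.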
build_tokenizer.py ADDED
@@ -0,0 +1,22 @@
+ from pathlib import Path
+ from tokenizers import Tokenizer
+ from tokenizers.models import BPE
+ import tokenizers.pre_tokenizers as pre_tokenizers
+ import tokenizers.processors as processors
+ import tokenizers.decoders as decoders
+ from tokenizers.trainers import BpeTrainer
+
+ if __name__ == '__main__':
+
+     tokenizer_path = Path('tokenizer/')
+     tokenizer_path.mkdir(exist_ok=True)
+
+     tokenizer = Tokenizer(BPE())
+
+     tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
+     tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
+     tokenizer.decoder = decoders.ByteLevel()
+     trainer = BpeTrainer(special_tokens=['<|endoftext|>'], min_frequency=2)
+
+     tokenizer.train(['data/harry_potter_data'], trainer)
+     tokenizer.save(str(tokenizer_path / 'potter.json'))
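A quick round-trip check for the saved BPE tokenizer (a minimal sketch, assuming the script above has been run; the sample string is illustrative):

from tokenizers import Tokenizer

tok = Tokenizer.from_file('tokenizer/potter.json')
enc = tok.encode('Harry glanced at the castle.')
print(enc.tokens)           # learned BPE pieces
print(tok.decode(enc.ids))  # should reproduce the input string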
data/harry_potter_data ADDED
The diff for this file is too large to render. See raw diff
 
data/part1.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/part2.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/part3.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/part4.txt ADDED
The diff for this file is too large to render. See raw diff
 
inference.py ADDED
@@ -0,0 +1,30 @@
+ import torch
+ import os
+ from model import PotterGPT, Config, CharacterLevelTokenizer
+
+ model_path = 'potterGPT/potterGPT.pth'
+ with open('data/harry_potter_data', 'r', encoding='utf-8') as f:
+     data = f.read()
+
+ tokenizer = CharacterLevelTokenizer(data)
+
+ lm = PotterGPT(Config)
+ state_dict = torch.load(model_path, map_location='cpu')
+ lm.load_state_dict(state_dict)
+
+ generated_texts = []
+ for length in [1000]:
+     generated = lm.generate(
+         torch.full((1, 1), 61, dtype=torch.long, device='cpu'),  # initial context: token 61 ('\n')
+         total=length
+     )
+     generated = tokenizer.decode(generated[0].cpu().numpy())
+     text = f'generated ({length} tokens)\n{"="*50}\n{generated}\n{"="*50}\n\n'
+     generated_texts.append(text)
+
+ print(generated_texts[0])
+
+ os.makedirs('output', exist_ok=True)
+ with open('output/generated.txt', 'w+') as f:
+     for text in generated_texts:
+         f.write(text)
model/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .config import Config
+ from .transformer_block import TransformerBlock
+ from .feed_forward import FeedForward
+ from .multi_head_attention import MultiHeadAttention
+ from .single_attention_head import AttentionHead
+ from .model import PotterGPT
+ from .tokenizer import CharacterLevelTokenizer
model/config.py ADDED
@@ -0,0 +1,27 @@
+ import torch
+ from .tokenizer import CharacterLevelTokenizer
+ from dataclasses import dataclass
+
+ with open('data/harry_potter_data', 'r', encoding='utf-8') as f:
+     data = f.read()
+
+ @dataclass
+ class Config:
+     tokenizer = CharacterLevelTokenizer(data)
+     block_size = 256  # context length
+     batch_size = 64   # mini-batch size
+     vocab_size = tokenizer.VOCAB_SIZE
+     n_embed = 256
+     n_heads = 8
+     head_size = n_embed // n_heads  # 256 // 8 = 32 per head
+
+     n_layers = 3
+
+     train_iters = 10_000
+     val_iters = 1000
+     lr = 3e-4
+
+     attn_dropout = 0.1
+     block_dropout = 0.1
+
+     device = 'cuda' if torch.cuda.is_available() else 'cpu'
model/feed_forward.py ADDED
@@ -0,0 +1,14 @@
+ import torch
+ import torch.nn as nn
+
+ class FeedForward(nn.Module):
+     def __init__(self, Config):
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.Linear(Config.n_embed, Config.n_embed * 4),
+             nn.ReLU(),
+             nn.Linear(Config.n_embed * 4, Config.n_embed),  # projection
+             nn.Dropout(Config.block_dropout)
+         )
+     def forward(self, x):
+         return self.net(x)
model/model.py ADDED
@@ -0,0 +1,44 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from .transformer_block import TransformerBlock
+
+ class PotterGPT(nn.Module):
+     def __init__(self, Config):
+         super().__init__()
+
+         self.n_embed = Config.n_embed
+         self.block_size = Config.block_size
+
+         self.token_embedding_table = nn.Embedding(Config.vocab_size, self.n_embed)
+         self.pos_embedding_table = nn.Embedding(self.block_size, self.n_embed)
+
+         self.blocks = nn.Sequential(
+             *[TransformerBlock(Config) for _ in range(Config.n_layers)],  # distinct blocks; [block] * n would register one block n times and share its weights
+             nn.LayerNorm(self.n_embed)
+         )
+
+         self.lm_head = nn.Linear(self.n_embed, Config.vocab_size)
+
+     def forward(self, idx):
+
+         B, T = idx.shape
+
+         token_embs = self.token_embedding_table(idx)
+         pos_embs = self.pos_embedding_table(torch.arange(T, device=idx.device))  # follow the input's device so CPU inference works regardless of Config.device
+
+         x = token_embs + pos_embs
+         x = self.blocks(x)
+         logits = self.lm_head(x)
+
+         return logits
+
+     def generate(self, idx, total):
+         for _ in range(total):
+             idx_cond = idx[:, -self.block_size:]
+             logits = self(idx_cond)
+             logits = logits[:, -1, :]
+             probs = F.softmax(logits, dim=-1)
+             idx_next = torch.multinomial(probs, num_samples=1)
+             idx = torch.cat((idx, idx_next), dim=1)
+         return idx
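A quick way to gauge the model's size before loading the checkpoint (a minimal editorial sketch using the classes added above):

from model import PotterGPT, Config

lm = PotterGPT(Config)
n_params = sum(p.numel() for p in lm.parameters())
print(f'{n_params / 1e6:.2f}M parameters')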
model/multi_head_attention.py ADDED
@@ -0,0 +1,21 @@
+ import torch
+ import torch.nn as nn
+ from .single_attention_head import AttentionHead
+
+ class MultiHeadAttention(nn.Module):
+     def __init__(self, Config):
+         super().__init__()
+         self.n_heads = Config.n_heads
+         self.head_size = Config.head_size
+
+         self.heads = nn.ModuleList([AttentionHead(Config) for _ in range(self.n_heads)])
+
+         self.projection = nn.Linear(Config.n_embed, Config.n_embed)
+
+         self.dropout = nn.Dropout(Config.attn_dropout)
+
+     def forward(self, x):
+         x = torch.cat([h(x) for h in self.heads], dim=-1)
+         x = self.projection(x)
+         x = self.dropout(x)
+         return x
model/single_attention_head.py ADDED
@@ -0,0 +1,40 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class AttentionHead(nn.Module):
+     def __init__(self, Config):
+         super().__init__()
+
+         self.block_size = Config.block_size
+         self.n_embed = Config.n_embed
+         self.head_size = Config.head_size
+
+         self.key = nn.Linear(self.n_embed, self.head_size, bias=False)
+         self.query = nn.Linear(self.n_embed, self.head_size, bias=False)
+
+         self.value = nn.Linear(self.n_embed, self.head_size, bias=False)
+
+         self.register_buffer(
+             'tril',
+             torch.tril(torch.ones(self.block_size, self.block_size))
+         )
+
+         self.dropout = nn.Dropout(Config.attn_dropout)
+
+     def forward(self, x):
+
+         B, T, C = x.shape
+
+         k = self.key(x)
+         q = self.query(x)
+
+         wei = q @ k.transpose(-2, -1) * (self.head_size ** -0.5)  # scale by 1/sqrt(d_k); multiplying by sqrt(C) would blow up the logits
+         wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
+         wei = F.softmax(wei, dim=-1)
+         wei = self.dropout(wei)
+
+         v = self.value(x)
+         out = wei @ v
+
+         return out
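For reference, the masked softmax above matches PyTorch's fused attention kernel; a minimal equivalence check (a sketch, assuming PyTorch 2.0+, dropout disabled):

import torch
import torch.nn.functional as F

B, T, hs = 2, 16, 32
q, k, v = torch.randn(3, B, T, hs).unbind(0)

# manual scaled dot-product with a causal mask, as in AttentionHead.forward
wei = q @ k.transpose(-2, -1) * (hs ** -0.5)
wei = wei.masked_fill(torch.tril(torch.ones(T, T)) == 0, float('-inf'))
manual = F.softmax(wei, dim=-1) @ v

fused = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(torch.allclose(manual, fused, atol=1e-5))  # True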
model/tokenizer.py ADDED
@@ -0,0 +1,16 @@
+ import torch
+
+ class CharacterLevelTokenizer:
+     def __init__(self, data):
+         self.data = data
+         self.vocab = sorted(list(set(self.data)))
+         self.VOCAB_SIZE = len(self.vocab)
+
+         self.i_s = {i: s for i, s in enumerate(self.vocab)}  # id -> char
+         self.s_i = {s: i for i, s in self.i_s.items()}       # char -> id
+
+     def encode(self, s):
+         return torch.tensor([self.s_i[c] for c in s], dtype=torch.long)
+
+     def decode(self, s):
+         return ''.join([self.i_s[i.item()] for i in s])
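A round-trip check for the character tokenizer (a minimal sketch; the sample word is illustrative):

from model import CharacterLevelTokenizer

with open('data/harry_potter_data', 'r', encoding='utf-8') as f:
    data = f.read()

tok = CharacterLevelTokenizer(data)
ids = tok.encode('Harry')
print(ids)              # LongTensor of five character ids
print(tok.decode(ids))  # 'Harry'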
model/transformer_block.py ADDED
@@ -0,0 +1,18 @@
+ import torch
+ import torch.nn as nn
+ from .multi_head_attention import MultiHeadAttention
+ from .feed_forward import FeedForward
+
+ class TransformerBlock(nn.Module):
+     def __init__(self, Config):
+         super().__init__()
+         self.attn = MultiHeadAttention(Config)
+         self.ff = FeedForward(Config)
+         self.ln1 = nn.LayerNorm(Config.n_embed)
+         self.ln2 = nn.LayerNorm(Config.n_embed)
+
+     def forward(self, x):
+         x = x + self.attn(self.ln1(x))  # pre-norm residual: attention
+         x = x + self.ff(self.ln2(x))    # pre-norm residual: feed-forward
+
+         return x
potterGPT/potterGPT.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a6fe29f05742d58ac7ce20a36eea366e7022e23ef849dc5b130d185f0d36301a
+ size 5733550
tokenizer/potter.json ADDED
The diff for this file is too large to render. See raw diff
 
train.py ADDED
@@ -0,0 +1,129 @@
+ from model import CharacterLevelTokenizer, Config, PotterGPT
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from tqdm import tqdm
+ from pathlib import Path
+ from tokenizers import Tokenizer
+ import matplotlib.pyplot as plt
+
+ torch.manual_seed(1357)
+ with open('data/harry_potter_data', 'r', encoding='utf-8') as f:
+     data = f.read()
+
+ class Dataset:
+     def __init__(self, Config, is_test=False) -> None:
+         self.tokenizer = CharacterLevelTokenizer(data)
+         self.is_test = is_test
+         self.full_data = self.tokenizer.encode(self.tokenizer.data)
+         if self.is_test:
+             self.data = self.full_data[int(0.9 * len(self.full_data)):]
+         else:
+             self.data = self.full_data[:int(0.9 * len(self.full_data))]
+         self.block_size = Config.block_size
+         self.batch_size = Config.batch_size
+
+     def __len__(self) -> int:
+         return len(self.data)
+
+     def get_block_size(self) -> int:
+         return self.block_size
+
+     def get_vocab_size(self) -> int:
+         return self.tokenizer.VOCAB_SIZE
+
+     def get(self):
+         ix = torch.randint(len(self.data) - self.block_size, (self.batch_size,))
+         x = torch.stack([self.data[i:i+self.block_size] for i in ix])
+         y = torch.stack([self.data[i+1:i+self.block_size+1] for i in ix])
+         return x, y
+
+ # tokenizer = Tokenizer.from_file('tokenizer/potter.json')  # optional BPE alternative
+ tokenizer = CharacterLevelTokenizer(data)
+
+ # Training
+
+ train_ds = Dataset(Config)
+ val_ds = Dataset(Config, is_test=True)
+
+ lm = PotterGPT(Config)
+ lm = lm.to(device=Config.device)
+
+ optim = torch.optim.Adam(lm.parameters(), lr=Config.lr)
+
+ def loss_fn(logits, targets):
+     B, T, C = logits.shape
+     logits = logits.view(B*T, C)
+     targets = targets.view(B*T)
+     loss = F.cross_entropy(logits, targets)
+     return loss
+
+ def train_N_iters():
+     lm.train()
+     train_step_losses = []
+     for batch in tqdm(range(Config.train_iters)):
+         optim.zero_grad()
+         inputs, targets = train_ds.get()
+         inputs, targets = inputs.to(device=Config.device), targets.to(device=Config.device)
+         logits = lm(inputs)
+         loss = loss_fn(logits, targets)
+         loss.backward()
+         optim.step()
+         train_step_losses.append(loss.item())
+
+         if batch % (Config.train_iters // 10) == 0 or batch == Config.train_iters - 1:
+             print(f"batch {batch} train step loss: {loss.item()}")
+
+         del inputs, targets, loss, logits
+
+     return train_step_losses
+
+ @torch.no_grad()
+ def valid_N_iters():
+     lm.eval()
+     val_step_losses = []
+     for batch in tqdm(range(Config.val_iters)):
+         inputs, targets = val_ds.get()
+         inputs, targets = inputs.to(device=Config.device), targets.to(device=Config.device)
+         logits = lm(inputs)
+         loss = loss_fn(logits, targets)
+         val_step_losses.append(loss.item())
+
+         if batch % (Config.val_iters // 10) == 0 or batch == Config.val_iters - 1:
+             print(f"batch {batch} valid step loss: {loss.item()}")
+
+         del inputs, targets, loss, logits
+
+     return val_step_losses
+
+ def save_lm():
+     state_dict = lm.state_dict()
+     save_path = Path('./').resolve() / 'potterGPT'
+     save_path.mkdir(exist_ok=True)
+     model_path = save_path / 'potterGPT.pth'
+     torch.save(state_dict, model_path)
+
+ def train_lm():
+     train_losses = train_N_iters()
+     valid_losses = valid_N_iters()
+     save_lm()
+     return train_losses, valid_losses
+
+ tl, vl = train_lm()
+
+ plt.plot(tl, label='train loss', color='orange')
+ plt.plot(vl, label='valid loss', color='blue')
+ plt.title('Potter GPT Losses')
+ plt.legend()
+ plt.show()
+
+ generated_texts = []
+ for length in [100, 300, 500, 700, 1000]:
+     generated = lm.generate(
+         torch.zeros((1, 1), dtype=torch.long, device=Config.device),  # initial context: token 0
+         total=length
+     )
+     generated = tokenizer.decode(generated[0])
+     text = f'generated ({length} tokens)\n{"="*50}\n{generated}\n{"="*50}\n\n'
+     generated_texts.append(text)
+     print(text)
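One editorial sanity check on the reported losses: an untrained model should score near the entropy of a uniform distribution over the character vocabulary, ln(vocab_size), and fall from there during training (a minimal sketch):

import math
from model import Config

# step-0 cross-entropy should sit near this value for an untrained model
print(math.log(Config.vocab_size))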