add: v0
- .gitignore +2 -0
- app.py +45 -0
- build_tokenizer.py +22 -0
- data/harry_potter_data +0 -0
- data/part1.txt +0 -0
- data/part2.txt +0 -0
- data/part3.txt +0 -0
- data/part4.txt +0 -0
- inference.py +35 -0
- model/__init__.py +7 -0
- model/config.py +27 -0
- model/feed_forward.py +14 -0
- model/model.py +47 -0
- model/multi_head_attention.py +21 -0
- model/single_attention_head.py +40 -0
- model/tokenizer.py +16 -0
- model/transformer_block.py +18 -0
- potterGPT/potterGPT.pth +3 -0
- tokenizer/potter.json +0 -0
- train.py +129 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
__pycache__/
output/
app.py
ADDED
@@ -0,0 +1,45 @@
import torch
import gradio as gr
from model import CharacterLevelTokenizer, PotterGPT, Config

class GradioApp:
    def __init__(self):
        # Load the corpus (to rebuild the tokenizer) and the trained weights
        self.model_path = 'potterGPT/potterGPT.pth'
        with open('data/harry_potter_data', 'r', encoding='utf-8') as f:
            data = f.read()

        self.tokenizer = CharacterLevelTokenizer(data)
        self.lm = PotterGPT(Config)
        state_dict = torch.load(self.model_path, map_location='cpu')
        self.lm.load_state_dict(state_dict)

    def launch(self):
        # Define the Gradio interface (no clear button needed)
        with gr.Blocks() as demo:
            gr.Markdown("# potterGPT v0")
            gr.Markdown("Click the button to generate text with the potterGPT model.")

            generate_button = gr.Button("Generate")
            output_text = gr.Textbox(label="Generated Text")

            generate_button.click(self.generate_text, inputs=None, outputs=output_text)

        demo.launch()

    def generate_text(self, _=None):
        """Generate text using the trained model."""
        generated_texts = []
        for length in [1000]:
            generated = self.lm.generate(
                torch.zeros((1, 1), dtype=torch.long, device='cpu') + 61,  # initial context: token 61 ('\n' in this character vocab)
                total=length
            )
            generated = self.tokenizer.decode(generated[0].cpu().numpy())
            text = f'generated ({length} tokens)\n{"="*50}\n{generated}\n{"="*50}\n\n'
            generated_texts.append(text)
        return generated_texts[0]

if __name__ == '__main__':
    app = GradioApp()
    app.launch()
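Note: the checkpoint is loaded but the module stays in training mode, so dropout is still active while sampling. A minimal follow-up after load_state_dict (a sketch, not part of this commit) would be:

self.lm.load_state_dict(state_dict)
self.lm.eval()  # switch off the attention/block dropout during generation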
build_tokenizer.py
ADDED
@@ -0,0 +1,22 @@
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import BPE
import tokenizers.pre_tokenizers as pre_tokenizers
import tokenizers.processors as processors
import tokenizers.decoders as decoders
from tokenizers.trainers import BpeTrainer

if __name__ == '__main__':

    tokenizer_path = Path('tokenizer/')
    tokenizer_path.mkdir(exist_ok=True)

    tokenizer = Tokenizer(BPE())

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
    tokenizer.decoder = decoders.ByteLevel()
    trainer = BpeTrainer(special_tokens=['<|endoftext|>'], min_frequency=2)

    tokenizer.train(['data/harry_potter_data'], trainer)
    tokenizer.save(str(tokenizer_path / 'potter.json'))
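Note: the trained BPE tokenizer is saved but never consumed elsewhere in this commit; the model runs on CharacterLevelTokenizer instead. Loading it back for a round-trip check would look like this (a sketch; the sample string is arbitrary):

from tokenizers import Tokenizer

tok = Tokenizer.from_file('tokenizer/potter.json')
enc = tok.encode('Harry looked at the map.')
print(enc.ids)              # BPE token ids
print(tok.decode(enc.ids))  # round-trips back to the original text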
data/harry_potter_data
ADDED
The diff for this file is too large to render.
data/part1.txt
ADDED
The diff for this file is too large to render.
data/part2.txt
ADDED
The diff for this file is too large to render.
data/part3.txt
ADDED
The diff for this file is too large to render.
data/part4.txt
ADDED
The diff for this file is too large to render.
inference.py
ADDED
@@ -0,0 +1,35 @@
import torch
import os
from model import PotterGPT, Config, CharacterLevelTokenizer
from tokenizers import Tokenizer
from dataclasses import dataclass

model_path = 'potterGPT/potterGPT.pth'
with open('data/harry_potter_data', 'r', encoding='utf-8') as f:
    data = f.read()


tokenizer = CharacterLevelTokenizer(data)



lm = PotterGPT(Config)
state_dict = torch.load(model_path, map_location='cpu')
lm.load_state_dict(state_dict)

generated_texts = []
for length in [1000]:
    generated = lm.generate(
        torch.zeros((1, 1), dtype=torch.long, device='cpu') + 61,  # initial context: token 61 ('\n' in this character vocab)
        total=length
    )
    generated = tokenizer.decode(generated[0].cpu().numpy())
    text = f'generated ({length} tokens)\n{"="*50}\n{generated}\n{"="*50}\n\n'
    generated_texts.append(text)

print(generated_texts[0])

os.makedirs('output', exist_ok=True)
with open('output/generated.txt', 'w+') as f:
    for text in generated_texts:
        f.write(text)
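Note: sampling goes through torch.multinomial, so each run prints a different continuation. Seeding the RNG before the loop makes runs repeatable (a sketch; any fixed seed works):

torch.manual_seed(1357)  # the same seed train.py uses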
model/__init__.py
ADDED
@@ -0,0 +1,7 @@
from .config import Config
from .transformer_block import TransformerBlock
from .feed_forward import FeedForward
from .multi_head_attention import MultiHeadAttention
from .single_attention_head import AttentionHead
from .model import PotterGPT
from .tokenizer import CharacterLevelTokenizer
model/config.py
ADDED
@@ -0,0 +1,27 @@
import torch
from .tokenizer import CharacterLevelTokenizer
from dataclasses import dataclass

with open('data/harry_potter_data', 'r', encoding='utf-8') as f:
    data = f.read()

@dataclass
class Config:
    tokenizer = CharacterLevelTokenizer(data)
    block_size = 256  # context length
    batch_size = 64   # mini-batch size
    vocab_size = tokenizer.VOCAB_SIZE
    n_embed = 256
    n_heads = 8
    head_size = n_embed // n_heads  # 256 // 8 = 32

    n_layers = 3

    train_iters = 10_000
    val_iters = 1000
    lr = 3e-4

    attn_dropout = 0.1
    block_dropout = 0.1

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
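Note: none of these attributes carry type annotations, so @dataclass registers no fields here; Config acts as a plain namespace and is passed around as the class itself (e.g. PotterGPT(Config)), never instantiated. Values are read straight off the class object:

print(Config.n_embed, Config.head_size)  # 256 32
print(Config.device)                     # 'cuda' or 'cpu'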
model/feed_forward.py
ADDED
@@ -0,0 +1,14 @@
import torch
import torch.nn as nn

class FeedForward(nn.Module):
    def __init__(self, Config):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(Config.n_embed, Config.n_embed * 4),
            nn.ReLU(),
            nn.Linear(Config.n_embed * 4, Config.n_embed),  # projection
            nn.Dropout(Config.block_dropout)
        )
    def forward(self, x):
        return self.net(x)
model/model.py
ADDED
@@ -0,0 +1,47 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from .transformer_block import TransformerBlock
from .config import Config

class PotterGPT(nn.Module):
    def __init__(self, Config):
        super().__init__()

        self.n_embed = Config.n_embed
        self.block_size = Config.block_size

        self.token_embedding_table = nn.Embedding(Config.vocab_size, self.n_embed)
        self.pos_embedding_table = nn.Embedding(self.block_size, self.n_embed)

        self.blocks = nn.Sequential(
            *[TransformerBlock(Config) for _ in range(Config.n_layers)],  # independent blocks (repeating one instance would share weights)
            nn.LayerNorm(self.n_embed)
        )

        self.lm_head = nn.Linear(self.n_embed, Config.vocab_size)

    def forward(self, idx):

        B, T = idx.shape

        token_embs = self.token_embedding_table(idx)
        pos_embs = self.pos_embedding_table(torch.arange(T, device=Config.device))


        x = token_embs + pos_embs
        x = self.blocks(x)
        logits = self.lm_head(x)

        return logits


    def generate(self, idx, total):
        for _ in range(total):
            idx_cond = idx[:, -self.block_size:]  # crop to the last block_size tokens
            logits = self(idx_cond)
            logits = logits[:, -1, :]             # keep only the final position
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
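As a quick capacity check, the usual parameter-count idiom applies (a sketch, run from the repo root):

from model import PotterGPT, Config

lm = PotterGPT(Config)
n_params = sum(p.numel() for p in lm.parameters())
print(f'{n_params / 1e6:.2f}M parameters')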
model/multi_head_attention.py
ADDED
@@ -0,0 +1,21 @@
import torch
import torch.nn as nn
from .single_attention_head import AttentionHead

class MultiHeadAttention(nn.Module):
    def __init__(self, Config):
        super().__init__()
        self.n_heads = Config.n_heads
        self.head_size = Config.head_size

        self.heads = nn.ModuleList([AttentionHead(Config) for _ in range(self.n_heads)])

        self.projection = nn.Linear(Config.n_embed, Config.n_embed)

        self.dropout = nn.Dropout(Config.attn_dropout)

    def forward(self, x):
        x = torch.cat([h(x) for h in self.heads], dim=-1)  # concat heads: (B, T, n_heads * head_size)
        x = self.projection(x)
        x = self.dropout(x)
        return x
model/single_attention_head.py
ADDED
@@ -0,0 +1,40 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

class AttentionHead(nn.Module):
    def __init__(self, Config):
        super().__init__()

        self.block_size = Config.block_size
        self.n_embed = Config.n_embed
        self.head_size = Config.head_size

        self.key = nn.Linear(self.n_embed, self.head_size, bias=False)
        self.query = nn.Linear(self.n_embed, self.head_size, bias=False)

        self.value = nn.Linear(self.n_embed, self.head_size, bias=False)

        self.register_buffer(
            'tril',
            torch.tril(torch.ones(self.block_size, self.block_size))  # causal mask
        )

        self.dropout = nn.Dropout(Config.attn_dropout)

    def forward(self, x):

        B, T, C = x.shape

        k = self.key(x)
        q = self.query(x)

        wei = q @ k.transpose(-2, -1) * (C ** 0.5)  # NB: the usual convention is head_size ** -0.5; the shipped checkpoint was trained with this sqrt(C) scale
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        v = self.value(x)
        out = wei @ v

        return out
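For reference, the mask/softmax/dropout sequence above is the pattern PyTorch 2.x fuses into a single call; an equivalent forward pass (modulo the non-standard sqrt(C) scale noted in the comment) could be sketched as:

# inside forward(), with q, k, v of shape (B, T, head_size):
out = F.scaled_dot_product_attention(
    q, k, v,
    dropout_p=0.1,   # Config.attn_dropout
    is_causal=True,  # builds the lower-triangular mask internally
)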
model/tokenizer.py
ADDED
@@ -0,0 +1,16 @@
import torch

class CharacterLevelTokenizer:
    def __init__(self, data):
        self.data = data
        self.vocab = sorted(list(set(self.data)))  # every distinct character in the corpus
        self.VOCAB_SIZE = len(self.vocab)

        self.i_s = {i: s for i, s in enumerate(self.vocab)}  # id -> char
        self.s_i = {s: i for i, s in self.i_s.items()}       # char -> id

    def encode(self, s):
        return torch.tensor([self.s_i[c] for c in s], dtype=torch.long)

    def decode(self, s):
        return ''.join([self.i_s[i.item()] for i in s])
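A quick round-trip shows the contract between encode and decode (a sketch; the exact ids depend on the corpus):

tok = CharacterLevelTokenizer('hello world')
ids = tok.encode('hello')  # one id per character, as a long tensor
print(tok.decode(ids))     # -> 'hello'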
model/transformer_block.py
ADDED
@@ -0,0 +1,18 @@
import torch
import torch.nn as nn
from .multi_head_attention import MultiHeadAttention
from .feed_forward import FeedForward

class TransformerBlock(nn.Module):
    def __init__(self, Config):
        super().__init__()
        self.attn = MultiHeadAttention(Config)
        self.ff = FeedForward(Config)
        self.ln1 = nn.LayerNorm(Config.n_embed)
        self.ln2 = nn.LayerNorm(Config.n_embed)

    def forward(self, x):
        x = x + self.attn(self.ln1(x))  # pre-norm residual: attention
        x = x + self.ff(self.ln2(x))    # pre-norm residual: feed-forward

        return x
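Each block maps a (B, T, n_embed) tensor to the same shape, which is what lets model.py stack them inside nn.Sequential; a minimal smoke test (a sketch, not part of the commit):

import torch
from model import TransformerBlock, Config

block = TransformerBlock(Config)
x = torch.randn(4, Config.block_size, Config.n_embed)
assert block(x).shape == x.shape  # (4, 256, 256) in, same shape out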
potterGPT/potterGPT.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a6fe29f05742d58ac7ce20a36eea366e7022e23ef849dc5b130d185f0d36301a
size 5733550
tokenizer/potter.json
ADDED
The diff for this file is too large to render.
train.py
ADDED
@@ -0,0 +1,129 @@
from model import CharacterLevelTokenizer, Config, PotterGPT
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from pathlib import Path
from tokenizers import Tokenizer
import matplotlib.pyplot as plt

torch.manual_seed(1357)
with open('data/harry_potter_data', 'r', encoding='utf-8') as f:
    data = f.read()

class Dataset:
    def __init__(self, Config, is_test=False) -> None:
        self.tokenizer = CharacterLevelTokenizer(data)
        self.is_test = is_test
        self.full_data = self.tokenizer.encode(self.tokenizer.data)
        if self.is_test:
            self.data = self.full_data[int(0.9*len(self.full_data)):]  # last 10% for validation
        else:
            self.data = self.full_data[:int(0.9*len(self.full_data))]  # first 90% for training
        self.block_size = Config.block_size
        self.batch_size = Config.batch_size

    def __len__(self) -> int:
        return len(self.data)

    def get_block_size(self) -> int:
        return self.block_size

    def get_vocab_size(self) -> int:
        return self.tokenizer.VOCAB_SIZE

    def get(self):
        ix = torch.randint(len(self.data) - self.block_size, (self.batch_size,))
        x = torch.stack([self.data[i:i+self.block_size] for i in ix])
        y = torch.stack([self.data[i+1:i+self.block_size+1] for i in ix])  # targets are inputs shifted by one
        return x, y

# tokenizer = Tokenizer.from_file('tokenizer/potter.json')
tokenizer = CharacterLevelTokenizer(data)

# Training

train_ds = Dataset(Config)
val_ds = Dataset(Config, is_test=True)

lm = PotterGPT(Config)
lm = lm.to(device=Config.device)

optim = torch.optim.Adam(lm.parameters(), lr=Config.lr)

def loss_fn(logits, targets):
    B, T, C = logits.shape
    logits = logits.view(B*T, C)
    targets = targets.view(B*T)
    loss = F.cross_entropy(logits, targets)
    return loss

def train_N_iters():
    lm.train()
    train_step_losses = []
    for batch in tqdm(range(Config.train_iters)):
        optim.zero_grad()
        inputs, targets = train_ds.get()
        inputs, targets = inputs.to(device=Config.device), targets.to(device=Config.device)
        logits = lm(inputs)
        loss = loss_fn(logits, targets)
        loss.backward()
        optim.step()
        train_step_losses.append(loss.item())

        if batch % (Config.train_iters//10) == 0 or batch == Config.train_iters-1:
            print(f"batch {batch} train step loss: {loss.item()}")

        del inputs, targets, loss, logits

    return train_step_losses

@torch.no_grad()
def valid_N_iters():
    lm.eval()
    val_step_losses = []
    for batch in tqdm(range(Config.val_iters)):
        inputs, targets = val_ds.get()
        inputs, targets = inputs.to(device=Config.device), targets.to(device=Config.device)
        logits = lm(inputs)
        loss = loss_fn(logits, targets)
        val_step_losses.append(loss.item())

        if batch % (Config.val_iters//10) == 0 or batch == Config.val_iters-1:
            print(f"batch {batch} valid step loss: {loss.item()}")

        del inputs, targets, loss, logits

    return val_step_losses

def save_lm():
    state_dict = lm.state_dict()
    save_path = Path('./').resolve() / 'potterGPT'
    save_path.mkdir(exist_ok=True)
    model_path = save_path / 'potterGPT.pth'
    torch.save(state_dict, model_path)

def train_lm():
    train_losses = train_N_iters()
    valid_losses = valid_N_iters()
    save_lm()
    return train_losses, valid_losses

tl, vl = train_lm()

plt.plot(tl, label='train loss', color='orange')
plt.plot(vl, label='valid loss', color='blue')
plt.title('Potter GPT Losses')
plt.legend()
plt.show()

generated_texts = []
for length in [100, 300, 500, 700, 1000]:
    generated = lm.generate(
        torch.zeros((1, 1), dtype=torch.long, device=Config.device),  # initial context: token 0
        total=length
    )
    generated = tokenizer.decode(generated[0])
    text = f'generated ({length} tokens)\n{"="*50}\n{generated}\n{"="*50}\n\n'
    generated_texts.append(text)
    print(text)
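Alongside the plot, a short numeric summary of the two curves is often handy (a sketch, assuming it runs right after train_lm() returns):

print(f'mean of last 100 train steps: {sum(tl[-100:]) / 100:.4f}')
print(f'mean val loss over {len(vl)} batches: {sum(vl) / len(vl):.4f}')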