Commit b89d38f
1 Parent(s): facc855

Upload BabyLangModel with model card and config

- README.md +104 -1
- config.json +12 -0
- model.py +133 -0
- pytorch_model.bin +3 -0
README.md CHANGED
@@ -1,3 +1,106 @@

# 🍼 BabyLangModel

A tiny GPT-style language model trained from scratch on the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) dataset. Built with PyTorch and a custom architecture inspired by [nanoGPT](https://github.com/karpathy/nanoGPT), it was trained for 200k iterations on a consumer GPU (RTX 4060) using custom training code.

---

## 🧠 Model Details

- **Architecture**: GPT (custom implementation)
- **Parameters**: ~10–15M
- **Layers**: 6
- **Heads**: 6
- **Embedding Size**: 384
- **Block Size**: 128
- **Tokenizer**: GPT-2 (`tiktoken`)
- **Training Steps**: 200,000
- **Training Loss**: ~1.80

---

## 📚 Training Data

We trained on the open-source **[TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories)** dataset from Microsoft Research: short, simple English stories written for young children (ages 2–4).

- Clean, simple narratives
- Well suited to small-model generalization
- 100% open and publicly available

---

## 🧰 Usage (with `transformers`)

This model uses a **custom architecture**, so you need to pass `trust_remote_code=True`:

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("Exquisique/BabyLangModel", trust_remote_code=True)
```

---

## ✨ Sample Generation

```text
Prompt: Once upon a time there was a tiny robot who

Output: ...lived in a far away home. One day, a little girl named Lily decided to go on a special trip in the forest. She walked and walked until she got there but suddenly she started to go. Her mom called her and said, "Don't worry, Lily. We will get you my special ride."
```

> 🗣️ Still improving, but quite readable and story-like after 200k iterations!

---

## 💻 Train It Yourself

You can find the full training code on [GitHub](https://github.com/Exquisique/Babymodel); it follows this structure:

```bash
python -m src.tokenizer   # Tokenize TinyStories
python -m src.train       # Train the model from scratch
python -m src.generate    # Generate text
```

You'll also find:
- Checkpointing & resume support
- Configurable hyperparameters
- Gradient accumulation & mixed precision (sketched below)
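
The training loop itself isn't part of this commit, but as a rough illustration of those last two bullets, here is a minimal sketch of one gradient-accumulation cycle with PyTorch mixed precision. `model` is the `GPT` from `model.py`; `optimizer`, `get_batch`, and `accum_steps` are placeholders, not the repo's actual API:

```python
import torch

scaler = torch.cuda.amp.GradScaler()   # scales the loss so fp16 gradients don't underflow
accum_steps = 4                        # placeholder: micro-batches per optimizer step

optimizer.zero_grad(set_to_none=True)
for micro_step in range(accum_steps):
    x, y = get_batch("train")          # placeholder: (B, T) tensors of token ids
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        logits, loss = model(x, y)     # model.py returns (logits, loss) when targets are given
    scaler.scale(loss / accum_steps).backward()   # accumulate averaged gradients
scaler.step(optimizer)                 # unscale gradients and apply the update
scaler.update()
```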

---

## 🔧 Config Used

```json
{
  "vocab_size": 50257,
  "block_size": 128,
  "n_layer": 6,
  "n_head": 6,
  "n_embd": 384,
  "model_type": "gpt"
}
```
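
If you instantiate the standalone `model.py` directly (outside `transformers`), note that it also reads `dropout` and `bias` attributes that `config.json` does not list. A minimal sketch, assuming a hypothetical `GPTConfig` dataclass (not part of this commit) with guessed values for those two fields; the rough parameter count it prints suggests the "~10–15M" figure above refers to the non-embedding parameters:

```python
from dataclasses import dataclass
from model import GPT   # model.py from this repo

@dataclass
class GPTConfig:          # hypothetical helper mirroring config.json
    vocab_size: int = 50257
    block_size: int = 128
    n_layer: int = 6
    n_head: int = 6
    n_embd: int = 384
    dropout: float = 0.0  # assumed: required by model.py, absent from config.json
    bias: bool = True     # assumed: required by model.py, absent from config.json

model = GPT(GPTConfig())
total = sum(p.numel() for p in model.parameters())
non_embedding = total - model.transformer.wte.weight.numel()  # token embedding is tied to the LM head
print(f"total: {total/1e6:.1f}M  non-embedding: {non_embedding/1e6:.1f}M")  # ~30.0M / ~10.7M
```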

---

## 📦 Inference Notes

To load the model, use:

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("Exquisique/BabyLangModel", trust_remote_code=True)
```

You can also upload a tokenizer later for full text input support (e.g. with `tiktoken`).
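
Until then, a minimal sketch of text-in/text-out generation using `tiktoken`'s GPT-2 encoding directly, assuming `model` was loaded as above and exposes the `generate` method defined in `model.py` (the sampling parameters here are arbitrary):

```python
import tiktoken
import torch

enc = tiktoken.get_encoding("gpt2")   # same GPT-2 BPE used during training
prompt = "Once upon a time there was a tiny robot who"
idx = torch.tensor([enc.encode(prompt)], dtype=torch.long)

out = model.generate(idx, max_new_tokens=100, temperature=0.8, top_k=50)
print(enc.decode(out[0].tolist()))
```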

---

## 🧑‍💻 Author
**Exquisique** — GenAI explorer, poetic dreamer, and neural model whisperer.

---

## 📜 License
MIT — open source, fine-tune and remix freely. ✨
config.json ADDED
@@ -0,0 +1,12 @@

{
  "model_type": "gpt",
  "architectures": ["GPT"],
  "vocab_size": 50257,
  "block_size": 128,
  "n_layer": 6,
  "n_head": 6,
  "n_embd": 384,
  "auto_map": {
    "AutoModel": "model.GPT"
  }
}
model.py ADDED
@@ -0,0 +1,133 @@

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class LayerNorm(nn.Module):
    # LayerNorm with an optional bias term
    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, x):
        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)

class CausalSelfAttention(nn.Module):
    # Multi-head causal self-attention; uses PyTorch's fused SDPA kernel when available
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.flash = hasattr(F, 'scaled_dot_product_attention')
        if not self.flash:
            # fall back to an explicit causal mask
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        if self.flash:
            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.attn_dropout.p if self.training else 0.0, is_causal=True)
        else:
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v

        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module):
    # Position-wise feed-forward network with a 4x hidden expansion
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))

class Block(nn.Module):
    # Pre-norm transformer block: attention and MLP, each with a residual connection
    def __init__(self, config):
        super().__init__()
        self.ln1 = LayerNorm(config.n_embd, config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln2 = LayerNorm(config.n_embd, config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            wpe=nn.Embedding(config.block_size, config.n_embd),
            drop=nn.Dropout(config.dropout),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=LayerNorm(config.n_embd, config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # weight tying: token embedding and LM head share parameters
        self.transformer.wte.weight = self.lm_head.weight

        self.apply(self._init_weights)
        # scaled init for residual projections, following GPT-2
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer))

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # training: compute logits for every position and the cross-entropy loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
            return logits, loss
        else:
            # inference: only the logits for the last position are needed
            logits = self.lm_head(x[:, [-1], :])
            return logits, None

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        # autoregressive sampling with temperature and optional top-k filtering
        for _ in range(max_new_tokens):
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@

version https://git-lfs.github.com/spec/v1
oid sha256:04062f6b9a95597ed396310517437052b03f9de17bf352e405851bea2553bd70
size 360042363