pt-sk committed · verified
Commit 00c2cf7 · 1 Parent(s): 4108043
1/GPT2-small_finetune/.gitignore DELETED
@@ -1,162 +0,0 @@
- # Byte-compiled / optimized / DLL files
- __pycache__/
- *.py[cod]
- *$py.class
-
- # C extensions
- *.so
-
- # Distribution / packaging
- .Python
- build/
- develop-eggs/
- dist/
- downloads/
- eggs/
- .eggs/
- lib/
- lib64/
- parts/
- sdist/
- var/
- wheels/
- share/python-wheels/
- *.egg-info/
- .installed.cfg
- *.egg
- MANIFEST
-
- # PyInstaller
- # Usually these files are written by a python script from a template
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
- *.manifest
- *.spec
-
- # Installer logs
- pip-log.txt
- pip-delete-this-directory.txt
-
- # Unit test / coverage reports
- htmlcov/
- .tox/
- .nox/
- .coverage
- .coverage.*
- .cache
- nosetests.xml
- coverage.xml
- *.cover
- *.py,cover
- .hypothesis/
- .pytest_cache/
- cover/
-
- # Translations
- *.mo
- *.pot
-
- # Django stuff:
- *.log
- local_settings.py
- db.sqlite3
- db.sqlite3-journal
-
- # Flask stuff:
- instance/
- .webassets-cache
-
- # Scrapy stuff:
- .scrapy
-
- # Sphinx documentation
- docs/_build/
-
- # PyBuilder
- .pybuilder/
- target/
-
- # Jupyter Notebook
- .ipynb_checkpoints
-
- # IPython
- profile_default/
- ipython_config.py
-
- # pyenv
- # For a library or package, you might want to ignore these files since the code is
- # intended to run in multiple environments; otherwise, check them in:
- # .python-version
-
- # pipenv
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
- # install all needed dependencies.
- #Pipfile.lock
-
- # poetry
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
- # This is especially recommended for binary packages to ensure reproducibility, and is more
- # commonly ignored for libraries.
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
- #poetry.lock
-
- # pdm
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
- #pdm.lock
- # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
- # in version control.
- # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
- .pdm.toml
- .pdm-python
- .pdm-build/
-
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
- __pypackages__/
-
- # Celery stuff
- celerybeat-schedule
- celerybeat.pid
-
- # SageMath parsed files
- *.sage.py
-
- # Environments
- .env
- .venv
- env/
- venv/
- ENV/
- env.bak/
- venv.bak/
-
- # Spyder project settings
- .spyderproject
- .spyproject
-
- # Rope project settings
- .ropeproject
-
- # mkdocs documentation
- /site
-
- # mypy
- .mypy_cache/
- .dmypy.json
- dmypy.json
-
- # Pyre type checker
- .pyre/
-
- # pytype static type analyzer
- .pytype/
-
- # Cython debug symbols
- cython_debug/
-
- # PyCharm
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
- # and can be added to the global gitignore or merged into this file. For a more nuclear
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
- #.idea/
 
1/LICENSE DELETED
@@ -1,21 +0,0 @@
- MIT License
-
- Copyright (c) 2024 Sathish Kumar R
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
 
1/README.md DELETED
@@ -1,2 +0,0 @@
- # GPT2-small_finetune
- FineTuning GPT2-small
 
 
 
1/__pycache__/dataset.cpython-310.pyc DELETED
Binary file (2.34 kB)
 
1/__pycache__/model.cpython-310.pyc DELETED
Binary file (6.5 kB)
 
1/checkpoint.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:537fb7db96118d1cc2cd241a035b9e5ca1f991f119d3b5bad80c853e1e43737b
- size 746821134
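This LFS pointer tracks the dict written by train.py (model_state_dict plus optimizer_state_dict), saved from the DDP-wrapped model, so parameter keys should carry a "module." prefix. A minimal loading sketch, assuming the checkpoint and gin config sit in the working directory and the config has not been edited since training:

import gin
import torch
from model import GPT, GPTConfig

gin.parse_config_file("config/gpt2-small.gin")   # fills the GPTConfig fields via gin

ckpt = torch.load("checkpoint.pth", map_location="cpu")
# strip the DDP wrapper prefix ("module.") before loading into a bare GPT
state_dict = {k.removeprefix("module."): v for k, v in ckpt["model_state_dict"].items()}

model = GPT(GPTConfig())
model.load_state_dict(state_dict)
model.eval()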
 
 
 
 
1/config/gpt2-small.gin DELETED
@@ -1,16 +0,0 @@
- GPTConfig.block_size = 1024
- GPTConfig.vocab_size = 50257
- GPTConfig.n_layer = 12
- GPTConfig.n_head = 12
- GPTConfig.n_embd = 768
- GPTConfig.batch_size = 8
- GPTConfig.learning_rate = 6e-5
- GPTConfig.seed = 42
- GPTConfig.epochs = 2
- GPTConfig.weight_decay = 0.001
- GPTConfig.eps = 1e-8
- GPTConfig.betas = (0.9, 0.95)
- GPTConfig.training_backend = "nccl"
- GPTConfig.device = "cuda"
- GPTConfig.model_name = "gpt2"
- GPTConfig.clip_grad_norm_val = 1.0
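This gin file binds every GPTConfig field, which is what lets the rest of the code construct GPTConfig() with no arguments. A minimal sketch of how train.py and testing.ipynb consume it (paths as in the repo):

import gin
from model import GPTConfig

gin.parse_config_file("config/gpt2-small.gin")   # register the bindings above
config = GPTConfig()                             # all fields come from the gin file
print(config.block_size, config.learning_rate)   # 1024 6e-05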
 
1/conversation_tokens.npy DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:d6e45a35a853f6c564953289864aa042b155f6c7017bc2ec7f4b25511b521286
- size 10116720
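This LFS pointer tracks the tokenized conversation dump that train.py pulls from the pt-sk/chatgpt-dataset repo. A small sketch for fetching and inspecting it; the repo id and filename come from train.py, while local_dir="." is an assumption:

import numpy as np
from huggingface_hub import hf_hub_download

path = hf_hub_download(repo_id="pt-sk/chatgpt-dataset",
                       filename="conversation_tokens.npy",
                       repo_type="dataset",
                       local_dir=".")
tokens = np.load(path)
print(tokens.shape, tokens.dtype, tokens[:10])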
 
 
 
 
1/dataset.py DELETED
@@ -1,56 +0,0 @@
- from __future__ import annotations
- import torch
- from torch.utils.data import Dataset
- from model import GPTConfig
- from typing import Tuple, List
-
-
-
-
- class TokenDataset(Dataset):
-     def __init__(self, model_args: GPTConfig, input_ids: List) -> None:
-         """
-         Initializes the TokenDataset.
-
-         Args:
-             model_args: An instance of GPTConfig containing model configuration
-                 parameters, including the maximum sequence length.
-             input_ids: A tensor containing tokenized input data.
-
-         Attributes:
-             input_ids: Stores the tokenized input data.
-             block_size: The block size for dividing the input data, determined by
-                 the maximum sequence length in model_args.
-         """
-         self.input_ids = input_ids
-         self.block_size = model_args.block_size
-
-     def __len__(self) -> int:
-         """
-         Returns the number of blocks in the dataset.
-
-         Since the input_ids are divided into blocks of size block_size, the number of
-         blocks is calculated as the length of the input_ids minus one, divided by the
-         block size.
-         """
-         return (len(self.input_ids) - 1) // self.block_size
-
-     def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor]:
-         """
-         Returns a tuple of two tensors, x and y, where x is the input tensor slice
-         and y is the output tensor slice. The slices are of size block_size, and are
-         taken from the input_ids tensor at the given index.
-
-         Args:
-             idx: The index of the block to retrieve.
-
-         Returns:
-             A tuple of two tensors, x and y.
-         """
-         start_idx = idx * self.block_size
-         end_idx = start_idx + self.block_size
-
-         x = self.input_ids[start_idx:end_idx]
-         y = self.input_ids[start_idx+1:end_idx+1]
-
-         return torch.LongTensor(x), torch.LongTensor(y)
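TokenDataset simply chops a flat token stream into block_size chunks, with targets shifted by one position. A toy usage sketch, assuming the gin config is available; the fake token array and the batch size of 2 are made up for illustration:

import gin
import numpy as np
from torch.utils.data import DataLoader
from dataset import TokenDataset
from model import GPTConfig

gin.parse_config_file("config/gpt2-small.gin")
config = GPTConfig()

fake_tokens = np.arange(10 * config.block_size + 1)   # placeholder token ids
ds = TokenDataset(config, fake_tokens)
x, y = ds[0]
assert (x[1:] == y[:-1]).all()                        # targets are inputs shifted by one

loader = DataLoader(ds, batch_size=2, drop_last=True)
xb, yb = next(iter(loader))
print(xb.shape, yb.shape)                             # torch.Size([2, 1024]) each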
 
1/gradient_norms.npy DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a6a0923184ec6bc884e6b29aa4c20d3034aa94614799d3a85340c51bd3edee1f
- size 2592
 
 
 
 
1/model.py DELETED
@@ -1,218 +0,0 @@
- from __future__ import annotations
- from dataclasses import dataclass
- import torch
- import torch.nn as nn
- from torch.nn import functional as F
- import gin
- from typing import Tuple
-
-
- @gin.configurable
- @dataclass
- class GPTConfig:
-     block_size: int
-     vocab_size: int
-     n_layer: int
-     n_head: int
-     n_embd: int
-     batch_size: int
-     learning_rate: float
-     weight_decay: float
-     eps: float
-     betas: Tuple[float, float]
-     seed: int
-     epochs: int
-     training_backend: str
-     device: str
-     model_name: str
-     clip_grad_norm_val: float
-     dtype: torch.dtype = torch.bfloat16
-
-
- class CausalSelfAttention(nn.Module):
-
-     def __init__(self, config):
-         super().__init__()
-         assert config.n_embd % config.n_head == 0
-         # key, query, value projections for all heads, but in a batch
-         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
-         # output projection
-         self.c_proj = nn.Linear(config.n_embd, config.n_embd)
-         self.c_proj.NANOGPT_SCALE_INIT = 1
-         # regularization
-         self.n_head = config.n_head
-         self.n_embd = config.n_embd
-
-     def forward(self, x):
-         B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
-         # calculate query, key, values for all heads in batch and move head forward to be the batch dim
-         # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
-         # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
-         qkv = self.c_attn(x)
-         q, k, v = qkv.split(self.n_embd, dim=2)
-         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-         y = F.scaled_dot_product_attention(q, k, v, is_causal=True) # flash attention
-         y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
-         # output projection
-         y = self.c_proj(y)
-         return y
-
- class MLP(nn.Module):
-
-     def __init__(self, config):
-         super().__init__()
-         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
-         self.gelu = nn.GELU(approximate='tanh')
-         self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
-         self.c_proj.NANOGPT_SCALE_INIT = 1
-
-     def forward(self, x):
-         x = self.c_fc(x)
-         x = self.gelu(x)
-         x = self.c_proj(x)
-         return x
-
- class Block(nn.Module):
-
-     def __init__(self, config):
-         super().__init__()
-         self.ln_1 = nn.LayerNorm(config.n_embd)
-         self.attn = CausalSelfAttention(config)
-         self.ln_2 = nn.LayerNorm(config.n_embd)
-         self.mlp = MLP(config)
-
-     def forward(self, x):
-         x = x + self.attn(self.ln_1(x))
-         x = x + self.mlp(self.ln_2(x))
-         return x
-
- class GPT(nn.Module):
-
-     def __init__(self, config):
-         super().__init__()
-         self.config = config
-
-         self.transformer = nn.ModuleDict(dict(
-             wte = nn.Embedding(config.vocab_size, config.n_embd),
-             wpe = nn.Embedding(config.block_size, config.n_embd),
-             h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
-             ln_f = nn.LayerNorm(config.n_embd),
-         ))
-         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
-
-         # weight sharing scheme
-         self.transformer.wte.weight = self.lm_head.weight
-
-         # init params
-         self.apply(self._init_weights)
-
-     def _init_weights(self, module):
-         if isinstance(module, nn.Linear):
-             std = 0.02
-             if hasattr(module, 'NANOGPT_SCALE_INIT'):
-                 std *= (2 * self.config.n_layer) ** -0.5
-             torch.nn.init.normal_(module.weight, mean=0.0, std=std)
-             if module.bias is not None:
-                 torch.nn.init.zeros_(module.bias)
-         elif isinstance(module, nn.Embedding):
-             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-
-     def forward(self, idx, targets=None):
-         # idx is of shape (B, T)
-         B, T = idx.size()
-         assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
-         # forward the token and position embeddings
-         pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
-         pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
-         tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
-         x = tok_emb + pos_emb
-         # forward the blocks of the transformer
-         for block in self.transformer.h:
-             x = block(x)
-         # forward the final layernorm and the classifier
-         x = self.transformer.ln_f(x)
-         logits = self.lm_head(x) # (B, T, vocab_size)
-         loss = None
-         if targets is not None:
-             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
-         return logits, loss
-
-     @classmethod
-     def from_pretrained(cls, model_type):
-         """Loads pretrained GPT-2 model weights from huggingface"""
-         assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
-         from transformers import GPT2LMHeadModel
-         print("loading weights from pretrained gpt: %s" % model_type)
-
-         # n_layer, n_head and n_embd are determined from model_type
-         config_args = {
-             'gpt2':        dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
-             'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
-             'gpt2-large':  dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
-             'gpt2-xl':     dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
-         }[model_type]
-         config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
-         config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
-         # create a from-scratch initialized minGPT model
-         config = GPTConfig(**config_args)
-         model = GPT(config)
-         sd = model.state_dict()
-         sd_keys = sd.keys()
-         sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
-
-         # init a huggingface/transformers model
-         model_hf = GPT2LMHeadModel.from_pretrained(model_type)
-         sd_hf = model_hf.state_dict()
-
-         # copy while ensuring all of the parameters are aligned and match in names and shapes
-         sd_keys_hf = sd_hf.keys()
-         sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
-         sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
-         transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
-         # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
-         # this means that we have to transpose these weights when we import them
-         assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
-         for k in sd_keys_hf:
-             if any(k.endswith(w) for w in transposed):
-                 # special treatment for the Conv1D weights we need to transpose
-                 assert sd_hf[k].shape[::-1] == sd[k].shape
-                 with torch.no_grad():
-                     sd[k].copy_(sd_hf[k].t())
-             else:
-                 # vanilla copy over the other parameters
-                 assert sd_hf[k].shape == sd[k].shape
-                 with torch.no_grad():
-                     sd[k].copy_(sd_hf[k])
-
-         return model
-
-
- # import lightning as L
-
- # class GPT2Wrapper(L.LightningModule):
- #     def __init__(self, config: GPTConfig, model: GPT):
- #         super().__init__()
- #         self.config = config
- #         self.model = model
- #         self.optimizer = self.configure_optimizers()
- #         self.train_loss = []
-
- #     def forward(self, idx, targets=None):
- #         return self.model(idx, targets)
-
- #     def training_step(self, batch, batch_idx):
- #         self.model.train()
- #         optimizer = self.optimizers()
- #         optimizer.zero_grad()
-
- #         batch, label = batch
- #         _, loss = self.model(batch, label)
- #         self.log("Train_Loss", loss, prog_bar=True)
- #         self.train_loss.append(loss.item())
-
- #         return loss
-
- #     def configure_optimizers(self):
- #         return torch.optim.AdamW(self.model.parameters(), lr=self.config.learning_rate, betas=self.config.betas, eps=self.config.eps, weight_decay=self.config.weight_decay)
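Note that from_pretrained only passes the architecture fields to GPTConfig; the remaining training fields are supplied by the gin bindings, so the config file must be parsed first. A short smoke-test sketch; the dummy prompt of random token ids is an assumption, the model type comes from the config:

import gin
import torch
from model import GPT, GPTConfig

gin.parse_config_file("config/gpt2-small.gin")
config = GPTConfig()

model = GPT.from_pretrained(config.model_name)   # "gpt2"
model.eval()

idx = torch.randint(0, config.vocab_size, (1, 8))   # dummy prompt: 8 random token ids
with torch.no_grad():
    logits, loss = model(idx)                       # loss is None when no targets are given
print(logits.shape)                                 # torch.Size([1, 8, 50257])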
 
1/requirements.txt DELETED
@@ -1,4 +0,0 @@
- gin-config
- huggingface_hub
- # deepspeed
- # lightning
 
 
 
 
 
1/testing.ipynb DELETED
@@ -1,74 +0,0 @@
- {
-  "cells": [
-   {
-    "cell_type": "code",
-    "execution_count": 2,
-    "metadata": {},
-    "outputs": [
-     {
-      "name": "stderr",
-      "output_type": "stream",
-      "text": [
-       "c:\\Users\\sathi\\miniconda3\\envs\\dl\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-       " from .autonotebook import tqdm as notebook_tqdm\n"
-      ]
-     }
-    ],
-    "source": [
-     "from model import *"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 3,
-    "metadata": {},
-    "outputs": [
-     {
-      "data": {
-       "text/plain": [
-        "GPTConfig(block_size=1024, vocab_size=50257, n_layer=12, n_head=12, n_embd=768, batch_size=16, learning_rate=0.0003, weight_decay=0.001, eps=1e-08, betas=(0.9, 0.95), seed=42, epochs=1)"
-       ]
-      },
-      "execution_count": 3,
-      "metadata": {},
-      "output_type": "execute_result"
-     }
-    ],
-    "source": [
-     "gin.parse_config_file(\"config/gpt2-small.gin\")\n",
-     "config = GPTConfig()\n",
-     "config"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "model = GPT.from_pretrained()"
-    ]
-   }
-  ],
-  "metadata": {
-   "kernelspec": {
-    "display_name": "dl",
-    "language": "python",
-    "name": "python3"
-   },
-   "language_info": {
-    "codemirror_mode": {
-     "name": "ipython",
-     "version": 3
-    },
-    "file_extension": ".py",
-    "mimetype": "text/x-python",
-    "name": "python",
-    "nbconvert_exporter": "python",
-    "pygments_lexer": "ipython3",
-    "version": "3.11.9"
-   }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 2
- }
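One detail worth noting in this notebook: the last cell calls GPT.from_pretrained() with no argument, which would trip the assert on model_type in model.py. The intended call was presumably along these lines:

model = GPT.from_pretrained("gpt2")   # or GPT.from_pretrained(config.model_name)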
 
1/time_spent.npy DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:8d4823bd1dd822c435a723dd8f7bc7fc060dcdf76ce0fc7e023a79a4c076b742
- size 2592
 
 
 
 
1/train.py DELETED
@@ -1,127 +0,0 @@
- from __future__ import annotations
- import os
- import gin
- import numpy as np
- import torch
- import torch.optim as optim
- import torch.distributed as dist
- import torch.multiprocessing as mp
- from torch.utils.data import DataLoader, DistributedSampler
- from torch.nn.parallel import DistributedDataParallel as DDP
- from huggingface_hub import hf_hub_download
- from model import GPTConfig, GPT
- from dataset import TokenDataset
- from tqdm import tqdm
- import time
-
-
- hf_hub_download(repo_id="pt-sk/chatgpt-dataset", filename="conversation_tokens.npy", repo_type="dataset", local_dir="/kaggle/working")
- tokens = np.load("/kaggle/working/conversation_tokens.npy")
- print(f"Number of tokens: {len(tokens)}")
-
-
-
- gin.parse_config_file("config/gpt2-small.gin")
- config = GPTConfig()
-
-
- np.random.seed(config.seed)
- torch.manual_seed(config.seed)
- torch.cuda.manual_seed(config.seed)
- torch.cuda.manual_seed_all(config.seed)
-
-
-
-
- def trainer(rank, world_size):
-     # Initialize the Process Group
-     dist.init_process_group(backend=config.training_backend, rank=rank, world_size=world_size)
-
-     # Set the Device for the Current Process
-     torch.cuda.set_device(rank)
-     device = torch.device(config.device, rank)
-
-     # Define Model and Optimizer
-     model = GPT.from_pretrained(config.model_name)
-     model.to(config.dtype).to(device)
-     model = DDP(model, device_ids=[rank]) # Wrap model in DDP
-
-     # Define Optimizer
-     optimizer = optim.AdamW(model.parameters(), lr=config.learning_rate, betas=config.betas, eps=config.eps, weight_decay=config.weight_decay)
-
-     # Create DataLoader
-     dataset = TokenDataset(config, tokens)
-     # Use DistributedSampler to partition data among distributed processes
-     sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
-     # Use DataLoader to manage batches
-     dataloader = DataLoader(dataset, batch_size=config.batch_size, sampler=sampler, drop_last=True)
-
-
-     # Training Loop
-     model.train()
-     training_loss = []
-     gradient_norms = []
-     time_spent = []
-     for epoch in range(config.epochs): # Loop over the dataset multiple times
-         sampler.set_epoch(epoch) # Re-shuffle the distributed sampler's partition each epoch
-
-         for batch, (inputs, labels) in enumerate(dataloader):
-             start_time = time.time()
-             # Move data to device
-             inputs, labels = inputs.to(device), labels.to(device)
-
-             # Forward pass
-             _, loss = model(inputs, labels)
-
-             # Zero gradients before backward pass
-             optimizer.zero_grad()
-
-             # Backward pass
-             loss.backward()
-
-             # Gradient clipping
-             grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.clip_grad_norm_val)
-
-             # Update weights and biases
-             optimizer.step()
-
-             # Log training loss and gradient norms
-             training_loss.append(loss.item())
-             gradient_norms.append(grad_norm.item())
-             time_spent.append(time.time() - start_time)
-             if rank == 0:
-                 print(f"Epoch: {epoch}, Batch: {batch}, Loss: {loss.item()}, Gradient Norm: {grad_norm.item()}, Time Spent: {round(time_spent[-1], 2)} seconds")
-
-
-     # Log training loss and gradient norms
-     if rank == 0:
-         np.save("training_loss.npy", np.array(training_loss))
-         np.save("gradient_norms.npy", np.array(gradient_norms))
-         np.save("time_spent.npy", np.array(time_spent))
-
-         # Save the model and optimizer states for checkpointing
-         torch.save(
-             {
-                 "model_state_dict": model.state_dict(),
-                 "optimizer_state_dict": optimizer.state_dict(),
-             },
-             "checkpoint.pth",
-         )
-
-     # Cleanup
-     dist.destroy_process_group()
-
-
- def run_ddp_training():
-     world_size = torch.cuda.device_count() # Number of available GPUs
-     mp.spawn(trainer, args=(world_size,), nprocs=world_size, join=True)
-
-
- if __name__ == "__main__":
-
-     os.environ['MASTER_ADDR'] = 'localhost'
-     os.environ['MASTER_PORT'] = '12355'
-     os.environ['WORLD_SIZE'] = str(torch.cuda.device_count()) # Total number of GPUs on this node
-     os.environ['RANK'] = '0' # Rank 0 for a single-node setup
-
-     run_ddp_training()
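After a run, rank 0 leaves training_loss.npy, gradient_norms.npy and time_spent.npy next to checkpoint.pth. A minimal sketch for summarizing those logs; the throughput arithmetic assumes the per-GPU step of batch_size * block_size = 8 * 1024 tokens from the gin config and ignores the number of GPUs:

import numpy as np

loss = np.load("training_loss.npy")
grad = np.load("gradient_norms.npy")
secs = np.load("time_spent.npy")

tokens_per_step = 8 * 1024                    # batch_size * block_size per GPU
print(f"steps logged      : {len(loss)}")
print(f"final / mean loss : {loss[-1]:.4f} / {loss.mean():.4f}")
print(f"mean grad norm    : {grad.mean():.4f}")
print(f"tokens/sec per GPU: {tokens_per_step / secs.mean():.0f}")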
 
1/training_loss.npy DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7ab3c5e3013452e42bb5095281767cf6780f1bf1fe1bd6fc0707190d5c6478dc
- size 2592