thomwolf HF staff committed on
Commit
878bd55
·
1 Parent(s): f2c15d5
Files changed (5)
  1. dist/index.html +138 -76
  2. dist/main.bundle.js +0 -0
  3. dist/main.bundle.js.map +0 -0
  4. src/index.html +138 -76
  5. src/memory.js +158 -81
dist/index.html CHANGED
@@ -9,6 +9,26 @@
9
  <title>FineWeb: decanting the web for the finest text data at scale</title>
10
  <link rel="stylesheet" href="style.css">
11
  <style>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  #graph svg {
13
  font-family: sans-serif;
14
  }
@@ -68,61 +88,101 @@
68
 
69
  <aside>We are extremely thankful to the whole <a href="https://distill.pub/">distill.pub</a> team for creating the template on which we based this blog post.</aside>
70
 
71
- <div>
72
- <label for="a">Attention Heads (a):</label>
73
- <input type="range" id="a" name="a" min="1" max="128" value="8">
74
- <input type="number" id="a_input" value="8" min="1" max="128">
75
- <br>
76
- <label for="b">Micro Batch Size (b):</label>
77
- <input type="range" id="b" name="b" min="1" max="53248" value="32">
78
- <input type="number" id="b_input" value="32" min="1" max="53248">
79
- <br>
80
- <label for="h">Hidden Dimension Size (h):</label>
81
- <input type="range" id="h" name="h" min="1" max="16384" value="512">
82
- <input type="number" id="h_input" value="512" min="128" max="16384">
83
- <br>
84
- <label for="h_ff">Feedforward Dimension Size (h_ff):</label>
85
- <input type="range" id="h_ff" name="h_ff" min="1" max="65536" value="2048">
86
- <input type="number" id="h_ff_input" value="2048" min="512" max="65536">
87
- <br>
88
- <label for="L">Number of Layers (L):</label>
89
- <input type="range" id="L" name="L" min="1" max="126" value="12">
90
- <input type="number" id="L_input" value="12" min="1" max="126">
91
- <br>
92
- <label for="s">Sequence Length (s):</label>
93
- <input type="range" id="s" name="s" min="1" max="128000" value="128">
94
- <input type="number" id="s_input" value="128" min="64" max="128000">
95
- <br>
96
- <label for="v">Vocabulary Size (v):</label>
97
- <input type="range" id="v" name="v" min="1000" max="100000" value="30522">
98
- <input type="number" id="v_input" value="30522" min="1000" max="100000">
99
- <br>
100
- <label for="mixed">Mixed Precision:</label>
101
- <input type="checkbox" id="mixed" name="mixed" checked>
102
- <br>
103
- <label for="recomputation">Recomputation:</label>
104
- <select id="recomputation" name="recomputation">
105
- <option value="none">None</option>
106
- <option value="selective">Selective</option>
107
- <option value="full">Full</option>
108
- </select>
109
- <br>
110
- <label for="ff_activation">FF Activation:</label>
111
- <select id="ff_activation" name="ff_activation">
112
- <option value="relu">ReLU</option>
113
- <option value="gelu">GELU</option>
114
- <option value="swiglu">SwiGLU</option>
115
- </select>
116
- <br>
117
- <label for="presets">Presets:</label>
118
- <select id="presets" name="presets">
119
- <option value="Tiny">Tiny</option>
120
- <option value="8B">8B</option>
121
- <option value="70B">70B</option>
122
- <option value="405B">405B</option>
123
- </select>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  </div>
125
- <div id="graph" style="position: relative; width: 960px; height: 500px;"></div>
126
 
127
  <p><strong>TLDR:</strong> This blog covers a discussion on processing and evaluating data quality at scale, the 🍷 FineWeb
128
  recipe (listing and explaining all of our design choices), and the process followed to create its 📚 FineWeb-Edu subset.</p>
@@ -357,13 +417,13 @@ m_{act} = 2 * L* seq * bs * h * (34 + \frac{5*n_{heads}*seq}{h})
357
 
358
  <p>Most frameworks these days use FlashAttention (TODO: see later), which makes the attention computation less memory intensive through kernel fusion; thus most training runs use the <code>full</code> setting.</p>
359
 
360
- <p>We can save some GPU memory with activation recomputation but this only delays by a bit the next bottleneck: as hinted earlier for LLM training there is usually a sweet spot for the GBST and we need to work out the training configuration backward from there. However, you can’t choose MBS to be an arbitrarily large number on your GPU; at some point you will run out of GPU memory again since you need to store at least some of the activations in memory.</p>
361
 
362
  <p>There is a useful trick to compensate for that: <strong>gradient accumulation</strong> (<em>GradAcc</em>). With gradient accumulation we will split our batch into micro-batches, do forward and backward passes repeatedly on each micro-batch, compute the gradients, and, as the name suggests, sum the gradients step by step before doing a final optimizer step.</p>
363
 
364
- <p>We call the <code>micro batch size</code> (MBS) the batch size for each forward pass on a single node (the number of samples flowing through the model in one forward pass). We’ll refer to the overall batch size between each optimizer step as the <code>global batch size</code> (GBS). If we do one optimizer step every 8 forward/backward passes, the <code>global batch size</code> will be 8 times the <code>micro batch size</code>.</p>
365
 
366
- <p>What we now call <code>global batch size</code> thus corresponds to what we’ve called up to now just <code>batch size</code> for simplicity (we now make the terms more precise to avoid ambiguity).</p>
367
 
368
  <p>With gradient accumulation the global batch size can be computed as follows:</p>
369
 
@@ -377,7 +437,7 @@ BS = GBS=MBS * GradAcc
377
 
378
  <p>This is actually a bummer since the forward/backward passes for each micro-batch could actually totally be run in parallel. They are independent of each other and the only changing parameters are the input samples.</p>
379
 
380
- <p>Here comes data parallelism to solve exactly this problem! Let’s take a look, you say? Okay sure!</p>
381
 
382
  <h3>Data Parallelism</h3>
383
 
@@ -387,7 +447,7 @@ BS = GBS=MBS * GradAcc
387
  GBS=MBS * GradAcc * DP
388
  </d-math>
389
 
390
- <p>This means that we can reduce the number of gradient accumulation steps in favor of data parallel processes which speeds up training. In practice, people will tend to max out the number of data parallel nodes (the DP above) as much as possible as it’s inherently parallel versus the sequential Gradient Accumulation. Gradient accumulation is then added only to achieve a target batch size if DP alone is not sufficient. One exception to that is pipeline parallelism which we’ll discuss later.</p>
391
 
392
  <img src="assets/images/IMG_A95961668B3F-1.jpeg" alt="Data Parallelism">
393
 
@@ -403,17 +463,17 @@ GBS=MBS * GradAcc * DP
403
 
404
  <p>If the gradient accumulation ratio is lower than one, i.e. you have too many GPUs (!), you can either choose to not use all your GPUs or test if a lower MBS will speed up training. In these cases, you may want to prioritize throughput over individual GPU utilization: you can then choose DP first and use a smaller MBS than the maximum possible in order to speed up training.</p>
405
 
406
- <p>Time to take a concrete example: We want to train a model with a GBS of 4M tokens and a sequence length of 4k. This means our batch size will be 1024 samples (we pick powers of two). We observe that a single one of our GPUs can fit MBS=2 in memory and we have 128 GPUs available for training. This means with 4 gradient accumulation steps we’ll achieve our goal of 1024 samples or 4M tokens per training step. Now what if we suddenly have 1024 GPUs available? We can achieve the same GBS and thus identical training by setting both MBS and gradient accumulation to 1, speeding up training significantly.</p>
407
 
408
  <p>[EXPERIMENTS WHERE WE INCREASE DP AND SHOW THROUGHPUT FOR SEVERAL MODELS]</p>
409
 
410
- <p>We’ve explored data parallelism, a simple strategy that scales training across more GPUs and gives consistent speed improvements. The keen reader might have noticed however that it rests on the assumption that we can fit at least one input sample forward pass (<em>MBS=1</em>) into our GPU memory. This is not always the case! In particular for larger models which often don’t fit into a single GPU anymore even with activation recomputation activated.</p>
411
 
412
- <p>In such a case, we need to shard the model across devices! We’ll now study two complementary sharding methods, tensor and pipeline parallelism, which do exactly that. Let’s start with the simplest, tensor parallelism!</p>
413
 
414
  <h3>Tensor Parallelism</h3>
415
 
416
- <p>So you’ve exhausted all the previous textbook tricks to try to fit your model on a single GPU but it still doesn’t fit? Let’s try to distribute this model across several GPUs. Unlike DP we will not simply duplicate the model: various parts of the model instance will live on different GPUs.</p>
417
 
418
  <p>If we take a look at a typical matrix multiplication (the core of a neural network), we can get an idea about how we could split the model:</p>
419
 
@@ -470,6 +530,8 @@ GeLU(XW1 + XW2) \neq GeLU(XW1) + GeLU(XW2)
470
  <p>If you prefer code, note that we can prove this with the following snippet as well:</p>
471
 
472
  <d-code block language="python">
 
 
473
  def example_gelu():
474
  from torch.nn.functional import gelu
475
 
@@ -495,9 +557,9 @@ def example_gelu():
495
  torch.testing.assert_close(y_row_1, y_row_2, rtol=1e-5, atol=1e-5)
496
  </d-code>
497
 
498
- <p>To avoid a synchronization step directly after the first MLP, we’ll thus start with Column Parallel and be able to directly perform parallel GELU.</p>
499
 
500
- <p>Now, what about the second MLP? Should it be column or row parallel? Let’s draft both options:</p>
501
  <ul>
502
  <li>Column Parallel followed by Column Parallel</li>
503
  <img src="assets/images/image%2013.png" alt="Column Parallel Schema 1">
@@ -505,9 +567,9 @@ def example_gelu():
505
  <img src="assets/images/image%2014.png" alt="Column Parallel Schema 2">
506
  </ul>
507
 
508
- <p>We see that the “Column Parallel followed by Row Parallel” schema only involves two communications instead of four. It’s thus the most efficient schema in terms of communications.</p>
509
 
510
- <p>Let’s take a quick look at the backward pass:</p>
511
  <img src="assets/images/image%2015.png" alt="Backward Pass 1">
512
  <img src="assets/images/image%2016.png" alt="Backward Pass 2">
513
 
@@ -586,7 +648,7 @@ if __name__ == "__main__":
586
  example_column_row_linear()
587
  </d-code>
588
 
589
- <p>Now that we’ve found the most efficient schema for the Feedforward part of the transformer, let’s take a look at the multi-head attention block (MHA).</p>
590
 
591
  <p>We can generally follow a similar approach where the Q, K, V will be split in a Column Parallel fashion and the output projection will be split along the Row dimension.</p>
592
 
@@ -604,7 +666,7 @@ if __name__ == "__main__":
604
 
605
  <p>Could we push this approach further?</p>
606
 
607
- <p>Sequence parallelism applies this same idea to other parts of our model. We’ve applied tensor parallelism to the two main parts of our model where the matrix multiplications allowed us to naturally split the weights along a major axis.</p>
608
 
609
  <p>The rest of the model mostly comprises layer norms, dropout and various summations of residuals; these contribute little to the computation but come with rather large forward activations to store.</p>
610
 
@@ -640,14 +702,14 @@ if __name__ == "__main__":
640
 
641
  <p>In the transformer model, tokens have no inherent information about their position. For this reason, we need to use a positional encoding function.</p>
642
 
643
- <p>Assuming that in the multi-head attention layer, <em>q_m</em> is the “position-aware” query vector corresponding to a token at position <em>m</em>, <em>k_n</em> the “position-aware” key vector corresponding to the token at position <em>n</em> and <em>f</em> is our position embedding function, we would like our position vector to be a function of the input vectors and absolute positions like this:</p>
644
 
645
  <d-math>
646
  q_m = f(q,m)
647
  k_n = f(k,n)
648
  </d-math>
649
 
650
- <p>We may also want the positional encoding to model relative positional information between two input tokens. Relative positions help the model to operate across longer context spans and even context lengths not seen during training. The attention operation is generally a dot product operation between “position-aware” vectors <em>q</em> and <em>k</em>, so for a positional encoding that contains relative positional information, we’ll want to have:</p>
651
 
652
  <d-math>
653
  <q_m, k_n> = g(q, k, m-n)
@@ -655,7 +717,7 @@ k_n = f(k,n)
655
 
656
  <p>In other words, we want the result of <em>⟨ 𝑞_𝑚 , 𝑘_𝑛 ⟩</em> to depend on the values of <em>q</em> and <em>k</em> themselves, as well as their relative position <em>m − n</em>, but not <em>m</em> and <em>n</em>. This way, the model can focus on the relative difference between two tokens rather than their absolute positions.</p>
657
 
658
- <p>Let’s show that the RoPE positional embedding formulation satisfies the above formula.</p>
659
 
660
  <p><strong>Rotation matrix</strong></p>
661
 
@@ -695,9 +757,9 @@ R(θ) =
695
 
696
  <p><strong>Implementation</strong></p>
697
 
698
- <p>In our case, our internal vectors (the activations in our model) have many more than two elements. Let’s pair elements to get 2D vectors and apply the 2D rotation operation on these pairs.</p>
699
 
700
- <p>There are combinatorially many ways we can pair elements but generally two options are the most popular for implementing RoPE: we call them the <em>interleaved</em> and <em>non-interleaved</em> versions. (It’s still rather unfortunate to have two popular options)</p>
701
 
702
  <ol>
703
  <li>In the interleaved version, we pair consecutive elements <em>(x<sub>0</sub>, x<sub>1</sub>),(x<sub>2</sub>,x<sub>3</sub>),…</em> before applying the rotation matrix:</li>
@@ -860,13 +922,13 @@ x_{d-1}\cos mθ_{d/2-1} + x_{d-1}\sin mθ_{d/2-1} \\
860
  <h2>References</h2>
861
 
862
  <ul>
863
- <li>Harm’s posts:
864
  <ul>
865
  <li><a href="https://www.harmdevries.com/post/context-length/">https://www.harmdevries.com/post/context-length/</a></li>
866
  <li><a href="https://www.harmdevries.com/post/model-size-vs-compute-overhead/">https://www.harmdevries.com/post/model-size-vs-compute-overhead/</a></li>
867
  </ul>
868
  </li>
869
- <li>Stas’ guides:
870
  <ul>
871
  <li><a href="https://github.com/stas00/ml-engineering">https://github.com/stas00/ml-engineering</a></li>
872
  <li><a href="https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/chronicles.md">https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/chronicles.md</a></li>
 
9
  <title>FineWeb: decanting the web for the finest text data at scale</title>
10
  <link rel="stylesheet" href="style.css">
11
  <style>
12
+ #controls {
13
+ display: grid;
14
+ grid-template-columns: auto 1fr auto;
15
+ gap: 5px;
16
+ align-items: center;
17
+ max-width: 600px;
18
+ margin-bottom: 20px;
19
+ }
20
+ #controls label {
21
+ text-align: right;
22
+ }
23
+ #controls input[type="range"] {
24
+ width: 100%;
25
+ }
26
+ #controls input[type="number"] {
27
+ width: 60px;
28
+ }
29
+ #controls .row {
30
+ display: contents;
31
+ }
32
  #graph svg {
33
  font-family: sans-serif;
34
  }
 
88
 
89
  <aside>We are extremely thankful to the whole <a href="https://distill.pub/">distill.pub</a> team for creating the template on which we based this blog post.</aside>
90
 
91
+ <div id="graph" style="position: relative; width: 700px; height: 500px;"></div>
92
+ <div id="controls">
93
+ <div class="row">
94
+ <label for="a">Attention Heads (a):</label>
95
+ <input type="range" id="a" name="a" min="1" max="128" value="8">
96
+ <input type="number" id="a_input" value="8" min="1" max="128">
97
+ </div>
98
+ <div class="row">
99
+ <label for="b">Micro Batch Size (b):</label>
100
+ <input type="range" id="b" name="b" min="1" max="53248" value="32">
101
+ <input type="number" id="b_input" value="32" min="1" max="53248">
102
+ </div>
103
+ <div class="row">
104
+ <label for="h">Hidden Dimension Size (h):</label>
105
+ <input type="range" id="h" name="h" min="1" max="16384" value="512">
106
+ <input type="number" id="h_input" value="512" min="128" max="16384">
107
+ </div>
108
+ <div class="row">
109
+ <label for="h_ff">Feedforward Dimension Size (h_ff):</label>
110
+ <input type="range" id="h_ff" name="h_ff" min="1" max="65536" value="2048">
111
+ <input type="number" id="h_ff_input" value="2048" min="512" max="65536">
112
+ </div>
113
+ <div class="row">
114
+ <label for="L">Number of Layers (L):</label>
115
+ <input type="range" id="L" name="L" min="1" max="126" value="12">
116
+ <input type="number" id="L_input" value="12" min="1" max="126">
117
+ </div>
118
+ <div class="row">
119
+ <label for="s">Sequence Length (s):</label>
120
+ <input type="range" id="s" name="s" min="1" max="128000" value="128">
121
+ <input type="number" id="s_input" value="128" min="64" max="128000">
122
+ </div>
123
+ <div class="row">
124
+ <label for="v">Vocabulary Size (v):</label>
125
+ <input type="range" id="v" name="v" min="1000" max="100000" value="30522">
126
+ <input type="number" id="v_input" value="30522" min="1000" max="100000">
127
+ </div>
128
+ <div class="row">
129
+ <label for="k">Optimizer Parameters (k):</label>
130
+ <input type="range" id="k" name="k" min="1" max="16" value="8">
131
+ <input type="number" id="k_input" value="8" min="1" max="16">
132
+ </div>
133
+ <div class="row">
134
+ <label for="tp">Tensor Model Parallelism (t):</label>
135
+ <input type="range" id="tp" name="tp" min="1" max="16" value="8">
136
+ <input type="number" id="tp_input" value="8" min="1" max="16">
137
+ </div>
138
+ <div class="row">
139
+ <label for="dp">Data Model Parallelism (d):</label>
140
+ <input type="range" id="dp" name="dp" min="1" max="256" value="1">
141
+ <input type="number" id="dp_input" value="1" min="1" max="256">
142
+ </div>
143
+ <div class="row">
144
+ <label for="mixed">Mixed Precision:</label>
145
+ <input type="checkbox" id="mixed" name="mixed" checked>
146
+ <span></span> <!-- Empty span to maintain grid alignment -->
147
+ </div>
148
+ <div class="row">
149
+ <label for="recomputation">Recomputation:</label>
150
+ <select id="recomputation" name="recomputation">
151
+ <option value="none">None</option>
152
+ <option value="selective">Selective</option>
153
+ <option value="full">Full</option>
154
+ </select>
155
+ <span></span> <!-- Empty span to maintain grid alignment -->
156
+ </div>
157
+ <div class="row">
158
+ <label for="zero">Zero:</label>
159
+ <select id="zero" name="zero">
160
+ <option value="Optimizer">Optimizer</option>
161
+ <option value="Gradients">Gradients</option>
162
+ <option value="Parameters">Parameters</option>
163
+ </select>
164
+ <span></span> <!-- Empty span to maintain grid alignment -->
165
+ </div>
166
+ <div class="row">
167
+ <label for="ff_activation">FF Activation:</label>
168
+ <select id="ff_activation" name="ff_activation">
169
+ <option value="relu">ReLU</option>
170
+ <option value="gelu">GELU</option>
171
+ <option value="swiglu">SwiGLU</option>
172
+ </select>
173
+ <span></span> <!-- Empty span to maintain grid alignment -->
174
+ </div>
175
+ <div class="row">
176
+ <label for="presets">Presets:</label>
177
+ <select id="presets" name="presets">
178
+ <option value="Llama 3 Tiny">Llama 3 Tiny</option>
179
+ <option value="Llama 3 8B">Llama 3 8B</option>
180
+ <option value="Llama 3 70B">Llama 3 70B</option>
181
+ <option value="Llama 3 405B">Llama 3 405B</option>
182
+ </select>
183
+ <span></span> <!-- Empty span to maintain grid alignment -->
184
+ </div>
185
  </div>
 
186
 
187
  <p><strong>TLDR:</strong> This blog covers a discussion on processing and evaluating data quality at scale, the 🍷 FineWeb
188
  recipe (listing and explaining all of our design choices), and the process followed to create its 📚 FineWeb-Edu subset.</p>
 
417
 
418
  <p>Most frameworks these days use FlashAttention (TODO: see later), which makes the attention computation less memory intensive through kernel fusion; thus most training runs use the <code>full</code> setting.</p>
419
 
420
+ <p>We can save some GPU memory with activation recomputation but this only delays by a bit the next bottleneck: as hinted earlier for LLM training there is usually a sweet spot for the GBST and we need to work out the training configuration backward from there. However, you can't choose MBS to be an arbitrarily large number on your GPU; at some point you will run out of GPU memory again since you need to store at least some of the activations in memory.</p>
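  <p>To get a feel for the numbers, here is a tiny sketch that just evaluates the activation memory formula from earlier, <em>m_act = 2 · L · seq · bs · h · (34 + 5 · n_heads · seq / h)</em>; reading the result as bytes, and the shape values below, are our own illustrative assumptions:</p>
  <d-code block language="python">
def activation_memory(L, seq, bs, h, n_heads):
    # m_act = 2 * L * seq * bs * h * (34 + 5 * n_heads * seq / h), read here as bytes
    return 2 * L * seq * bs * h * (34 + 5 * n_heads * seq / h)

# illustrative shapes (not an official config): 32 layers, h=4096, 32 heads, seq=4096, MBS=1
print(f"{activation_memory(L=32, seq=4096, bs=1, h=4096, n_heads=32) / 2**30:.0f} GiB")  # -> 194 GiB
  </d-code>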
421
 
422
  <p>There is a useful trick to compensate for that: <strong>gradient accumulation</strong> (<em>GradAcc</em>). With gradient accumulation we will split our batch into micro-batches, do forward and backward passes repeatedly on each micro-batch, compute the gradients, and, as the name suggests, sum the gradients step by step before doing a final optimizer step.</p>
423
 
424
+ <p>We call the <code>micro batch size</code> (MBS) the batch size for each forward pass on a single node (the number of samples flowing through the model in one forward pass). We'll refer to the overall batch size between each optimizer step as the <code>global batch size</code> (GBS). If we do one optimizer step every 8 forward/backward passes, the <code>global batch size</code> will be 8 times the <code>micro batch size</code>.</p>
425
 
426
+ <p>What we now call <code>global batch size</code> thus corresponds to what we've called up to now just <code>batch size</code> for simplicity (we now make the terms more precise to avoid ambiguity).</p>
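  <p>As a minimal sketch of the accumulation loop described above (the toy model, the data and the 1/GradAcc loss scaling are our own illustrative choices, not a fixed recipe):</p>
  <d-code block language="python">
import torch

model = torch.nn.Linear(16, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

grad_acc_steps = 8                                   # GBS = MBS * grad_acc_steps (single process)
global_batch = torch.randn(grad_acc_steps, 4, 16)    # 8 micro-batches with MBS=4

optimizer.zero_grad()
for micro_batch in global_batch:
    loss = model(micro_batch).pow(2).mean()
    # scaling by 1/grad_acc_steps makes the summed gradients match a full-batch average
    (loss / grad_acc_steps).backward()               # backward() accumulates into .grad
optimizer.step()                                     # one optimizer step per global batch
  </d-code>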
427
 
428
  <p>With gradient accumulation the global batch size can be computed as follows:</p>
429
 
 
437
 
438
  <p>This is actually a bummer since the forward/backward passes for each micro-batch could actually totally be run in parallel. They are independent of each other and the only changing parameters are the input samples.</p>
439
 
440
+ <p>Here comes data parallelism to solve exactly this problem! Let's take a look, you say? Okay sure!</p>
441
 
442
  <h3>Data Parallelism</h3>
443
 
 
447
  GBS=MBS * GradAcc * DP
448
  </d-math>
449
 
450
+ <p>This means that we can reduce the number of gradient accumulation steps in favor of data parallel processes which speeds up training. In practice, people will tend to max out the number of data parallel nodes (the DP above) as much as possible as it's inherently parallel versus the sequential Gradient Accumulation. Gradient accumulation is then added only to achieve a target batch size if DP alone is not sufficient. One exception to that is pipeline parallelism which we'll discuss later.</p>
451
 
452
  <img src="assets/images/IMG_A95961668B3F-1.jpeg" alt="Data Parallelism">
453
 
 
463
 
464
  <p>If the gradient accumulation ratio is lower than one, i.e. you have too many GPUs (!), you can either choose to not use all your GPUs or test if a lower MBS will speed up training. In these cases, you may want to prioritize throughput over individual GPU utilization: you can then choose DP first and use a smaller MBS than the maximum possible in order to speed up training.</p>
465
 
466
+ <p>Time to take a concrete example: We want to train a model with a GBS of 4M tokens and a sequence length of 4k. This means our batch size will be 1024 samples (we pick powers of two). We observe that a single one of our GPUs can fit MBS=2 in memory and we have 128 GPUs available for training. This means with 4 gradient accumulation steps we'll achieve our goal of 1024 samples or 4M tokens per training step. Now what if we suddenly have 1024 GPUs available? We can achieve the same GBS and thus identical training by setting both MBS and gradient accumulation to 1, speeding up training significantly.</p>
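  <p>A quick back-of-the-envelope check of these numbers (we read 4M as 2<sup>22</sup> tokens here, which is an assumption on our side):</p>
  <d-code block language="python">
gbs_tokens, seq_len = 4 * 1024**2, 4096
gbs_samples = gbs_tokens // seq_len            # 1024 samples per optimizer step
mbs, n_gpus = 2, 128
grad_acc = gbs_samples // (mbs * n_gpus)       # 4 gradient accumulation steps
print(gbs_samples, grad_acc)                   # -> 1024 4

# with 1024 GPUs: MBS=1 and no accumulation gives the same global batch size
assert 1 * 1 * 1024 * seq_len == gbs_tokens    # MBS * GradAcc * DP * seq_len
  </d-code>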
467
 
468
  <p>[EXPERIMENTS WHERE WE INCREASE DP AND SHOW THROUGHPUT FOR SEVERAL MODELS]</p>
469
 
470
+ <p>We've explored data parallelism, a simple strategy that scales training across more GPUs and gives consistent speed improvements. The keen reader might have noticed however that it rests on the assumption that we can fit at least one input sample forward pass (<em>MBS=1</em>) into our GPU memory. This is not always the case! In particular for larger models which often don't fit into a single GPU anymore even with activation recomputation activated.</p>
471
 
472
+ <p>In such a case, we need to shard the model across devices! We'll now study two complementary sharding methods, tensor and pipeline parallelism, which do exactly that. Let's start with the simplest, tensor parallelism!</p>
473
 
474
  <h3>Tensor Parallelism</h3>
475
 
476
+ <p>So you've exhausted all the previous textbook tricks to try to fit your model on a single GPU but it still doesn't fit? Let's try to distribute this model across several GPUs. Unlike DP we will not simply duplicate the model: various parts of the model instance will live on different GPUs.</p>
477
 
478
  <p>If we take a look at a typical matrix multiplication (the core of a neural network), we can get an idea about how we could split the model:</p>
479
 
 
530
  <p>If you prefer code, note that we can prove this with the following snippet as well:</p>
531
 
532
  <d-code block language="python">
535
  def example_gelu():
536
  from torch.nn.functional import gelu
537
 
 
557
  torch.testing.assert_close(y_row_1, y_row_2, rtol=1e-5, atol=1e-5)
558
  </d-code>
559
 
560
+ <p>To avoid a synchronization step directly after the first MLP, we'll thus start with Column Parallel and be able to directly perform parallel GELU.</p>
561
 
562
+ <p>Now, what about the second MLP? Should it be column or row parallel? Let's draft both options:</p>
563
  <ul>
564
  <li>Column Parallel followed by Column Parallel</li>
565
  <img src="assets/images/image%2013.png" alt="Column Parallel Schema 1">
 
567
  <img src="assets/images/image%2014.png" alt="Column Parallel Schema 2">
568
  </ul>
569
 
570
+ <p>We see that the "Column Parallel followed by Row Parallel" schema only involves two communications instead of four. It's thus the most efficient schema in terms of communications.</p>
571
 
572
+ <p>Let's take a quick look at the backward pass:</p>
573
  <img src="assets/images/image%2015.png" alt="Backward Pass 1">
574
  <img src="assets/images/image%2016.png" alt="Backward Pass 2">
575
 
 
648
  example_column_row_linear()
649
  </d-code>
650
 
651
+ <p>Now that we've found the most efficient schema for the Feedforward part of the transformer, let's take a look at the multi-head attention block (MHA).</p>
652
 
653
  <p>We can generally follow a similar approach where the Q, K, V will be split in a Column Parallel fashion and the output projection will be split along the Row dimension.</p>
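  <p>A single-device sketch of this head-wise split with two simulated tensor-parallel ranks (the shapes are arbitrary, and the explicit sum of the partial outputs stands in for the all-reduce):</p>
  <d-code block language="python">
import torch

torch.manual_seed(0)
b, s, h, n_heads, tp = 2, 8, 32, 4, 2
d_head, heads_per_rank = h // n_heads, n_heads // tp
x = torch.randn(b, s, h)
wq, wk, wv, wo = (torch.randn(h, h) / h**0.5 for _ in range(4))

def mha(x, wq, wk, wv, wo, n_heads):
    # project, then reshape to (batch, heads, seq, d_head)
    q, k, v = ((x @ w).view(b, s, n_heads, -1).transpose(1, 2) for w in (wq, wk, wv))
    scores = (q @ k.transpose(-2, -1)) / d_head**0.5
    out = (scores.softmax(dim=-1) @ v).transpose(1, 2).reshape(b, s, -1)
    return out @ wo

y_ref = mha(x, wq, wk, wv, wo, n_heads)

# each "rank" owns half the heads: column slices of Wq/Wk/Wv and the matching rows of Wo;
# summing the partial outputs stands in for the all-reduce
y_tp = torch.zeros_like(y_ref)
for r in range(tp):
    cols = slice(r * heads_per_rank * d_head, (r + 1) * heads_per_rank * d_head)
    y_tp += mha(x, wq[:, cols], wk[:, cols], wv[:, cols], wo[cols, :], heads_per_rank)

torch.testing.assert_close(y_ref, y_tp, rtol=1e-4, atol=1e-5)
  </d-code>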
654
 
 
666
 
667
  <p>Could we push this approach further?</p>
668
 
669
+ <p>Sequence parallelism applies this same idea to other parts of our model. We've applied tensor parallelism to the two main parts of our model where the matrix multiplications allowed us to naturally split the weights along a major axis.</p>
670
 
671
  <p>The rest of the model mostly comprises layer norms, dropout and various summations of residuals; these contribute little to the computation but come with rather large forward activations to store.</p>
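  <p>These operations act on each token independently, which is what makes splitting their inputs along the sequence dimension possible; here is a tiny sketch of that property for layer norm (arbitrary shapes):</p>
  <d-code block language="python">
import torch

torch.manual_seed(0)
layer_norm = torch.nn.LayerNorm(16)
x = torch.randn(2, 8, 16)     # (batch, seq, hidden)
x0, x1 = x.chunk(2, dim=1)    # split along the sequence dimension ("two ranks")

# per-token ops give the same result whether or not the sequence is sharded
torch.testing.assert_close(layer_norm(x), torch.cat([layer_norm(x0), layer_norm(x1)], dim=1))
  </d-code>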
672
 
 
702
 
703
  <p>In the transformer model, tokens have no inherent information about their position. For this reason, we need to use a positional encoding function.</p>
704
 
705
+ <p>Assuming that in the multi-head attention layer, <em>q_m</em> is the "position-aware" query vector corresponding to a token at position <em>m</em>, <em>k_n</em> the "position-aware" key vector corresponding to the token at position <em>n</em> and <em>f</em> is our position embedding function, we would like our position vector to be a function of the input vectors and absolute positions like this:</p>
706
 
707
  <d-math>
708
  q_m = f(q,m)
709
  k_n = f(k,n)
710
  </d-math>
711
 
712
+ <p>We may also want the positional encoding to model relative positional information between two input tokens. Relative positions help the model to operate across longer context spans and even context lengths not seen during training. The attention operation is generally a dot product operation between "position-aware" vectors <em>q</em> and <em>k</em>, so for a positional encoding that contains relative positional information, we'll want to have:</p>
713
 
714
  <d-math>
715
  <q_m, k_n> = g(q, k, m-n)
 
717
 
718
  <p>In other words, we want the result of <em>⟨ 𝑞_𝑚 , 𝑘_𝑛 ⟩</em> to depend on the values of <em>q</em> and <em>k</em> themselves, as well as their relative position <em>m − n</em>, but not <em>m</em> and <em>n</em>. This way, the model can focus on the relative difference between two tokens rather than their absolute positions.</p>
719
 
720
+ <p>Let's show that the RoPE positional embedding formulation satisfies the above formula.</p>
721
 
722
  <p><strong>Rotation matrix</strong></p>
723
 
 
757
 
758
  <p><strong>Implementation</strong></p>
759
 
760
+ <p>In our case, our internal vectors (the activations in our model) have many more than two elements. Let's pair elements to get 2D vectors and apply the 2D rotation operation on these pairs.</p>
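  <p>Before looking at the pairing conventions, here is a quick numerical sanity check, for a single 2D pair, that rotating queries and keys by position-dependent angles indeed gives scores that depend only on <em>m − n</em> (the angle, vectors and positions below are arbitrary):</p>
  <d-code block language="python">
import torch

torch.manual_seed(0)

def rot(angle):               # 2D rotation matrix R(angle)
    c, s = torch.cos(angle), torch.sin(angle)
    return torch.stack([torch.stack([c, -s]), torch.stack([s, c])])

theta = torch.tensor(0.3)     # arbitrary angle
q, k = torch.randn(2), torch.randn(2)

def score(m, n):              # dot(R(m*theta) q, R(n*theta) k)
    return (rot(m * theta) @ q) @ (rot(n * theta) @ k)

# position pairs with the same m - n give the same attention score
torch.testing.assert_close(score(5, 2), score(13, 10))
torch.testing.assert_close(score(7, 7), score(0, 0))
  </d-code>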
761
 
762
+ <p>There are combinatorially many ways we can pair elements but generally two options are the most popular for implementing RoPE: we call them the <em>interleaved</em> and <em>non-interleaved</em> versions. (It's still rather unfortunate to have two popular options)</p>
763
 
764
  <ol>
765
  <li>In the interleaved version, we pair consecutive elements <em>(x<sub>0</sub>, x<sub>1</sub>),(x<sub>2</sub>,x<sub>3</sub>),…</em> before applying the rotation matrix:</li>
 
922
  <h2>References</h2>
923
 
924
  <ul>
925
+ <li>Harm's posts:
926
  <ul>
927
  <li><a href="https://www.harmdevries.com/post/context-length/">https://www.harmdevries.com/post/context-length/</a></li>
928
  <li><a href="https://www.harmdevries.com/post/model-size-vs-compute-overhead/">https://www.harmdevries.com/post/model-size-vs-compute-overhead/</a></li>
929
  </ul>
930
  </li>
931
+ <li>Stas' guides:
932
  <ul>
933
  <li><a href="https://github.com/stas00/ml-engineering">https://github.com/stas00/ml-engineering</a></li>
934
  <li><a href="https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/chronicles.md">https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/chronicles.md</a></li>
dist/main.bundle.js CHANGED
The diff for this file is too large to render. See raw diff
 
dist/main.bundle.js.map CHANGED
The diff for this file is too large to render. See raw diff
 
src/index.html CHANGED
@@ -9,6 +9,26 @@
9
  <title>FineWeb: decanting the web for the finest text data at scale</title>
10
  <link rel="stylesheet" href="style.css">
11
  <style>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  #graph svg {
13
  font-family: sans-serif;
14
  }
@@ -68,61 +88,101 @@
68
 
69
  <aside>We are extremely thankful to the whole <a href="https://distill.pub/">distill.pub</a> team for creating the template on which we based this blog post.</aside>
70
 
71
- <div>
72
- <label for="a">Attention Heads (a):</label>
73
- <input type="range" id="a" name="a" min="1" max="128" value="8">
74
- <input type="number" id="a_input" value="8" min="1" max="128">
75
- <br>
76
- <label for="b">Micro Batch Size (b):</label>
77
- <input type="range" id="b" name="b" min="1" max="53248" value="32">
78
- <input type="number" id="b_input" value="32" min="1" max="53248">
79
- <br>
80
- <label for="h">Hidden Dimension Size (h):</label>
81
- <input type="range" id="h" name="h" min="1" max="16384" value="512">
82
- <input type="number" id="h_input" value="512" min="128" max="16384">
83
- <br>
84
- <label for="h_ff">Feedforward Dimension Size (h_ff):</label>
85
- <input type="range" id="h_ff" name="h_ff" min="1" max="65536" value="2048">
86
- <input type="number" id="h_ff_input" value="2048" min="512" max="65536">
87
- <br>
88
- <label for="L">Number of Layers (L):</label>
89
- <input type="range" id="L" name="L" min="1" max="126" value="12">
90
- <input type="number" id="L_input" value="12" min="1" max="126">
91
- <br>
92
- <label for="s">Sequence Length (s):</label>
93
- <input type="range" id="s" name="s" min="1" max="128000" value="128">
94
- <input type="number" id="s_input" value="128" min="64" max="128000">
95
- <br>
96
- <label for="v">Vocabulary Size (v):</label>
97
- <input type="range" id="v" name="v" min="1000" max="100000" value="30522">
98
- <input type="number" id="v_input" value="30522" min="1000" max="100000">
99
- <br>
100
- <label for="mixed">Mixed Precision:</label>
101
- <input type="checkbox" id="mixed" name="mixed" checked>
102
- <br>
103
- <label for="recomputation">Recomputation:</label>
104
- <select id="recomputation" name="recomputation">
105
- <option value="none">None</option>
106
- <option value="selective">Selective</option>
107
- <option value="full">Full</option>
108
- </select>
109
- <br>
110
- <label for="ff_activation">FF Activation:</label>
111
- <select id="ff_activation" name="ff_activation">
112
- <option value="relu">ReLU</option>
113
- <option value="gelu">GELU</option>
114
- <option value="swiglu">SwiGLU</option>
115
- </select>
116
- <br>
117
- <label for="presets">Presets:</label>
118
- <select id="presets" name="presets">
119
- <option value="Tiny">Tiny</option>
120
- <option value="8B">8B</option>
121
- <option value="70B">70B</option>
122
- <option value="405B">405B</option>
123
- </select>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  </div>
125
- <div id="graph" style="position: relative; width: 960px; height: 500px;"></div>
126
 
127
  <p><strong>TLDR:</strong> This blog covers a discussion on processing and evaluating data quality at scale, the 🍷 FineWeb
128
  recipe (listing and explaining all of our design choices), and the process followed to create its 📚 FineWeb-Edu subset.</p>
@@ -357,13 +417,13 @@ m_{act} = 2 * L* seq * bs * h * (34 + \frac{5*n_{heads}*seq}{h})
357
 
358
  <p>Most frameworks these days use FlashAttention (TODO: see later) which makes the attention computation less memory intensive through kernel fusion, thus most trainings use the <code>full</code> settings.</p>
359
 
360
- <p>We can save some GPU memory with activation recomputation but this only delays by a bit the next bottleneck: as hinted earlier for LLM training there is usually a sweet spot for the GBST and we need to work out the training configuration backward from there. However, you cant choose MBS to be an arbitrary large number on your GPU; at some point you will run out of GPU memory again since you need to store at least some of the activations in memory.</p>
361
 
362
  <p>There is a useful trick to compensate for that: <strong>gradient accumulation</strong> (<em>GradAcc</em>). With gradient accumulation we will split our batch in micro-batch, do forward and backward passes repeatedly on each micro-batch, compute the gradients, and, as the name suggests, sum the gradients step by step before doing a final optimizer step.</p>
363
 
364
- <p>We call the <code>micro batch size</code> (MBS) the batch size for each forward pass on a single node (the number of samples flowing through the model in one forward pass). Well refer to the overall batch size between each optimizer step as the <code>global batch size</code> (GBS). If we do one optimizer step each 8 forward/backward pass, the <code>global batch size</code> will be 8 times the <code>micro batch size</code>.</p>
365
 
366
- <p>What we now call <code>global batch size</code> thus corresponds to what weve called up to now just <code>batch size</code> for simplicity (we now make the terms more precise to avoid ambiguity).</p>
367
 
368
  <p>With gradient accumulation the global batch size can be computed as follows:</p>
369
 
@@ -377,7 +437,7 @@ BS = GBS=MBS * GradAcc
377
 
378
  <p>This is actually a bummer since the forward/backward passes for each micro-batch could actually totally be run in parallel. They are independent from each other and the only changing parameter are the input samples.</p>
379
 
380
- <p>Here comes data parallelism to solve exactly this problem! Lets take a look, you say? Okay sure!</p>
381
 
382
  <h3>Data Parallelism</h3>
383
 
@@ -387,7 +447,7 @@ BS = GBS=MBS * GradAcc
387
  GBS=MBS * GradAcc * DP
388
  </d-math>
389
 
390
- <p>This means that we can reduce the number of gradient accumulation steps in favor of data parallel processes which speeds up training. In practice, people will tend to max out the number of data parallel nodes (the DP above) as much as possible as its inherently parallel versus the sequential Gradient Accumulation. Gradient accumulation is then added only to achieve a target batch size if DP alone is not sufficient. One exception to that is pipeline parallelism which well discuss later.</p>
391
 
392
  <img src="assets/images/IMG_A95961668B3F-1.jpeg" alt="Data Parallelism">
393
 
@@ -403,17 +463,17 @@ GBS=MBS * GradAcc * DP
403
 
404
  <p>If the gradient accumulation ratio is lower than one, i.e. you have too many GPUs (!), you can either choose to not use all your GPUs or test if a lower MBS will speed up training. In these cases, you may want to prioritize throughput over the individual GPU utilization, you can then choose DP first and use a smaller MBS than possible in order to speed up training.</p>
405
 
406
- <p>Time to take a concrete example: We want to train a model with a GBS of 4M tokens and a sequence length of 4k. This means our batch size will be 1024 samples (we pick powers of two). We observe that a single of our GPU can fit MBS=2 in memory and we have 128 GPUs available for training. This means with 4 gradient accumulation steps well achieve our goal of 1024 samples or 4M tokens per training step. Now what if we suddenly have 1024 GPUs available? We can achieve the same GBS and thus identical training by setting both MBS and gradient accumulation to 1 speeding up training significantly.</p>
407
 
408
  <p>[EXPERIMENTS WHERE WE INCREASE DP AND SHOW THROUGHPUT FOR SEVERAL MODELS]</p>
409
 
410
- <p>Weve explored data parallelism, a simple strategy to scale training across more GPUs and gives consistent speed improvements. The keen reader might have noticed however that it rests on the assumption that we can fit at least one input sample forward pass (<em>MBS=1</em>) into our GPU memory. This is not always the case! In particular for larger models which often dont fit into a single GPU anymore even with activation recomputations activated.</p>
411
 
412
- <p>In such case, we need to shard the model across devices! Well now study two complementary sharding methods, tensor and pipeline parallelism which are doing that. Lets start by the simplest, tensor parallelism!</p>
413
 
414
  <h3>Tensor Parallelism</h3>
415
 
416
- <p>So youve exhausted all the previous textbook tricks to try to fit your model on a single GPU but it still doesnt fit? Lets try to distribute this model across several GPUs. Unlike DP we will not simply duplicate the model but various parts of the model instance will be living on various GPUs.</p>
417
 
418
  <p>If we take a look at a typical matrix multiplication (the core of a neural network), we can get an idea about how we could split the model:</p>
419
 
@@ -470,6 +530,8 @@ GeLU(XW1 + XW2) \neq GeLU(XW1) + GeLU(XW2)
470
  <p>If you rather like code, note that we can prove this with the following snippet as well:</p>
471
 
472
  <d-code block language="python">
 
 
473
  def example_gelu():
474
  from torch.nn.functional import gelu
475
 
@@ -495,9 +557,9 @@ def example_gelu():
495
  torch.testing.assert_close(y_row_1, y_row_2, rtol=1e-5, atol=1e-5)
496
  </d-code>
497
 
498
- <p>To avoid a synchronization step directly after the first MLP, well thus start with Column Parallel and be able to directly perform parallel GELU.</p>
499
 
500
- <p>Now, what about the second MLP? Should it be column or row parallel? Lets draft both options:</p>
501
  <ul>
502
  <li>Column Parallel followed by Column Parallel</li>
503
  <img src="assets/images/image%2013.png" alt="Column Parallel Schema 1">
@@ -505,9 +567,9 @@ def example_gelu():
505
  <img src="assets/images/image%2014.png" alt="Column Parallel Schema 2">
506
  </ul>
507
 
508
- <p>We see that the Column Parallel followed by Row Parallel schema only involves two communications instead of four. Its thus the most efficient schema in terms of communications.</p>
509
 
510
- <p>Lets take a quick look at the backward pass:</p>
511
  <img src="assets/images/image%2015.png" alt="Backward Pass 1">
512
  <img src="assets/images/image%2016.png" alt="Backward Pass 2">
513
 
@@ -586,7 +648,7 @@ if __name__ == "__main__":
586
  example_column_row_linear()
587
  </d-code>
588
 
589
- <p>Now that weve found the most efficient schema for the Feedforward part of the transformer, lets take a look at the multi-head attention block (MHA).</p>
590
 
591
  <p>We can generally follow a similar approach where the Q, K, V will be split in a Column Parallel fashion and the output projection will be split along the Row dimension.</p>
592
 
@@ -604,7 +666,7 @@ if __name__ == "__main__":
604
 
605
  <p>Could we push this approach further?</p>
606
 
607
- <p>Sequence parallelism applies this same idea to other parts of our model. Weve applied tensor parallelism to two main parts in our models where combination of MLP allowed to naturally split the weights along major axis.</p>
608
 
609
  <p>The rest of the model mostly comprises layer norms, dropout and various summation of residuals, these contribute little to the computation but come with rather large forward activations to store.</p>
610
 
@@ -640,14 +702,14 @@ if __name__ == "__main__":
640
 
641
  <p>In the transformer model, tokens have no inherent information about their positional information. For these reasons, we need to use a positional encoding function.</p>
642
 
643
- <p>Assuming that in the multi-head attention layer, <em>q_m</em> is the position-aware query vector corresponding to a token at position <em>m</em>, <em>k_n</em> the position-aware key vector corresponding to the token at position <em>n</em> and <em>f</em> is our position embedding function, we would like our position vector to be a function of the input vectors and absolute positions like this:</p>
644
 
645
  <d-math>
646
  q_m = f(q,m)
647
  k_n = f(k,n)
648
  </d-math>
649
 
650
- <p>We may also want the positional encoding to model relative positional information between two input tokens. Relative positions help the model to operate across longer context spans and even context lengths not seen during training. The attention operation is generally a dot product operation between position-aware vectors <em>q</em> and <em>k</em>, so for a positional encoding that contains relative positional information, well want to have:</p>
651
 
652
  <d-math>
653
  <q_m, k_n> = g(q, k, m-n)
@@ -655,7 +717,7 @@ k_n = f(k,n)
655
 
656
  <p>In other words, we want the result of <em>⟨ 𝑞_𝑚 , 𝑘_𝑛 ⟩</em> to depend on the values of <em>q</em> and <em>k</em> themselves, as well as their relative position <em>m − n</em>, but not <em>m</em> and <em>n</em>. This way, the model can focus on the relative difference between two tokens rather than their absolute positions.</p>
657
 
658
- <p>Lets show that the RoPE positional embedding formulation satisfies the above formula.</p>
659
 
660
  <p><strong>Rotation matrix</strong></p>
661
 
@@ -695,9 +757,9 @@ R(θ) =
695
 
696
  <p><strong>Implementation</strong></p>
697
 
698
- <p>In our case, our internal vectors (the activations in our model) have much more than two elements. Lets pair elements to get 2D vectors and apply the 2D rotation operation on these pairs.</p>
699
 
700
- <p>There are combinatorially many ways we can pair elements but generally two options are the most popular for implementing RoPE: we call them the <em>interleaved</em> and <em>non-interleaved</em> versions. (Its still rather unfortunate to have two popular options)</p>
701
 
702
  <ol>
703
  <li>In the interleaved version, we pair consecutive elements <em>(x<sub>0</sub>, x<sub>1</sub>),(x<sub>2</sub>,x<sub>3</sub>),…</em> before applying the rotation matrix:</li>
@@ -860,13 +922,13 @@ x_{d-1}\cos mθ_{d/2-1} + x_{d-1}\sin mθ_{d/2-1} \\
860
  <h2>References</h2>
861
 
862
  <ul>
863
- <li>Harms posts:
864
  <ul>
865
  <li><a href="https://www.harmdevries.com/post/context-length/">https://www.harmdevries.com/post/context-length/</a></li>
866
  <li><a href="https://www.harmdevries.com/post/model-size-vs-compute-overhead/">https://www.harmdevries.com/post/model-size-vs-compute-overhead/</a></li>
867
  </ul>
868
  </li>
869
- <li>Stas guides:
870
  <ul>
871
  <li><a href="https://github.com/stas00/ml-engineering">https://github.com/stas00/ml-engineering</a></li>
872
  <li><a href="https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/chronicles.md">https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/chronicles.md</a></li>
 
9
  <title>FineWeb: decanting the web for the finest text data at scale</title>
10
  <link rel="stylesheet" href="style.css">
11
  <style>
12
+ #controls {
13
+ display: grid;
14
+ grid-template-columns: auto 1fr auto;
15
+ gap: 5px;
16
+ align-items: center;
17
+ max-width: 600px;
18
+ margin-bottom: 20px;
19
+ }
20
+ #controls label {
21
+ text-align: right;
22
+ }
23
+ #controls input[type="range"] {
24
+ width: 100%;
25
+ }
26
+ #controls input[type="number"] {
27
+ width: 60px;
28
+ }
29
+ #controls .row {
30
+ display: contents;
31
+ }
32
  #graph svg {
33
  font-family: sans-serif;
34
  }
 
88
 
89
  <aside>We are extremely thankful to the whole <a href="https://distill.pub/">distill.pub</a> team for creating the template on which we based this blog post.</aside>
90
 
91
+ <div id="graph" style="position: relative; width: 700px; height: 500px;"></div>
92
+ <div id="controls">
93
+ <div class="row">
94
+ <label for="a">Attention Heads (a):</label>
95
+ <input type="range" id="a" name="a" min="1" max="128" value="8">
96
+ <input type="number" id="a_input" value="8" min="1" max="128">
97
+ </div>
98
+ <div class="row">
99
+ <label for="b">Micro Batch Size (b):</label>
100
+ <input type="range" id="b" name="b" min="1" max="53248" value="32">
101
+ <input type="number" id="b_input" value="32" min="1" max="53248">
102
+ </div>
103
+ <div class="row">
104
+ <label for="h">Hidden Dimension Size (h):</label>
105
+ <input type="range" id="h" name="h" min="1" max="16384" value="512">
106
+ <input type="number" id="h_input" value="512" min="128" max="16384">
107
+ </div>
108
+ <div class="row">
109
+ <label for="h_ff">Feedforward Dimension Size (h_ff):</label>
110
+ <input type="range" id="h_ff" name="h_ff" min="1" max="65536" value="2048">
111
+ <input type="number" id="h_ff_input" value="2048" min="512" max="65536">
112
+ </div>
113
+ <div class="row">
114
+ <label for="L">Number of Layers (L):</label>
115
+ <input type="range" id="L" name="L" min="1" max="126" value="12">
116
+ <input type="number" id="L_input" value="12" min="1" max="126">
117
+ </div>
118
+ <div class="row">
119
+ <label for="s">Sequence Length (s):</label>
120
+ <input type="range" id="s" name="s" min="1" max="128000" value="128">
121
+ <input type="number" id="s_input" value="128" min="64" max="128000">
122
+ </div>
123
+ <div class="row">
124
+ <label for="v">Vocabulary Size (v):</label>
125
+ <input type="range" id="v" name="v" min="1000" max="100000" value="30522">
126
+ <input type="number" id="v_input" value="30522" min="1000" max="100000">
127
+ </div>
128
+ <div class="row">
129
+ <label for="k">Optimizer Parameters (k):</label>
130
+ <input type="range" id="k" name="k" min="1" max="16" value="8">
131
+ <input type="number" id="k_input" value="8" min="1" max="16">
132
+ </div>
133
+ <div class="row">
134
+ <label for="tp">Tensor Model Parallelism (t):</label>
135
+ <input type="range" id="tp" name="tp" min="1" max="16" value="8">
136
+ <input type="number" id="tp_input" value="8" min="1" max="16">
137
+ </div>
138
+ <div class="row">
139
+ <label for="dp">Data Model Parallelism (d):</label>
140
+ <input type="range" id="dp" name="dp" min="1" max="256" value="1">
141
+ <input type="number" id="dp_input" value="1" min="1" max="256">
142
+ </div>
143
+ <div class="row">
144
+ <label for="mixed">Mixed Precision:</label>
145
+ <input type="checkbox" id="mixed" name="mixed" checked>
146
+ <span></span> <!-- Empty span to maintain grid alignment -->
147
+ </div>
148
+ <div class="row">
149
+ <label for="recomputation">Recomputation:</label>
150
+ <select id="recomputation" name="recomputation">
151
+ <option value="none">None</option>
152
+ <option value="selective">Selective</option>
153
+ <option value="full">Full</option>
154
+ </select>
155
+ <span></span> <!-- Empty span to maintain grid alignment -->
156
+ </div>
157
+ <div class="row">
158
+ <label for="zero">Zero:</label>
159
+ <select id="zero" name="zero">
160
+ <option value="Optimizer">Optimizer</option>
161
+ <option value="Gradients">Gradients</option>
162
+ <option value="Parameters">Parameters</option>
163
+ </select>
164
+ <span></span> <!-- Empty span to maintain grid alignment -->
165
+ </div>
166
+ <div class="row">
167
+ <label for="ff_activation">FF Activation:</label>
168
+ <select id="ff_activation" name="ff_activation">
169
+ <option value="relu">ReLU</option>
170
+ <option value="gelu">GELU</option>
171
+ <option value="swiglu">SwiGLU</option>
172
+ </select>
173
+ <span></span> <!-- Empty span to maintain grid alignment -->
174
+ </div>
175
+ <div class="row">
176
+ <label for="presets">Presets:</label>
177
+ <select id="presets" name="presets">
178
+ <option value="Llama 3 Tiny">Llama 3 Tiny</option>
179
+ <option value="Llama 3 8B">Llama 3 8B</option>
180
+ <option value="Llama 3 70B">Llama 3 70B</option>
181
+ <option value="Llama 3 405B">Llama 3 405B</option>
182
+ </select>
183
+ <span></span> <!-- Empty span to maintain grid alignment -->
184
+ </div>
185
  </div>
 
186
 
187
  <p><strong>TLDR:</strong> This blog covers a discussion on processing and evaluating data quality at scale, the 🍷 FineWeb
188
  recipe (listing and explaining all of our design choices), and the process followed to create its 📚 FineWeb-Edu subset.</p>
 
417
 
418
  <p>Most frameworks these days use FlashAttention (TODO: see later) which makes the attention computation less memory intensive through kernel fusion, thus most trainings use the <code>full</code> settings.</p>
419
 
420
+ <p>We can save some GPU memory with activation recomputation but this only delays by a bit the next bottleneck: as hinted earlier for LLM training there is usually a sweet spot for the GBST and we need to work out the training configuration backward from there. However, you can't choose MBS to be an arbitrary large number on your GPU; at some point you will run out of GPU memory again since you need to store at least some of the activations in memory.</p>
421
 
422
  <p>There is a useful trick to compensate for that: <strong>gradient accumulation</strong> (<em>GradAcc</em>). With gradient accumulation we will split our batch in micro-batch, do forward and backward passes repeatedly on each micro-batch, compute the gradients, and, as the name suggests, sum the gradients step by step before doing a final optimizer step.</p>
423
 
424
+ <p>We call the <code>micro batch size</code> (MBS) the batch size for each forward pass on a single node (the number of samples flowing through the model in one forward pass). We'll refer to the overall batch size between each optimizer step as the <code>global batch size</code> (GBS). If we do one optimizer step each 8 forward/backward pass, the <code>global batch size</code> will be 8 times the <code>micro batch size</code>.</p>
425
 
426
+ <p>What we now call <code>global batch size</code> thus corresponds to what we've called up to now just <code>batch size</code> for simplicity (we now make the terms more precise to avoid ambiguity).</p>
427
 
428
  <p>With gradient accumulation the global batch size can be computed as follows:</p>
429
 
 
437
 
438
  <p>This is actually a bummer since the forward/backward passes for each micro-batch could actually totally be run in parallel. They are independent from each other and the only changing parameter are the input samples.</p>
439
 
440
+ <p>Here comes data parallelism to solve exactly this problem! Let's take a look, you say? Okay sure!</p>
441
 
442
  <h3>Data Parallelism</h3>
443
 
 
447
  GBS=MBS * GradAcc * DP
448
  </d-math>
449
 
450
+ <p>This means that we can reduce the number of gradient accumulation steps in favor of data parallel processes which speeds up training. In practice, people will tend to max out the number of data parallel nodes (the DP above) as much as possible as it's inherently parallel versus the sequential Gradient Accumulation. Gradient accumulation is then added only to achieve a target batch size if DP alone is not sufficient. One exception to that is pipeline parallelism which we'll discuss later.</p>
451
 
452
  <img src="assets/images/IMG_A95961668B3F-1.jpeg" alt="Data Parallelism">
453
 
 
463
 
464
  <p>If the gradient accumulation ratio is lower than one, i.e. you have too many GPUs (!), you can either choose to not use all your GPUs or test if a lower MBS will speed up training. In these cases, you may want to prioritize throughput over the individual GPU utilization, you can then choose DP first and use a smaller MBS than possible in order to speed up training.</p>
465
 
466
+ <p>Time to take a concrete example: We want to train a model with a GBS of 4M tokens and a sequence length of 4k. This means our batch size will be 1024 samples (we pick powers of two). We observe that a single of our GPU can fit MBS=2 in memory and we have 128 GPUs available for training. This means with 4 gradient accumulation steps we'll achieve our goal of 1024 samples or 4M tokens per training step. Now what if we suddenly have 1024 GPUs available? We can achieve the same GBS and thus identical training by setting both MBS and gradient accumulation to 1 speeding up training significantly.</p>
467
 
468
  <p>[EXPERIMENTS WHERE WE INCREASE DP AND SHOW THROUGHPUT FOR SEVERAL MODELS]</p>
469
 
470
+ <p>We've explored data parallelism, a simple strategy to scale training across more GPUs and gives consistent speed improvements. The keen reader might have noticed however that it rests on the assumption that we can fit at least one input sample forward pass (<em>MBS=1</em>) into our GPU memory. This is not always the case! In particular for larger models which often don't fit into a single GPU anymore even with activation recomputations activated.</p>
471
 
472
+ <p>In such case, we need to shard the model across devices! We'll now study two complementary sharding methods, tensor and pipeline parallelism which are doing that. Let's start by the simplest, tensor parallelism!</p>
473
 
474
  <h3>Tensor Parallelism</h3>
475
 
476
+ <p>So you've exhausted all the previous textbook tricks to try to fit your model on a single GPU but it still doesn't fit? Let's try to distribute this model across several GPUs. Unlike DP we will not simply duplicate the model but various parts of the model instance will be living on various GPUs.</p>
477
 
478
  <p>If we take a look at a typical matrix multiplication (the core of a neural network), we can get an idea about how we could split the model:</p>
479
 
 
530
  <p>If you rather like code, note that we can prove this with the following snippet as well:</p>
531
 
532
  <d-code block language="python">
533
+ ```
534
+ </region_of_file_to_rewritten_file>
535
  def example_gelu():
536
  from torch.nn.functional import gelu
537
 
 
557
  torch.testing.assert_close(y_row_1, y_row_2, rtol=1e-5, atol=1e-5)
558
  </d-code>
559
 
560
+ <p>To avoid a synchronization step directly after the first MLP, we'll thus start with Column Parallel and be able to directly perform parallel GELU.</p>
561
 
562
+ <p>Now, what about the second MLP? Should it be column or row parallel? Let's draft both options:</p>
563
  <ul>
564
  <li>Column Parallel followed by Column Parallel</li>
565
  <img src="assets/images/image%2013.png" alt="Column Parallel Schema 1">
 
567
  <img src="assets/images/image%2014.png" alt="Column Parallel Schema 2">
568
  </ul>
569
 
570
+ <p>We see that the "Column Parallel followed by Row Parallel" schema only involves two communications instead of four. It's thus the most efficient schema in terms of communications.</p>
571
 
572
+ <p>Let's take a quick look at the backward pass:</p>
573
  <img src="assets/images/image%2015.png" alt="Backward Pass 1">
574
  <img src="assets/images/image%2016.png" alt="Backward Pass 2">
575
 
 
648
  example_column_row_linear()
649
  </d-code>
650
 
651
+ <p>Now that we've found the most efficient schema for the Feedforward part of the transformer, let's take a look at the multi-head attention block (MHA).</p>
652
 
653
  <p>We can generally follow a similar approach: the Q, K and V projections are split in a Column Parallel fashion, and the output projection is split along the Row dimension.</p>
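  <p>As a minimal sketch of this idea (the function and variable names below are ours, not from any particular library), splitting the Q/K/V projections column-wise amounts to giving each tensor-parallel rank a subset of the attention heads, while the row-wise split of the output projection produces a partial result that is summed (all-reduced) across ranks:</p>
  <d-code block language="python">
  import torch

  def sharded_attention(x, wq, wk, wv, wo, num_heads, tp_rank, tp_size):
      # x: (batch, seq, hidden); wq/wk/wv/wo: full (hidden, hidden) weights, sliced below
      b, s, h = x.shape
      d_head = h // num_heads
      local_heads = num_heads // tp_size
      lo, hi = tp_rank * local_heads * d_head, (tp_rank + 1) * local_heads * d_head

      # column-parallel Q/K/V: this rank only holds the output columns of its heads
      q = (x @ wq[:, lo:hi]).view(b, s, local_heads, d_head).transpose(1, 2)
      k = (x @ wk[:, lo:hi]).view(b, s, local_heads, d_head).transpose(1, 2)
      v = (x @ wv[:, lo:hi]).view(b, s, local_heads, d_head).transpose(1, 2)

      # attention over the local heads only, no communication needed
      scores = torch.softmax(q @ k.transpose(-1, -2) / d_head**0.5, dim=-1)
      ctx = (scores @ v).transpose(1, 2).reshape(b, s, local_heads * d_head)

      # row-parallel output projection: a partial sum; summing (all-reducing)
      # the results of all tp_size ranks gives the full attention output
      return ctx @ wo[lo:hi, :]
  </d-code>
  <p>Summing the outputs of all ranks recovers exactly the result of the unsharded attention, which is precisely what the all-reduce does in an actual distributed implementation.</p>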
654
 
 
666
 
667
  <p>Could we push this approach further?</p>
668
 
669
+ <p>Sequence parallelism applies this same idea to other parts of our model. So far we've applied tensor parallelism to the two main blocks of the model, where the structure of the matrix multiplications allowed us to naturally split the weights along a major axis.</p>
670
 
671
  <p>The rest of the model mostly comprises layer norms, dropout and various residual summations; these contribute little to the computation but come with rather large forward activations to store.</p>
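  <p>Since layer norm and dropout act on each token position independently, we can split the activations along the sequence dimension and process each shard separately. A minimal sketch (the tensor shapes and names are ours):</p>
  <d-code block language="python">
  import torch
  from torch.nn.functional import layer_norm

  x = torch.randn(2, 8, 16)        # (batch, seq, hidden)
  shards = x.chunk(4, dim=1)       # sequence-parallel split across 4 "ranks"

  # layer norm normalizes over the hidden dimension only, so applying it
  # independently on each sequence shard gives the exact same result
  y_full = layer_norm(x, normalized_shape=(16,))
  y_sharded = torch.cat([layer_norm(s, normalized_shape=(16,)) for s in shards], dim=1)
  torch.testing.assert_close(y_full, y_sharded)
  </d-code>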
672
 
 
702
 
703
  <p>In the transformer model, tokens carry no inherent information about their position. For this reason, we need to use a positional encoding function.</p>
704
 
705
+ <p>Assuming that in the multi-head attention layer <em>q_m</em> is the "position-aware" query vector corresponding to the token at position <em>m</em>, <em>k_n</em> the "position-aware" key vector corresponding to the token at position <em>n</em>, and <em>f</em> is our position embedding function, we would like these position-aware vectors to be functions of the input vectors and their absolute positions, like this:</p>
706
 
707
  <d-math>
708
  q_m = f(q,m)
709
  k_n = f(k,n)
710
  </d-math>
711
 
712
+ <p>We may also want the positional encoding to capture relative positional information between two input tokens. Relative positions help the model operate across longer context spans, and even context lengths not seen during training. The attention operation is generally a dot product between the "position-aware" vectors <em>q</em> and <em>k</em>, so for a positional encoding that contains relative positional information, we'll want to have:</p>
713
 
714
  <d-math>
715
  \langle q_m, k_n \rangle = g(q, k, m-n)
 
717
 
718
  <p>In other words, we want the result of <em>⟨q_m, k_n⟩</em> to depend on the values of <em>q</em> and <em>k</em> themselves, as well as on their relative position <em>m − n</em>, but not on <em>m</em> and <em>n</em> individually. This way, the model can focus on the relative distance between two tokens rather than their absolute positions.</p>
719
 
720
+ <p>Let's show that the RoPE positional embedding formulation satisfies the above formula.</p>
721
 
722
  <p><strong>Rotation matrix</strong></p>
723
 
 
757
 
758
  <p><strong>Implementation</strong></p>
759
 
760
+ <p>In our case, our internal vectors (the activations in our model) have many more than two elements. Let's pair up their elements to get 2D vectors and apply the 2D rotation operation to each pair.</p>
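+ <p>As a quick numerical illustration (a sketch using one possible pairing of consecutive elements, the <em>interleaved</em> convention discussed below), rotating each 2D pair at position <em>m</em> by an angle <em>m·θ<sub>i</sub></em> looks like this:</p>
+ <d-code block language="python">
+ import torch
+
+ def rotate_pairs(x, m, base=10000.0):
+     # x: (d,) with d even; pairs are (x[0], x[1]), (x[2], x[3]), ...
+     d = x.shape[-1]
+     theta = base ** (-torch.arange(0, d, 2, dtype=torch.float32) / d)  # one angle per pair
+     cos, sin = torch.cos(m * theta), torch.sin(m * theta)
+     x1, x2 = x[..., 0::2], x[..., 1::2]
+     out = torch.empty_like(x)
+     out[..., 0::2] = x1 * cos - x2 * sin   # 2D rotation applied to each pair
+     out[..., 1::2] = x1 * sin + x2 * cos
+     return out
+
+ q, k = torch.randn(8), torch.randn(8)
+ # the dot product only depends on the relative position m - n
+ torch.testing.assert_close(rotate_pairs(q, 7) @ rotate_pairs(k, 3),
+                            rotate_pairs(q, 4) @ rotate_pairs(k, 0))
+ </d-code>
+ <p>The last check is exactly the relative-position property we asked for above.</p>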
761
 
762
+ <p>There are combinatorially many ways we could pair elements, but two options are by far the most popular for implementing RoPE: we call them the <em>interleaved</em> and <em>non-interleaved</em> versions. (It's still rather unfortunate to have two popular options.)</p>
763
 
764
  <ol>
765
  <li>In the interleaved version, we pair consecutive elements <em>(x<sub>0</sub>, x<sub>1</sub>),(x<sub>2</sub>,x<sub>3</sub>),…</em> before applying the rotation matrix:</li>
 
922
  <h2>References</h2>
923
 
924
  <ul>
925
+ <li>Harm's posts:
926
  <ul>
927
  <li><a href="https://www.harmdevries.com/post/context-length/">https://www.harmdevries.com/post/context-length/</a></li>
928
  <li><a href="https://www.harmdevries.com/post/model-size-vs-compute-overhead/">https://www.harmdevries.com/post/model-size-vs-compute-overhead/</a></li>
929
  </ul>
930
  </li>
931
+ <li>Stas' guides:
932
  <ul>
933
  <li><a href="https://github.com/stas00/ml-engineering">https://github.com/stas00/ml-engineering</a></li>
934
  <li><a href="https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/chronicles.md">https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/chronicles.md</a></li>
src/memory.js CHANGED
@@ -8,27 +8,35 @@ export function activationMemory(
8
  L, // number of layers
9
  s, // sequence length
10
  v, // vocab size
 
11
  mixed = true,
12
  recomputation = "none",
13
  ff_activation = "relu"
14
  ) {
15
- console.log('activationMemory called with:', { a, b, h, h_ff, L, s, v, mixed, recomputation, ff_activation });
16
  // https://arxiv.org/pdf/2205.05198
17
  const bytesPerValue = mixed ? 2 : 4;
18
 
19
- const oneLayerAttention = s * b * h * (bytesPerValue * 5 + 1) + ((2 * bytesPerValue + 1) * a * s * s * b); // eq (2)
 
 
 
 
 
 
 
20
 
21
  let oneLayerFeedforward;
22
  if (ff_activation === "relu") {
23
- oneLayerFeedforward = (s * b * h * bytesPerValue + (s * b * h_ff * bytesPerValue) // inputs of 1st/2nd linear layers
24
  + s * b * h); // dropout
25
  } else if (ff_activation === "gelu") {
26
- oneLayerFeedforward = (s * b * h * bytesPerValue + (s * b * h_ff * bytesPerValue) // inputs of 1st/2nd linear layers
27
- + s * b * h_ff * bytesPerValue // inputs of activation function (not really necessary for Relu)
28
  + s * b * h); // dropout
29
  } else if (ff_activation === "swiglu") {
30
- oneLayerFeedforward = (s * b * h * bytesPerValue + (s * b * h_ff * bytesPerValue) // inputs of input/output linear layers
31
- + s * b * h_ff * bytesPerValue * 3 // inputs of activation function
32
  + s * b * h); // dropout (note that dropout is lower-precision - boolean)
33
  }
34
 
@@ -41,41 +49,56 @@ export function activationMemory(
41
 
42
 
43
  let oneLayer;
44
- if (recomputation === "none") {
45
- oneLayer = oneLayerAttention + oneLayerFeedforward + 2 * layerNorm; // eq (2)
46
- } else if (recomputation === "selective") {
47
- oneLayer = s * b * h * 34; // eq (6)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  } else if (recomputation === "full") {
49
- oneLayer = s * b * h * 2;
50
- } else {
 
 
 
 
 
 
 
 
 
51
  throw new Error("Invalid recomputation value");
52
  }
53
 
54
- const data = {
55
- name: "activationMemory",
56
- children: [
57
- ...Array.from({ length: L }, (_, index) => ({
58
- name: `Layer ${index + 1}`,
59
- children: [
60
- { name: 'Attention', value: oneLayerAttention },
61
- { name: 'Feedforward', value: oneLayerFeedforward },
62
- { name: 'LayerNorm', value: 2 * layerNorm },
63
- ]
64
- })),
65
- { name: 'Dropout', value: inputDropout },
66
- { name: 'LayerNorm', value: outputLayerNorm },
67
- { name: 'Projection', value: outputLayerProjection },
68
- { name: 'Cross Entropy', value: outputCrossEntropy }
69
- ]
70
- };
71
 
72
- const total = L * oneLayer + inputDropout + outputLayerNorm + outputLayerProjection + outputCrossEntropy;
73
 
74
  return data;
75
  }
76
 
77
- export function paramGradsOpt(h, L, s, v, k = 8, mixed = true) {
78
- console.log('paramGradsOpt called with:', { h, L, s, v, k, mixed });
 
 
 
 
 
 
 
 
79
  const emb = h * (v + s);
80
  const oneLayer = 12 * h ** 2 + 13 * h;
81
  const other = 2 * h;
@@ -87,9 +110,16 @@ export function paramGradsOpt(h, L, s, v, k = 8, mixed = true) {
87
  }
88
  const bytesPerParameter = mixed ? 2 : 4;
89
 
90
- const result = [bytesPerParameter * n, bytesPerParameter * n, k * n];
91
- console.log('paramGradsOpt result:', result);
92
- return result;
 
 
 
 
 
 
 
93
  }
94
 
95
  export function updateGraph() {
@@ -101,15 +131,18 @@ export function updateGraph() {
101
  const L = +document.getElementById('L').value;
102
  const s = +document.getElementById('s').value;
103
  const v = +document.getElementById('v').value;
 
 
 
 
104
  const mixed = document.getElementById('mixed').checked;
105
  const recomputation = document.getElementById('recomputation').value;
106
  const ff_activation = document.getElementById('ff_activation').value;
107
 
108
- console.log('Slider values:', { a, b, h, h_ff, L, s, v, mixed, recomputation, ff_activation });
109
 
110
- const fixedSize100GB = 100 * 1024 * 1024 * 1024; // 100GB in bytes
111
- const activationMemoryData = activationMemory(a, b, h, h_ff, L, s, v, mixed, recomputation, ff_activation);
112
- const paramGradsOptValue = paramGradsOpt(h, L, s, v)[0];
113
 
114
  const data = {
115
  name: "root",
@@ -119,7 +152,7 @@ export function updateGraph() {
119
  value: 0,
120
  children: [
121
  activationMemoryData,
122
- { name: 'paramGradsOpt', value: paramGradsOptValue }
123
  ]
124
  }
125
  ]
@@ -147,9 +180,11 @@ export function updateGraph() {
147
  .sum(d => d.value);
148
  // .sort((a, b) => b.value - a.value);
149
 
150
- if (root.children[0].value < fixedSize100GB) {
151
- root.children[0].value = fixedSize100GB;
152
- }
 
 
153
 
154
  console.log('Treemap root:', root);
155
 
@@ -157,16 +192,20 @@ export function updateGraph() {
157
 
158
  const color = d => {
159
  switch(d.data.name) {
160
- case 'paramGradsOpt': return '#4e79a7'; // Blue
 
 
161
  case 'activationMemory': return '#f28e2c'; // Orange
162
  case 'fixed100GB': return '#59a14f'; // Green
163
  case 'Attention': return '#e15759'; // Red
164
- case 'Feedforward': return '#f28e2c'; // Orange
165
- case 'LayerNorm': return '#9b59b6'; // Purple
166
- case 'Dropout': return '#e15759'; // Red
167
- case 'Projection': return '#f28e2c'; // Orange
168
- case 'Cross Entropy': return '#e15759'; // Red
169
- default: return '#59a14f'; // Red (for unexpected cases)
 
 
170
  }
171
  };
172
 
@@ -178,7 +217,7 @@ export function updateGraph() {
178
  cell.append("rect")
179
  .attr("width", d => d.x1 - d.x0)
180
  .attr("height", d => d.y1 - d.y0)
181
- .attr("fill", d => d.depth === 1 ? "none" : color(d))
182
  .attr("stroke", d => d.depth === 1 ? color(d) : "none")
183
  .attr("stroke-width", 2);
184
 
@@ -196,8 +235,7 @@ export function updateGraph() {
196
  const name = d.data.name;
197
  const value = formatBytes(d.value);
198
 
199
- if (d.depth === 1) {
200
- // Parent node (fixed100GB)
201
  node.attr("transform", `translate(${padding},${fontSize + padding})`)
202
  .attr("font-weight", "bold")
203
  .text(`${name}: ${value}`);
@@ -235,8 +273,8 @@ export function updateGraph() {
235
  .attr("x", 0)
236
  .attr("width", 19)
237
  .attr("height", 19)
238
- .attr("fill", d => d.data.name === 'fixed100GB' ? 'none' : color(d))
239
- .attr("stroke", d => d.data.name === 'fixed100GB' ? color(d) : 'none')
240
  .attr("stroke-width", 2);
241
 
242
  legend.append("text")
@@ -256,10 +294,10 @@ function formatBytes(bytes) {
256
  }
257
 
258
  const presets = {
259
- "Tiny": { a: 16, b: 3, h: 1024, h_ff: 4096, L: 1, s: 7, v: 30522, mixed: true, recomputation: "none", ff_activation: "gelu" },
260
- "8B": { a: 32, b: 32, h: 4096, h_ff: 16384, L: 32, s: 256, v: 30522, mixed: true, recomputation: "none", ff_activation: "swiglu" },
261
- "70B": { a: 64, b: 32, h: 8192, h_ff: 32768, L: 80, s: 256, v: 30522, mixed: true, recomputation: "none", ff_activation: "swiglu" },
262
- "405B": { a: 128, b: 32, h: 16384, h_ff: 65536, L: 126, s: 256, v: 30522, mixed: true, recomputation: "none", ff_activation: "swiglu" }
263
  };
264
 
265
  function setPresetValues(preset) {
@@ -306,41 +344,80 @@ function syncSliderAndInput(sliderId, inputId) {
306
  }
307
 
308
  export const init_memory_plot = function () {
309
- console.log('DOM fully loaded and parsed');
310
 
311
- const sliderIds = ['a', 'b', 'h', 'h_ff', 'L', 's', 'v']; // Added 'v'
312
  sliderIds.forEach(id => {
313
- syncSliderAndInput(id, `${id}_input`);
 
 
 
 
 
 
314
  });
315
 
316
  const recomputationSelect = document.getElementById('recomputation');
317
- recomputationSelect.addEventListener('change', updateGraph);
 
 
 
 
318
 
319
  const ffActivationSelect = document.getElementById('ff_activation');
320
- ffActivationSelect.addEventListener('change', updateGraph);
 
 
 
 
321
 
322
  const mixedCheckbox = document.getElementById('mixed');
323
- mixedCheckbox.addEventListener('change', updateGraph);
 
 
 
 
324
 
325
  const presetSelect = document.getElementById('presets');
326
- presetSelect.addEventListener('change', (event) => {
327
- setPresetValues(event.target.value);
328
- });
 
 
 
 
329
 
330
- // Set max values for sliders based on the highest values in the presets
331
- document.getElementById('a').max = 128;
332
- document.getElementById('b').max = 53248;
333
- document.getElementById('h').max = 16384;
334
- document.getElementById('h_ff').max = 65536;
335
- document.getElementById('L').max = 126;
336
- document.getElementById('s').max = 128000;
337
- document.getElementById('v').max = 100000; // Set a reasonable max for vocabulary size
 
 
 
 
 
 
 
 
 
 
 
 
338
 
339
  console.log('Adding svg');
340
- const svg = d3.select("#graph")
341
- .append("svg")
342
- .attr("width", 960)
343
- .attr("height", 500);
 
 
 
 
 
344
 
345
  updateGraph();
346
  };
 
8
  L, // number of layers
9
  s, // sequence length
10
  v, // vocab size
11
+ tp = 1, // tensor model parallelism
12
  mixed = true,
13
  recomputation = "none",
14
  ff_activation = "relu"
15
  ) {
16
+ console.log('activationMemory called with:', { a, b, h, h_ff, L, s, v, tp, mixed, recomputation, ff_activation });
17
  // https://arxiv.org/pdf/2205.05198
18
  const bytesPerValue = mixed ? 2 : 4;
19
 
20
+ let oneLayerAttention;
21
+ if (recomputation === "none" || recomputation === "full") {
22
+ oneLayerAttention = s * b * h * (bytesPerValue * 4 / tp + bytesPerValue + 1) + ((2 * bytesPerValue + 1) * a * s * s * b); // eq (2)
23
+ } else if (recomputation === "selective") {
24
+ oneLayerAttention = s * b * h * (bytesPerValue * 4 / tp + bytesPerValue + 1); // table 2
25
+ } else {
26
+ throw new Error("Invalid recomputation value");
27
+ }
28
 
29
  let oneLayerFeedforward;
30
  if (ff_activation === "relu") {
31
+ oneLayerFeedforward = (s * b * h * bytesPerValue + (s * b * h_ff * bytesPerValue / tp) // inputs of 1st/2nd linear layers
32
  + s * b * h); // dropout
33
  } else if (ff_activation === "gelu") {
34
+ oneLayerFeedforward = (s * b * h * bytesPerValue + (s * b * h_ff * bytesPerValue / tp) // inputs of 1st/2nd linear layers
35
+ + s * b * h_ff * bytesPerValue / tp // inputs of activation function (not really necessary for Relu)
36
  + s * b * h); // dropout
37
  } else if (ff_activation === "swiglu") {
38
+ oneLayerFeedforward = (s * b * h * bytesPerValue + (s * b * h_ff * bytesPerValue / tp) // inputs of input/output linear layers
39
+ + s * b * h_ff * bytesPerValue * 3 / tp // inputs of activation function
40
  + s * b * h); // dropout (note that dropout is lower-precision - boolean)
41
  }
42
 
 
49
 
50
 
51
  let oneLayer;
52
+ let data;
53
+ if (recomputation === "none" || recomputation === "selective") {
54
+
55
+ data = {
56
+ name: "activationMemory",
57
+ children: [
58
+ ...Array.from({ length: L }, (_, index) => ({
59
+ name: `Layer ${index + 1}`,
60
+ children: [
61
+ { name: 'Attention', value: oneLayerAttention },
62
+ { name: 'Feedforward', value: oneLayerFeedforward },
63
+ { name: 'LayerNorm', value: 2 * layerNorm },
64
+ ]
65
+ })),
66
+ { name: 'Dropout', value: inputDropout },
67
+ { name: 'LayerNorm', value: outputLayerNorm },
68
+ { name: 'Projection', value: outputLayerProjection },
69
+ { name: 'Cross Entropy', value: outputCrossEntropy }
70
+ ]
71
+ };
72
  } else if (recomputation === "full") {
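+ // with full recomputation, only each layer's input activation (s * b * h * bytesPerValue) needs to be kept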
73
+ data = {
74
+ name: "activationMemory",
75
+ children: [
76
+ { name: 'LayerInput', value: s * b * h * bytesPerValue * L},
77
+ { name: 'Dropout', value: inputDropout },
78
+ { name: 'LayerNorm', value: outputLayerNorm },
79
+ { name: 'Projection', value: outputLayerProjection },
80
+ { name: 'Cross Entropy', value: outputCrossEntropy }
81
+ ]
82
+ };
83
+ } else {
84
  throw new Error("Invalid recomputation value");
85
  }
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
 
88
 
89
  return data;
90
  }
91
 
92
+ export function paramGradsOpt(h, L, s, v, k = 8, dp = 1, zero = "Optimizer", mixed = true) {
93
+ // h, # hidden dimension size
94
+ // L, # number of layers
95
+ // s, # sequence length
96
+ // v, # vocab size
97
+ // k=8, # parameters for optimizer (Adam: 8 = 4 bytes moments + 4 bytes variance)
98
+ // dp=1, # data parallelism
99
+ // zero = "Optimizer", # zero data parallelism
100
+ // mixed=True # mixed precision training
101
+ console.log('paramGradsOpt called with:', { h, L, s, v, k, dp, zero, mixed });
102
  const emb = h * (v + s);
103
  const oneLayer = 12 * h ** 2 + 13 * h;
104
  const other = 2 * h;
 
110
  }
111
  const bytesPerParameter = mixed ? 2 : 4;
112
 
113
+ const data = {
114
+ name: "ParametersGradientOps",
115
+ children: [
116
+ { name: 'Parameters', value: zero === "Parameters" ? bytesPerParameter * n / dp : bytesPerParameter * n },
117
+ { name: 'Gradients', value: zero === "Gradients" ? bytesPerParameter * n / dp : bytesPerParameter * n },
118
+ { name: 'OptimizerAverages', value: zero === "Optimizer" ? k * n / dp : k * n }
119
+ ]
120
+ };
121
+ console.log('paramGradsOpt result:', data);
122
+ return data;
123
  }
124
 
125
  export function updateGraph() {
 
131
  const L = +document.getElementById('L').value;
132
  const s = +document.getElementById('s').value;
133
  const v = +document.getElementById('v').value;
134
+ const k = +document.getElementById('k').value;
135
+ const tp = +document.getElementById('tp').value; // tensor parallelism degree
136
+ const zero = document.getElementById('zero').value;
137
+ const dp = +document.getElementById('dp').value; // data parallelism degree
138
  const mixed = document.getElementById('mixed').checked;
139
  const recomputation = document.getElementById('recomputation').value;
140
  const ff_activation = document.getElementById('ff_activation').value;
141
 
142
+ console.log('Slider values:', { a, b, h, h_ff, L, s, v, k, tp, zero, dp, mixed, recomputation, ff_activation });
143
 
144
+ const activationMemoryData = activationMemory(a, b, h, h_ff, L, s, v, tp, mixed, recomputation, ff_activation);
145
+ const paramGradsOptValue = paramGradsOpt(h, L, s, v, k, dp, zero, mixed);
 
146
 
147
  const data = {
148
  name: "root",
 
152
  value: 0,
153
  children: [
154
  activationMemoryData,
155
+ paramGradsOptValue
156
  ]
157
  }
158
  ]
 
180
  .sum(d => d.value);
181
  // .sort((a, b) => b.value - a.value);
182
 
183
+ // const fixedSize100GB = 100 * 1024 * 1024 * 1024; // 100GB in bytes
184
+ // if (root.children[0].value < fixedSize100GB) {
185
+ // root.value = fixedSize100GB;
186
+ // root.children[0].value = fixedSize100GB;
187
+ // }
188
 
189
  console.log('Treemap root:', root);
190
 
 
192
 
193
  const color = d => {
194
  switch(d.data.name) {
195
+ case 'Parameters': return '#4e79a7'; // Blue
196
+ case 'Gradients': return '#f28e2c'; // Orange
197
+ case 'OptimizerAverages': return '#e15759'; // Red
198
  case 'activationMemory': return '#f28e2c'; // Orange
199
  case 'fixed100GB': return '#59a14f'; // Green
200
  case 'Attention': return '#e15759'; // Red
201
+ case 'Feedforward': return '#1f77b4'; // Light Blue
202
+ case 'LayerNorm': return '#ff7f0e'; // Dark Orange
203
+ case 'Dropout': return '#2ca02c'; // Dark Green
204
+ case 'Projection': return '#d62728'; // Dark Red
205
+ case 'Cross Entropy': return '#9467bd'; // Violet
206
+ case 'Total': return '#59a14f'; // Green
207
+ case 'root': return '#d3d3d3'; // Light Grey
208
+ default: return '#a0c4ff'; // Lighter Blue (for unexpected cases)
209
  }
210
  };
211
 
 
217
  cell.append("rect")
218
  .attr("width", d => d.x1 - d.x0)
219
  .attr("height", d => d.y1 - d.y0)
220
+ .attr("fill", d => color(d))
221
  .attr("stroke", d => d.depth === 1 ? color(d) : "none")
222
  .attr("stroke-width", 2);
223
 
 
235
  const name = d.data.name;
236
  const value = formatBytes(d.value);
237
 
238
+ if (d.depth === 1 || d.depth === 2) {
 
239
  node.attr("transform", `translate(${padding},${fontSize + padding})`)
240
  .attr("font-weight", "bold")
241
  .text(`${name}: ${value}`);
 
273
  .attr("x", 0)
274
  .attr("width", 19)
275
  .attr("height", 19)
276
+ .attr("fill", d => color(d))
277
+ .attr("stroke", 'grey')
278
  .attr("stroke-width", 2);
279
 
280
  legend.append("text")
 
294
  }
295
 
296
  const presets = {
297
+ "Llama 3 Tiny": { a: 16, b: 3, h: 1024, h_ff: 4096, L: 1, s: 7, v: 30522, k: 8, tp: 1, zero: "Optimizer", dp: 1, mixed: true, recomputation: "none", ff_activation: "gelu" },
298
+ "Llama 3 8B": { a: 32, b: 32, h: 4096, h_ff: 16384, L: 32, s: 256, v: 30522, k: 8, tp: 1, zero: "Optimizer", dp: 1, mixed: true, recomputation: "none", ff_activation: "swiglu" },
299
+ "Llama 3 70B": { a: 64, b: 32, h: 8192, h_ff: 32768, L: 80, s: 256, v: 30522, k: 8, tp: 1, zero: "Optimizer", dp: 1, mixed: true, recomputation: "none", ff_activation: "swiglu" },
300
+ "Llama 3 405B": { a: 128, b: 32, h: 16384, h_ff: 65536, L: 126, s: 256, v: 30522, k: 8, t: 1, mixed: true, recomputation: "none", ff_activation: "swiglu" }
301
  };
302
 
303
  function setPresetValues(preset) {
 
344
  }
345
 
346
  export const init_memory_plot = function () {
347
+ console.log('Initializing memory plot');
348
 
349
+ const sliderIds = ['a', 'b', 'h', 'h_ff', 'L', 's', 'v', 'k', 'tp', 'dp'];
350
  sliderIds.forEach(id => {
351
+ const slider = document.getElementById(id);
352
+ const input = document.getElementById(`${id}_input`);
353
+ if (slider && input) {
354
+ syncSliderAndInput(id, `${id}_input`);
355
+ } else {
356
+ console.warn(`Elements for ${id} not found`);
357
+ }
358
  });
359
 
360
  const recomputationSelect = document.getElementById('recomputation');
361
+ if (recomputationSelect) {
362
+ recomputationSelect.addEventListener('change', updateGraph);
363
+ } else {
364
+ console.warn('Recomputation select not found');
365
+ }
366
 
367
  const ffActivationSelect = document.getElementById('ff_activation');
368
+ if (ffActivationSelect) {
369
+ ffActivationSelect.addEventListener('change', updateGraph);
370
+ } else {
371
+ console.warn('FF Activation select not found');
372
+ }
373
 
374
  const mixedCheckbox = document.getElementById('mixed');
375
+ if (mixedCheckbox) {
376
+ mixedCheckbox.addEventListener('change', updateGraph);
377
+ } else {
378
+ console.warn('Mixed checkbox not found');
379
+ }
380
 
381
  const presetSelect = document.getElementById('presets');
382
+ if (presetSelect) {
383
+ presetSelect.addEventListener('change', (event) => {
384
+ setPresetValues(event.target.value);
385
+ });
386
+ } else {
387
+ console.warn('Preset select not found');
388
+ }
389
 
390
+ // Set max values for sliders
391
+ sliderIds.forEach(id => {
392
+ const slider = document.getElementById(id);
393
+ if (slider) {
394
+ switch(id) {
395
+ case 'a': slider.max = '128'; break;
396
+ case 'b': slider.max = '53248'; break;
397
+ case 'h': slider.max = '16384'; break;
398
+ case 'h_ff': slider.max = '65536'; break;
399
+ case 'L': slider.max = '126'; break;
400
+ case 's': slider.max = '128000'; break;
401
+ case 'v': slider.max = '100000'; break;
402
+ case 'k': slider.max = '16'; break;
403
+ case 'tp': slider.max = '16'; break;
404
+ case 'dp': slider.max = '256'; break;
405
+ }
406
+ } else {
407
+ console.warn(`Slider ${id} not found`);
408
+ }
409
+ });
410
 
411
  console.log('Adding svg');
412
+ const graphContainer = document.getElementById('graph');
413
+ if (graphContainer) {
414
+ const svg = d3.select("#graph")
415
+ .append("svg")
416
+ .attr("width", 960)
417
+ .attr("height", 500);
418
+ } else {
419
+ console.warn('Graph container not found');
420
+ }
421
 
422
  updateGraph();
423
  };