thomwolf (HF staff) committed
Commit ccc87e8 · verified · 1 parent: 6fa4a17
Files changed (4)
  1. dist/index.html +18 -11
  2. dist/style.css +1 -1
  3. src/index.html +18 -11
  4. src/style.css +1 -1
dist/index.html CHANGED
@@ -368,7 +368,7 @@
     \end{aligned}
 </d-math>
 
-<p>Now let’s have look how things change if we train with mixed precision<d-cite bibtex-key="micikevicius2018mixedprecisiontraining"></d-cite>. The default nowadays is for mixed precision training is BF16, requires 2 bytes per parameter and gradient as well as an additional copy of the model weights and gradients in FP32, thus 12 bytes per parameter in total. In addition to the parameters and gradient, we need to store the optimizer states: for the Adam optimizer, this requires the momentum and the variance usually stored in FP32 for numerical stability, each using 4 bytes. </p>
+<p>Now let’s have look how things change if we use a lower precision. For stability reason (see <a target="_self" href="#mixed_precision_training">the mixed-precision training section below</a>) we often don't use full low precision training but a mix of higher and lower precision called "mixed precision"<d-cite bibtex-key="micikevicius2018mixedprecisiontraining"></d-cite>. The default nowadays for mixed precision training is to generally use BF16 for most of the computations –requiring 2 bytes per parameter and gradient as well as an additional copy of the model weights and gradients in FP32, thus 12 bytes per parameter in total. In addition to the parameters and gradient, we need to store the optimizer states: for the Adam optimizer, this requires the momentum and the variance usually stored in FP32 for numerical stability, each using 4 bytes. </p>
 
 <aside>See some more details below when we cover the ZeRO methods.</aside>
 
@@ -2722,22 +2722,29 @@
 
 window.addEventListener('scroll', (_event) => {
     if (typeof (headings) != 'undefined' && headings != null && typeof (toc_links) != 'undefined' && toc_links != null) {
-        // Then iterate forwards, on the first match highlight it and break
         find_active: {
             for (let i = headings.length - 1; i >= 0; i--) {
-                if (headings[i].getBoundingClientRect().top - 50 <= 0) {
-                    if (!toc_links[i].classList.contains("active")) {
-                        toc_links.forEach((link, _index) => {
-                            link.classList.remove("active");
-                        });
-                        toc_links[i].classList.add('active');
+                const heading = headings[i];
+                // Skip headings that shouldn't be in TOC
+                if (heading.parentElement.tagName == 'D-TITLE' || heading.getAttribute('no-toc')) {
+                    continue;
+                }
+
+                if (heading.getBoundingClientRect().top - 50 <= 0) {
+                    // Find matching TOC link by href
+                    const headingId = heading.getAttribute('id');
+                    const activeLink = Array.from(toc_links).find(link =>
+                        link.getAttribute('href') === '#' + headingId
+                    );
+
+                    if (activeLink && !activeLink.classList.contains("active")) {
+                        toc_links.forEach(link => link.classList.remove("active"));
+                        activeLink.classList.add('active');
                     }
                     break find_active;
                 }
             }
-            toc_links.forEach((link, _index) => {
-                link.classList.remove("active");
-            });
+            toc_links.forEach(link => link.classList.remove("active"));
         }
     }
 });
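As a side note on the reworded paragraph above, here is a minimal sketch (illustrative only, not part of the commit) of the per-parameter memory accounting it describes; the byte counts are taken straight from the paragraph, while the 8B model size is a hypothetical example for scale:

// Illustrative only (not part of this commit): per-parameter memory for the
// mixed-precision + Adam setup described in the paragraph above.
const bytesPerParam = {
    bf16Weights: 2,   // BF16 copy of the parameters
    bf16Grads: 2,     // BF16 gradients
    fp32Weights: 4,   // FP32 master copy of the parameters
    fp32Grads: 4,     // FP32 copy of the gradients
    adamMomentum: 4,  // Adam first moment, kept in FP32
    adamVariance: 4,  // Adam second moment, kept in FP32
};

const total = Object.values(bytesPerParam).reduce((a, b) => a + b, 0);
const numParams = 8e9;  // hypothetical 8B-parameter model, just for scale
console.log(`${total} bytes per parameter`);                         // 20 bytes per parameter
console.log(`~${(total * numParams / 1e9).toFixed(0)} GB in total`); // ~160 GB

Summing the entries gives the 12 bytes quoted for weights plus gradients and another 8 bytes of Adam state, i.e. 20 bytes per trained parameter.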
dist/style.css CHANGED
@@ -188,7 +188,7 @@ toggle-icon.collapsed {
 .toc-content {
     margin-top: 15px;
     overflow: hidden;
-    max-height: 1000px;
+    /* max-height: 1000px; */
     transition: max-height 0.3s ease-out;
 }
 
src/index.html CHANGED
@@ -368,7 +368,7 @@
     \end{aligned}
 </d-math>
 
-<p>Now let’s have look how things change if we train with mixed precision<d-cite bibtex-key="micikevicius2018mixedprecisiontraining"></d-cite>. The default nowadays is for mixed precision training is BF16, requires 2 bytes per parameter and gradient as well as an additional copy of the model weights and gradients in FP32, thus 12 bytes per parameter in total. In addition to the parameters and gradient, we need to store the optimizer states: for the Adam optimizer, this requires the momentum and the variance usually stored in FP32 for numerical stability, each using 4 bytes. </p>
+<p>Now let’s have look how things change if we use a lower precision. For stability reason (see <a target="_self" href="#mixed_precision_training">the mixed-precision training section below</a>) we often don't use full low precision training but a mix of higher and lower precision called "mixed precision"<d-cite bibtex-key="micikevicius2018mixedprecisiontraining"></d-cite>. The default nowadays for mixed precision training is to generally use BF16 for most of the computations –requiring 2 bytes per parameter and gradient as well as an additional copy of the model weights and gradients in FP32, thus 12 bytes per parameter in total. In addition to the parameters and gradient, we need to store the optimizer states: for the Adam optimizer, this requires the momentum and the variance usually stored in FP32 for numerical stability, each using 4 bytes. </p>
 
 <aside>See some more details below when we cover the ZeRO methods.</aside>
 
@@ -2772,22 +2772,29 @@
 
 window.addEventListener('scroll', (_event) => {
     if (typeof (headings) != 'undefined' && headings != null && typeof (toc_links) != 'undefined' && toc_links != null) {
-        // Then iterate forwards, on the first match highlight it and break
         find_active: {
             for (let i = headings.length - 1; i >= 0; i--) {
-                if (headings[i].getBoundingClientRect().top - 50 <= 0) {
-                    if (!toc_links[i].classList.contains("active")) {
-                        toc_links.forEach((link, _index) => {
-                            link.classList.remove("active");
-                        });
-                        toc_links[i].classList.add('active');
+                const heading = headings[i];
+                // Skip headings that shouldn't be in TOC
+                if (heading.parentElement.tagName == 'D-TITLE' || heading.getAttribute('no-toc')) {
+                    continue;
+                }
+
+                if (heading.getBoundingClientRect().top - 50 <= 0) {
+                    // Find matching TOC link by href
+                    const headingId = heading.getAttribute('id');
+                    const activeLink = Array.from(toc_links).find(link =>
+                        link.getAttribute('href') === '#' + headingId
+                    );
+
+                    if (activeLink && !activeLink.classList.contains("active")) {
+                        toc_links.forEach(link => link.classList.remove("active"));
+                        activeLink.classList.add('active');
                     }
                     break find_active;
                 }
             }
-            toc_links.forEach((link, _index) => {
-                link.classList.remove("active");
-            });
+            toc_links.forEach(link => link.classList.remove("active"));
         }
     }
 });
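A brief note on the scroll-handler change shown in both index.html files: the old code assumed headings[i] and toc_links[i] were aligned one-to-one and highlighted the link by index. The new code first skips headings that have no TOC entry (those inside a d-title element or carrying a no-toc attribute) and then finds the active link by matching each link's href against the heading's id, so the highlight stays correct even when some headings are excluded from the table of contents. The final toc_links.forEach(...) still clears all highlights when no heading has scrolled past the threshold.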
src/style.css CHANGED
@@ -188,7 +188,7 @@ toggle-icon.collapsed {
 .toc-content {
     margin-top: 15px;
     overflow: hidden;
-    max-height: 1000px;
+    /* max-height: 1000px; */
    transition: max-height 0.3s ease-out;
 }
 
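On the style.css change: commenting out the fixed max-height: 1000px removes the 1000px cap on .toc-content, so a table of contents taller than that is no longer clipped when expanded. Since the transition: max-height rule is kept, the collapse animation presumably relies on a max-height value being set elsewhere (for example from the page's script); that part is an assumption, not something visible in this diff.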