leandro commited on
Commit
59b2635
1 Parent(s): c39ea01

initial draft

Browse files
Files changed (2) hide show
  1. README.md +3 -3
  2. app.py +136 -0
README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
- title: Train Llm
3
- emoji: 🏃
4
- colorFrom: pink
5
  colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 4.26.0
 
1
  ---
2
+ title: Harm Space
3
+ emoji:
4
+ colorFrom: gray
5
  colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 4.26.0
app.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import matplotlib
3
+ matplotlib.use('Agg')
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ from matplotlib.ticker import MultipleLocator
7
+
8
# Intro text shown next to the plot. (Fixed typo: "training comput" -> "training compute".)
HARM_INTRO = """
The Chinchilla scaling laws focus on optimally scaling training compute but often we also care about inference cost.
This tool follows [Harm de Vries' blog post](https://www.harmdevries.com/post/model-size-vs-compute-overhead/) and visualizes the tradeoff between training compute and inference cost (i.e. model size).
"""

### GPU specs:
A100_flops = 312e12  # peak dense FLOP/s assumed for an A100
H100_flops = 990e12  # peak dense FLOP/s assumed for an H100

### CHINCHILLA PARAMS (fitted loss-law coefficients from the Chinchilla paper):
E = 1.62
A = 406.4
B = 410.7
alpha = 0.336
beta = 0.283

Bn = 10**9  # one billion — converts B-parameters / B-tokens to absolute counts

# G balances the parameter and data terms of the loss law; used by n_opt/d_opt.
G = ((alpha*A)/(beta*B))**(1/(alpha+beta))
27
+
28
+ ### FUNCTIONS
29
def to_flops(N, D):
    """Total training compute in FLOPs for N parameters on D tokens (C = 6*N*D)."""
    flops_per_param_token = 6
    return flops_per_param_token * N * D
31
+
32
def n_opt(C):
    """Chinchilla-optimal parameter count for a training-compute budget C (FLOPs)."""
    exponent = beta / (alpha + beta)
    return G * (C / 6) ** exponent
34
+
35
def d_opt(C):
    """Chinchilla-optimal token count for a training-compute budget C (FLOPs)."""
    exponent = alpha / (alpha + beta)
    return (C / 6) ** exponent / G
37
+
38
def compute_kd(kn):
    """Data-size fraction kd (D/D_opt) that keeps the loss equal, given the
    model-size fraction kn (N/N_opt) — the iso-loss relation from Harm's post."""
    scale = (A / B) * G ** (-alpha - beta)
    inner = 1 - (kn ** -alpha - 1) * scale
    return inner ** (1 / -beta)
42
+
43
def compute_overhead(kn, kd):
    """Fractional extra training compute vs. the Chinchilla-optimal run
    (0 at the optimum, since C scales with N*D)."""
    return (kn * kd) - 1
45
+
46
### PRECOMPUTE CURVE:
# Overhead (%) as a function of the model-size fraction, sampled once at import
# time so plot_curve() only has to draw it.
kn_min = 0.18
kn_max = 2

kns = np.linspace(kn_min, kn_max, 100)
overheads = [compute_overhead(kn, compute_kd(kn)) * 100 for kn in kns]
55
+
56
def plot_curve(kn, kd):
    """Draw the compute-overhead curve with markers for the user's configuration
    and for the Chinchilla optimum; returns the matplotlib figure."""
    fig, ax = plt.subplots(dpi=200, figsize=(5, 3))
    # Precomputed trade-off curve (drawn first, under the markers).
    ax.plot(kns, overheads, color="black", zorder=1)
    # Marker for the configuration currently entered by the user.
    ax.scatter([kn], [compute_overhead(kn, kd) * 100], s=100, marker="o", c="red", label="You are here!", zorder=2)
    # Reference point: the Chinchilla-optimal model, i.e. zero overhead.
    ax.scatter([1.0], [0.0], marker="o", s=100, c="blue", label="Chinchilla optimal", zorder=2)
    ax.set_xlabel("Fraction of Chinchilla optimal model size")
    ax.set_ylabel("Compute overhead (%)")
    ax.legend(loc="best")
    ax.grid(True, which="both")
    ax.grid(True, which="minor", alpha=0.5)
    ax.yaxis.set_minor_locator(MultipleLocator(10))
    fig.tight_layout()
    return fig
70
+
71
+
72
def compute(N, D, gpu_type, gpu_util, n_gpus, gpu_price):
    """Build the training-summary markdown and the Harm's-law figure.

    N: model size in billions of parameters.
    D: dataset size in billions of tokens.
    gpu_type: "A100" or "H100" (anything else falls back to A100 specs).
    gpu_util: GPU utilization in percent (0-100), as entered in the UI.
    n_gpus: number of GPUs in the cluster.
    gpu_price: price per GPU-hour in dollars.
    Returns (markdown_text, matplotlib_figure).
    """
    # Training compute and the Chinchilla-optimal model/dataset for that budget.
    C = to_flops(N * Bn, D * Bn)
    N_opt = n_opt(C)
    D_opt = d_opt(C)

    # Fractions of the optimum implied by the user's chosen model size.
    kn = Bn*N/N_opt
    kd = compute_kd(kn)

    fig = plot_curve(kn, kd)

    # BUG FIX: gpu_util from the UI was previously ignored (hard-coded to 0.5).
    # The UI supplies a percentage, so convert it to a fraction here.
    gpu_util = gpu_util / 100
    if gpu_type=="H100":
        gpu_flops = H100_flops * gpu_util
    else:
        gpu_flops = A100_flops * gpu_util
    gpu_hours = (C / (gpu_flops * 3600))

    # BUG FIX: C is in FLOPs (6*N*D with absolute counts), not TFLOPs — label corrected.
    text = f"""\
## Training summary

|Training compute| Training cost | Training time | Total GPU hours |
|:----|:-------|:-------|:-------|
|{C:.2E} FLOPs | ${(gpu_hours * gpu_price)/1e6:.2f}M | {gpu_hours/(24*n_gpus):.2f} days | {gpu_hours/1_000_000:.2f}M |

## Chinchilla and Training/Inference Trade-off
Optimal model/dataset size for training compute and how it translates to training overhead and inference savings according to Harm's law
|Chinchilla optimal model | Chinchilla optimal dataset | Training overhead | Inference savings|
|:----|:-------|:----|:-------|
| {N_opt/Bn:.2f}B parameters | {D_opt/Bn:.2f}B tokens | {100*compute_overhead(kn, kd):.2f}%| {100 - kn*100:.2f}% |
"""

    return text, fig
107
+
108
# Gradio UI. Component creation order determines the page layout, so the
# statement order below is load-bearing.
with gr.Blocks() as demo:
    gr.Markdown("# LLM training calculator")

    gr.Markdown("## Training configuration")
    with gr.Row():

        # Model/dataset sizes are entered in billions; compute() rescales via Bn.
        N = gr.Number(value=7, label="Model size (in B parameters):")
        D = gr.Number(value=2000, label="Dataset size (in B tokens):")

    gr.Markdown("## Cluster configuration")
    with gr.Row():
        n_gpus = gr.Number(value=1000, label="Number of GPUs")
        gpu_type = gr.Dropdown(choices=["A100", "H100"], value="H100", label="GPU type")
        # NOTE(review): entered as a percentage, but compute() currently overrides
        # it with a hard-coded 0.5 — this input has no effect; confirm intent.
        gpu_util = gr.Number(value=50, label="% GPU utilization")
        gpu_price = gr.Number(value=3.00, label="$/GPU/Hour")
    button = gr.Button("Compute!")

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Harm's law")
            # NOTE(review): the pyplot module is passed as the initial value;
            # presumably renders the current figure — verify against gr.Plot docs.
            plot = gr.Plot(value=plt)
            gr.Markdown(HARM_INTRO)

        with gr.Column():
            # Filled by compute() with the markdown summary tables.
            md = gr.Markdown("")

    # Recompute on button press and once when the page loads.
    button.click(fn=compute, inputs=[N, D, gpu_type, gpu_util, n_gpus, gpu_price], outputs=[md, plot])
    demo.load(fn=compute, inputs=[N, D, gpu_type, gpu_util, n_gpus, gpu_price], outputs=[md, plot])
demo.launch()