Adapt my vram calc code to this UI and create a prototype
app.py (CHANGED)
@@ -31,18 +31,14 @@ def report_results():
 
     USER_TOKEN = None
     post = f"""# Model Memory Requirements\n
-
 You will need about {minimum[1]} VRAM to load this model for inference, and {minimum[3]} VRAM to train it using Adam.
 
 These calculations were measured from the [Model Memory Utility Space](https://hf.co/spaces/hf-accelerate/model-memory-utility) on the Hub.
 
 The minimum recommended vRAM needed for this model assumes using [Accelerate or `device_map="auto"`](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) and is denoted by the size of the "largest layer".
 When performing inference, expect to add up to an additional 20% to this, as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/). More tests will be performed in the future to get a more accurate benchmark for each model.
-
 When training with `Adam`, you can expect roughly 4x the reported results to be used. (1x for the model, 1x for the gradients, and 2x for the optimizer).
-
 ## Results:
-
 {results}
 """
     discussion = api.create_discussion(MODEL_NAME, "[AUTOMATED] Model Memory Requirements", description=post)
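As a rough sanity check of the 20% inference overhead and the roughly 4x Adam rule quoted in this template, a minimal back-of-the-envelope sketch; the 7B parameter count and fp16 weight width are illustrative assumptions, not values taken from this commit:

```python
# Illustrative arithmetic behind the report template above (assumed numbers).
params = 7_000_000_000   # assumed 7B-parameter model
bytes_per_param = 2      # assumed fp16/bf16 weights

weights_gb = params * bytes_per_param / 1e9
inference_gb = weights_gb * 1.2    # +20% inference overhead rule of thumb
training_gb = weights_gb * 4       # 1x model + 1x gradients + 2x optimizer states (Adam)

print(f"weights:   {weights_gb:.1f} GB")
print(f"inference: {inference_gb:.1f} GB (~20% overhead)")
print(f"training:  {training_gb:.1f} GB (Adam, ~4x the weights)")
```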
@@ -55,7 +51,123 @@ def convert_url_to_name(url:str):
         raise ValueError(f"URL {url} is not a valid model URL to the Hugging Face Hub")
     return results[0]
 
-def calculate_memory(model_name:str, library:str, options:list, access_token:str, raw=False):
+# Based on the following doc:
+#
+# - https://huggingface.co/docs/transformers/v4.31.0/perf_train_gpu_one#anatomy-of-models-memory
+# - https://blog.eleuther.ai/transformer-math/
+# - https://kipp.ly/transformer-inference-arithmetic/
+# - https://github.com/ray-project/llm-numbers
+#
+def calc_vram_f32(model, optimizer, sequence_len, micro_batch_size, device_count, gradient_checkpointing):
+    # is_16bit = cfg.bf16 or cfg.bfloat16 or cfg.load_in_8bit or cfg.fp16 or cfg.float16
+
+    # if torch.cuda.device_count() > 1 or cfg.fsdp or os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true" or cfg.adapter:
+    #     return { 'supported': False }
+
+    # Model Weights
+    #
+    # Hf doc counts:
+    #
+    # - 4 bytes * number of parameters for fp32 training
+    # - 6 bytes * number of parameters for mixed precision training (maintains a model in fp32 and one in fp16 in memory)
+    #
+    # But we follow https://blog.eleuther.ai/transformer-math/#model-weights to count 2 bytes here for mixed precision training,
+    # leave the rest to the optimizer state.
+    #
+    # Here we calculate only for fp32, will adjust for each dtype outside.
+    #
+    # for param in model.parameters():
+    #     print(f'{type(param)} {param.shape} {param.element_size()}')
+    #
+    # print(f'total parameters = {sum([param.nelement() for param in model.parameters()])}')
+
+    param_element_size = 4
+    vram_model = sum([param.nelement() * param_element_size for param in model.parameters()])
+
+    # Buffers
+    #
+    # Buffers are tensors that do not require gradients and are not registered as parameters,
+    # e.g. mean and std in batch norm layers.
+    # - https://github.com/huggingface/transformers/blob/d4bd33cc9f11ca48635e54983d75249c78d72e2a/src/transformers/modeling_utils.py#L1897
+    # - https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2
+    #
+    # for buf in model.buffers():
+    #     print(f'buf.element_size() = {buf.element_size()}')
+    vram_buffer = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])
+
+    # Optimizer States:
+    # - 8 bytes * number of parameters for normal AdamW (maintains 2 states)
+    # - 2 bytes * number of parameters for 8-bit AdamW optimizers like bitsandbytes
+    # - 4 bytes * number of parameters for optimizers like SGD with momentum (maintains only 1 state)
+    #
+    # For now we use AdamW/SGD as the baseline for the estimation, even for other more memory-efficient optimizers
+    # ADAMW_HF = "adamw_hf"
+    # ADAMW_TORCH = "adamw_torch"
+    # ADAMW_TORCH_FUSED = "adamw_torch_fused"
+    # ADAMW_TORCH_XLA = "adamw_torch_xla"
+    # ADAMW_APEX_FUSED = "adamw_apex_fused"
+    # ADAFACTOR = "adafactor"
+    # ADAMW_ANYPRECISION = "adamw_anyprecision"
+    # SGD = "sgd"
+    # ADAGRAD = "adagrad"
+    # ADAMW_BNB = "adamw_bnb_8bit"
+    # ADAMW_8BIT = "adamw_8bit"  # just an alias for adamw_bnb_8bit
+    # LION_8BIT = "lion_8bit"
+    # LION = "lion_32bit"
+    # PAGED_ADAMW = "paged_adamw_32bit"
+    # PAGED_ADAMW_8BIT = "paged_adamw_8bit"
+    # PAGED_LION = "paged_lion_32bit"
+    # PAGED_LION_8BIT = "paged_lion_8bit"
+    # optimizer = cfg.optimizer
+    optimizer_state_size_per_param = 4 if 'sgd' in optimizer else (2 if '8bit' in optimizer else 8)
+    vram_optimizer = sum([param.nelement() * optimizer_state_size_per_param for param in model.parameters()])
+
+    # Gradients
+    #
+    # 4 bytes * number of parameters for either fp32 or mixed precision training (gradients are always kept in fp32)
+    # but we will follow transformer-math to treat it conditionally outside
+    # for now we ignore whether this is mixed precision training
+    #
+    gradient_element_size = 4  # 2 if is_16bit else 4
+    vram_gradient = sum([param.nelement() * gradient_element_size for param in model.parameters()])
+
+    # Forward Activations
+    # size depends on many factors, the key ones being sequence length, hidden size and batch size.
+    s = sequence_len  # cfg.sequence_len
+    b = micro_batch_size  # cfg.micro_batch_size
+    h = model.config.hidden_size
+    L = model.config.num_hidden_layers
+    t = device_count  # max(1, torch.cuda.device_count()) # len(DataParallel(model).device_ids) # torch.cuda.device_count()
+    a = model.config.num_attention_heads
+    print(f's={s} b={b} h={h} L={L} t={t} a={a}')
+
+    sbHL = s * b * h * L
+    print(f'sbHL = {sbHL / 1e9} GB')
+
+    print(f'10 + {24 / t} + {5 * a * s / (h * t)}')
+
+    vram_activation = sbHL * (10 + 24 / t) if gradient_checkpointing else sbHL * (10 + 24 / t + 5 * a * s / (h * t))
+
+    return {
+        # 'supported': True,
+        'param_element_size': param_element_size,
+        'total': vram_model + vram_buffer + vram_optimizer + vram_activation,
+        'model': vram_model,
+        'buffer': vram_buffer,
+        'optimizer': vram_optimizer,
+        'activation': vram_activation,
+    }
+def bytes_by_dtype(bytes, dtype):
+    if dtype in ("fp16", "bf16", "float16/bfloat16"):
+        return bytes / 2
+    elif dtype == "int8":
+        return bytes / 4
+    elif dtype == "int4":
+        return bytes / 8
+    else:
+        return bytes
+
+def calculate_memory(model_name:str, library:str, dtypes:list, optimizer:str, access_token:str, raw=False):
     "Calculates the memory usage for a model"
     if library == "auto":
         library = None
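A hypothetical usage sketch of the `calc_vram_f32` helper added in this hunk, assuming it runs alongside the updated `app.py`; the checkpoint name and the settings mirror the defaults that `calculate_memory` passes further down and are assumptions for illustration:

```python
# Sketch only: build a config-initialized model and feed it to calc_vram_f32 as defined above.
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("bert-base-cased")  # assumed example checkpoint
model = AutoModel.from_config(config)                   # random weights are enough for size accounting

est = calc_vram_f32(
    model,
    optimizer="adamw_hf",        # same naming scheme as the optimizer dropdown below
    sequence_len=2048,
    micro_batch_size=1,
    device_count=1,
    gradient_checkpointing=True,
)
print(f"fp32 total:   {est['total'] / 2**30:.2f} GiB")
print(f"  model:      {est['model'] / 2**30:.2f} GiB")
print(f"  optimizer:  {est['optimizer'] / 2**30:.2f} GiB")
print(f"  activation: {est['activation'] / 2**30:.2f} GiB")
```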
@@ -82,27 +194,54 @@ def calculate_memory(model_name:str, library:str, options:list, access_token:str
     data = []
 
     title = f"Memory Usage for '{model_name}'"
-    for dtype in options:
-        dtype_total_size = total_size
-        dtype_largest_layer = largest_layer[0]
-        if dtype in ("fp16", "bf16", "float16/bfloat16"):
-            dtype_total_size /= 2
-            dtype_largest_layer /= 2
-        elif dtype == "int8":
-            dtype_total_size /= 4
-            dtype_largest_layer /= 4
-        elif dtype == "int4":
-            dtype_total_size /= 8
-            dtype_largest_layer /= 8
-        dtype_training_size = convert_bytes(dtype_total_size * 4)
-        dtype_total_size = convert_bytes(dtype_total_size)
-        dtype_largest_layer = convert_bytes(dtype_largest_layer)
-        data.append({
+
+    vram_f32 = calc_vram_f32(model, optimizer=optimizer, sequence_len=2048, micro_batch_size=1, device_count=1, gradient_checkpointing=True)
+
+    for dtype in dtypes:
+        param_element_size = bytes_by_dtype(vram_f32['param_element_size'], dtype)
+        vram_model = bytes_by_dtype(vram_f32['model'], dtype)
+        vram_buffer = vram_f32['buffer']
+        vram_optimizer = vram_f32['optimizer']
+        vram_activation = vram_f32['activation']
+        row = {
             "dtype": dtype,
-            "Largest Layer or Residual Group": dtype_largest_layer,
-            "Total Size": dtype_total_size,
-            "Training using Adam": dtype_training_size,
-        })
+            'inference_total': convert_bytes(vram_model + vram_activation),
+            'training_total': convert_bytes(vram_model + vram_buffer + vram_optimizer + vram_activation),
+            'model': convert_bytes(vram_model),
+            'buffer': convert_bytes(vram_buffer),
+            'optimizer': convert_bytes(vram_optimizer),
+            'activation': convert_bytes(vram_activation),
+        }
+
+        data.append(row)
+        # dtype_total_size = total_size
+        # dtype_largest_layer = largest_layer[0]
+        # if dtype in ("fp16", "bf16", "float16/bfloat16"):
+        #     dtype_total_size /= 2
+        #     dtype_largest_layer /= 2
+        # elif dtype == "int8":
+        #     dtype_total_size /= 4
+        #     dtype_largest_layer /= 4
+        # elif dtype == "int4":
+        #     dtype_total_size /= 8
+        #     dtype_largest_layer /= 8
+        # dtype_training_size = convert_bytes(dtype_total_size * 4)
+        # dtype_total_size = convert_bytes(dtype_total_size)
+        # dtype_largest_layer = convert_bytes(dtype_largest_layer)
+        # data.append({
+        #     "dtype": dtype,
+        #     "Largest Layer or Residual Group": dtype_largest_layer,
+        #     "Total Size": dtype_total_size,
+        #     "Training using Adam": dtype_training_size,
+        #     "Test": 12345
+        # })
+        # data.append({
+        #     "dtype": dtype,
+        #     "Largest Layer or Residual Group": dtype_largest_layer,
+        #     "Total Size": dtype_total_size,
+        #     "Training using Adam": dtype_training_size,
+        #     "Test": 12345
+        # })
     global HAS_DISCUSSION, MODEL_NAME, LIBRARY
     HAS_DISCUSSION = check_for_discussion(model_name)
     MODEL_NAME = model_name
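The loop above derives each precision's row from the fp32 baseline via `bytes_by_dtype`, which divides the byte count by 2, 4, or 8 for float16/bfloat16, int8, and int4 respectively. A minimal sketch of that scaling, with an assumed fp32 model size:

```python
# Illustrative only: how an fp32 byte count shrinks per dtype, using bytes_by_dtype from above.
fp32_model_bytes = 436_000_000  # assumed size, roughly a bert-base-sized model in fp32

for dtype in ["float32", "float16/bfloat16", "int8", "int4"]:
    print(f"{dtype:>20}: {bytes_by_dtype(fp32_model_bytes, dtype) / 2**20:,.0f} MiB")
```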
@@ -114,7 +253,7 @@ def calculate_memory(model_name:str, library:str, options:list, access_token:str
     results = [
         f'## {title}',
         gr.update(visible=True, value=pd.DataFrame(data)),
-        gr.update(visible=not HAS_DISCUSSION)
+        # gr.update(visible=not HAS_DISCUSSION)
     ]
     return results
 
@@ -122,48 +261,71 @@ with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown(
             """<img src="https://huggingface.co/spaces/hf-accelerate/model-memory-usage/resolve/main/measure_model_size.png" style="float: left;" width="250" height="250"><h1>🤗 Model Memory Calculator</h1>
+This tool is modified from https://huggingface.co/spaces/hf-accelerate/model-memory-usage with the following changes:
+
+- Focus on transformers and gives more detailed estimation based on more configs
+- Will auto-calculate the proper batch size given a VRAM constraint later
+- LoRA/QLoRA etc. will be supported later
 
-This tool will help you calculate how much vRAM is needed to train and perform big model inference
-on a model hosted on the 🤗 Hugging Face Hub. The minimum recommended vRAM needed for a model
-is denoted as the size of the "largest layer", and training of a model is roughly 4x its size (for Adam).
-
-These calculations are accurate within a few percent at most, such as `bert-base-cased` being 413.68 MB and the calculator estimating 413.18 MB.
-
-When performing inference, expect to add up to an additional 20% to this as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/).
-More tests will be performed in the future to get a more accurate benchmark for each model.
+Note:
 
-
+- inference_total = model
+- training_total = model + buffer + optimizer + activation
 
-To use this tool pass in the URL or model name of the model you want to calculate the memory usage for,
-select which framework it originates from ("auto" will try and detect it from the model metadata), and
-what precisions you want to use."""
+"""
         )
         out_text = gr.Markdown()
-        out = gr.DataFrame(
-            headers=["dtype", "Largest Layer or Residual Group", "Total Size", "Training using Adam"],
+        out = gr.DataFrame(headers=[
+            "dtype",
+            'inference_total',
+            'training_total',
+            'model',
+            'buffer',
+            'optimizer',
+            'activation',
+        ],
             interactive=False,
             visible=False,
         )
         with gr.Row():
             inp = gr.Textbox(label="Model Name or URL", value="bert-base-cased")
         with gr.Row():
-            library = gr.Radio(["
-            options = gr.CheckboxGroup(
+            library = gr.Radio(["transformers"], label="Library", value="transformers")
+            dtypes = gr.CheckboxGroup(
                 ["float32", "float16/bfloat16", "int8", "int4"],
-                value="float32",
+                value=["float32", "float16/bfloat16", "int8", "int4"],
                 label="Model Precision",
             )
+            # ADAMW_HF = "adamw_hf"
+            # ADAMW_TORCH = "adamw_torch"
+            # ADAMW_TORCH_FUSED = "adamw_torch_fused"
+            # ADAMW_TORCH_XLA = "adamw_torch_xla"
+            # ADAMW_APEX_FUSED = "adamw_apex_fused"
+            # ADAFACTOR = "adafactor"
+            # ADAMW_ANYPRECISION = "adamw_anyprecision"
+            # SGD = "sgd"
+            # ADAGRAD = "adagrad"
+            # ADAMW_BNB = "adamw_bnb_8bit"
+            # ADAMW_8BIT = "adamw_8bit"  # just an alias for adamw_bnb_8bit
+            # LION_8BIT = "lion_8bit"
+            # LION = "lion_32bit"
+            # PAGED_ADAMW = "paged_adamw_32bit"
+            # PAGED_ADAMW_8BIT = "paged_adamw_8bit"
+            # PAGED_LION = "paged_lion_32bit"
+            # PAGED_LION_8BIT = "paged_lion_8bit"
+            optimizer = gr.Dropdown(choices=["adamw_hf", "adamw_torch", "sgd", "lion_32bit", "adamw_8bit", "lion_8bit", "paged_adamw_8bit", "paged_lion_8bit"],
+                                    value="adamw_hf", label="Optimizer", allow_custom_value=True)
             access_token = gr.Textbox(label="API Token", placeholder="Optional (for gated models)")
         with gr.Row():
             btn = gr.Button("Calculate Memory Usage")
-            post_to_hub = gr.Button(value = "Report results in this model repo's discussions!\n(Will open in a new tab)", visible=False)
+            # post_to_hub = gr.Button(value = "Report results in this model repo's discussions!\n(Will open in a new tab)", visible=False)
         USER_TOKEN = access_token
 
         btn.click(
-            calculate_memory, inputs=[inp, library,
+            calculate_memory, inputs=[inp, library, dtypes, optimizer, access_token], outputs=[out_text, out],
         )
 
-        post_to_hub.click(report_results).then(lambda: gr.Button.update(visible=False), outputs=post_to_hub)
+        # post_to_hub.click(report_results).then(lambda: gr.Button.update(visible=False), outputs=post_to_hub)
 
 
-demo.launch()
+demo.launch(share=True, inline=False, debug=True)
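For reference, a hypothetical direct call that mirrors what the updated `btn.click` wiring passes from the UI into `calculate_memory`; the argument values are assumptions, and the two returned items correspond to the `[out_text, out]` outputs:

```python
# Sketch of the click handler's contract after this change (argument values are assumptions).
title_md, table_update = calculate_memory(
    "bert-base-cased",                  # inp: model name or URL
    "transformers",                     # library radio
    ["float32", "float16/bfloat16"],    # dtypes checkbox group
    "adamw_hf",                         # optimizer dropdown
    "",                                 # access_token (empty for public models)
)
```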