Adapt my vram calc code to this UI and create a prototype
app.py (CHANGED)
@@ -31,18 +31,14 @@ def report_results():
 
     USER_TOKEN = None
     post = f"""# Model Memory Requirements\n
-
 You will need about {minimum[1]} VRAM to load this model for inference, and {minimum[3]} VRAM to train it using Adam.
 
 These calculations were measured from the [Model Memory Utility Space](https://hf.co/spaces/hf-accelerate/model-memory-utility) on the Hub.
 
 The minimum recommended vRAM needed for this model assumes using [Accelerate or `device_map="auto"`](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) and is denoted by the size of the "largest layer".
 When performing inference, expect to add up to an additional 20% to this, as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/). More tests will be performed in the future to get a more accurate benchmark for each model.
-
 When training with `Adam`, you can expect roughly 4x the reported results to be used. (1x for the model, 1x for the gradients, and 2x for the optimizer).
-
 ## Results:
-
 {results}
 """
     discussion = api.create_discussion(MODEL_NAME, "[AUTOMATED] Model Memory Requirements", description=post)
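As a rough sanity check of the 20% inference overhead and the roughly 4x Adam rule quoted in this template, a minimal back-of-the-envelope sketch; the 7B parameter count and fp16 weight width are illustrative assumptions, not values taken from this commit:

```python
# Illustrative arithmetic behind the report template above (assumed numbers).
params = 7_000_000_000   # assumed 7B-parameter model
bytes_per_param = 2      # assumed fp16/bf16 weights

weights_gb = params * bytes_per_param / 1e9
inference_gb = weights_gb * 1.2    # +20% inference overhead rule of thumb
training_gb = weights_gb * 4       # 1x model + 1x gradients + 2x optimizer states (Adam)

print(f"weights:   {weights_gb:.1f} GB")
print(f"inference: {inference_gb:.1f} GB (~20% overhead)")
print(f"training:  {training_gb:.1f} GB (Adam, ~4x the weights)")
```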
@@ -55,7 +51,123 @@ def convert_url_to_name(url:str):
         raise ValueError(f"URL {url} is not a valid model URL to the Hugging Face Hub")
     return results[0]
 
-def calculate_memory(model_name:str, library:str, options:list, access_token:str, raw=False):
+# Based on the following doc:
+#
+# - https://huggingface.co/docs/transformers/v4.31.0/perf_train_gpu_one#anatomy-of-models-memory
+# - https://blog.eleuther.ai/transformer-math/
+# - https://kipp.ly/transformer-inference-arithmetic/
+# - https://github.com/ray-project/llm-numbers
+#
+def calc_vram_f32(model, optimizer, sequence_len, micro_batch_size, device_count, gradient_checkpointing):
+    # is_16bit = cfg.bf16 or cfg.bfloat16 or cfg.load_in_8bit or cfg.fp16 or cfg.float16
+
+    # if torch.cuda.device_count() > 1 or cfg.fsdp or os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true" or cfg.adapter:
+    #     return { 'supported': False }
+
+    # Model Weights
+    #
+    # Hf doc counts:
+    #
+    # - 4 bytes * number of parameters for fp32 training
+    # - 6 bytes * number of parameters for mixed precision training (maintains a model in fp32 and one in fp16 in memory)
+    #
+    # But we follow https://blog.eleuther.ai/transformer-math/#model-weights to count 2 bytes here for mixed precision training,
+    # leave the rest to the optimizer state.
+    #
+    # Here we calculate only for fp32, will adjust for each dtype outside.
+    #
+    # for param in model.parameters():
+    #     print(f'{type(param)} {param.shape} {param.element_size()}')
+    #
+    # print(f'total parameters = {sum([param.nelement() for param in model.parameters()])}')
+
+    param_element_size = 4
+    vram_model = sum([param.nelement() * param_element_size for param in model.parameters()])
+
+    # Buffers
+    #
+    # Buffers are tensors that do not require gradients and are not registered as parameters,
+    # e.g. mean and std in batch norm layers.
+    # - https://github.com/huggingface/transformers/blob/d4bd33cc9f11ca48635e54983d75249c78d72e2a/src/transformers/modeling_utils.py#L1897
+    # - https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2
+    #
+    # for buf in model.buffers():
+    #     print(f'buf.element_size() = {buf.element_size()}')
+    vram_buffer = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])
+
+    # Optimizer States:
+    # - 8 bytes * number of parameters for normal AdamW (maintains 2 states)
+    # - 2 bytes * number of parameters for 8-bit AdamW optimizers like bitsandbytes
+    # - 4 bytes * number of parameters for optimizers like SGD with momentum (maintains only 1 state)
+    #
+    # For now we use AdamW/SGD as the baseline for the estimation, even for other more memory-efficient optimizers
+    # ADAMW_HF = "adamw_hf"
+    # ADAMW_TORCH = "adamw_torch"
+    # ADAMW_TORCH_FUSED = "adamw_torch_fused"
+    # ADAMW_TORCH_XLA = "adamw_torch_xla"
+    # ADAMW_APEX_FUSED = "adamw_apex_fused"
+    # ADAFACTOR = "adafactor"
+    # ADAMW_ANYPRECISION = "adamw_anyprecision"
+    # SGD = "sgd"
+    # ADAGRAD = "adagrad"
+    # ADAMW_BNB = "adamw_bnb_8bit"
+    # ADAMW_8BIT = "adamw_8bit"  # just an alias for adamw_bnb_8bit
+    # LION_8BIT = "lion_8bit"
+    # LION = "lion_32bit"
+    # PAGED_ADAMW = "paged_adamw_32bit"
+    # PAGED_ADAMW_8BIT = "paged_adamw_8bit"
+    # PAGED_LION = "paged_lion_32bit"
+    # PAGED_LION_8BIT = "paged_lion_8bit"
+    # optimizer = cfg.optimizer
+    optimizer_state_size_per_param = 4 if 'sgd' in optimizer else (2 if '8bit' in optimizer else 8)
+    vram_optimizer = sum([param.nelement() * optimizer_state_size_per_param for param in model.parameters()])
+
+    # Gradients
+    #
+    # 4 bytes * number of parameters for either fp32 or mixed precision training (gradients are always kept in fp32)
+    # but we will follow transformer-math to treat it conditionally outside
+    # for now we ignore whether this is mixed precision training
+    #
+    gradient_element_size = 4  # 2 if is_16bit else 4
+    vram_gradient = sum([param.nelement() * gradient_element_size for param in model.parameters()])
+
+    # Forward Activations
+    # size depends on many factors, the key ones being sequence length, hidden size and batch size.
+    s = sequence_len  # cfg.sequence_len
+    b = micro_batch_size  # cfg.micro_batch_size
+    h = model.config.hidden_size
+    L = model.config.num_hidden_layers
+    t = device_count  # max(1, torch.cuda.device_count()) # len(DataParallel(model).device_ids) # torch.cuda.device_count()
+    a = model.config.num_attention_heads
+    print(f's={s} b={b} h={h} L={L} t={t} a={a}')
+
+    sbHL = s * b * h * L
+    print(f'sbHL = {sbHL / 1e9} GB')
+
+    print(f'10 + {24 / t} + {5 * a * s / (h * t)}')
+
+    vram_activation = sbHL * (10 + 24 / t) if gradient_checkpointing else sbHL * (10 + 24 / t + 5 * a * s / (h * t))
+
+    return {
+        # 'supported': True,
+        'param_element_size': param_element_size,
+        'total': vram_model + vram_buffer + vram_optimizer + vram_activation,
+        'model': vram_model,
+        'buffer': vram_buffer,
+        'optimizer': vram_optimizer,
+        'activation': vram_activation,
+    }
+def bytes_by_dtype(bytes, dtype):
+    if dtype in ("fp16", "bf16", "float16/bfloat16"):
+        return bytes / 2
+    elif dtype == "int8":
+        return bytes / 4
+    elif dtype == "int4":
+        return bytes / 8
+    else:
+        return bytes
+
+def calculate_memory(model_name:str, library:str, dtypes:list, optimizer:str, access_token:str, raw=False):
     "Calculates the memory usage for a model"
     if library == "auto":
         library = None
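A hypothetical usage sketch of the `calc_vram_f32` helper added in this hunk, assuming it runs alongside the updated `app.py`; the checkpoint name and the settings mirror the defaults that `calculate_memory` passes further down and are assumptions for illustration:

```python
# Sketch only: build a config-initialized model and feed it to calc_vram_f32 as defined above.
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("bert-base-cased")  # assumed example checkpoint
model = AutoModel.from_config(config)                   # random weights are enough for size accounting

est = calc_vram_f32(
    model,
    optimizer="adamw_hf",        # same naming scheme as the optimizer dropdown below
    sequence_len=2048,
    micro_batch_size=1,
    device_count=1,
    gradient_checkpointing=True,
)
print(f"fp32 total:   {est['total'] / 2**30:.2f} GiB")
print(f"  model:      {est['model'] / 2**30:.2f} GiB")
print(f"  optimizer:  {est['optimizer'] / 2**30:.2f} GiB")
print(f"  activation: {est['activation'] / 2**30:.2f} GiB")
```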
@@ -82,27 +194,54 @@ def calculate_memory(model_name:str, library:str, options:list, access_token:str
     data = []
 
     title = f"Memory Usage for '{model_name}'"
-    for dtype in options:
-        dtype_total_size = total_size
-        dtype_largest_layer = largest_layer[0]
-        if dtype in ("fp16", "bf16", "float16/bfloat16"):
-            dtype_total_size /= 2
-            dtype_largest_layer /= 2
-        elif dtype == "int8":
-            dtype_total_size /= 4
-            dtype_largest_layer /= 4
-        elif dtype == "int4":
-            dtype_total_size /= 8
-            dtype_largest_layer /= 8
-        dtype_training_size = convert_bytes(dtype_total_size * 4)
-        dtype_total_size = convert_bytes(dtype_total_size)
-        dtype_largest_layer = convert_bytes(dtype_largest_layer)
-        data.append({
+
+    vram_f32 = calc_vram_f32(model, optimizer=optimizer, sequence_len=2048, micro_batch_size=1, device_count=1, gradient_checkpointing=True)
+
+    for dtype in dtypes:
+        param_element_size = bytes_by_dtype(vram_f32['param_element_size'], dtype)
+        vram_model = bytes_by_dtype(vram_f32['model'], dtype)
+        vram_buffer = vram_f32['buffer']
+        vram_optimizer = vram_f32['optimizer']
+        vram_activation = vram_f32['activation']
+        row = {
             "dtype": dtype,
-            "Largest Layer or Residual Group": dtype_largest_layer,
-            "Total Size": dtype_total_size,
-            "Training using Adam": dtype_training_size,
-        })
+            'inference_total': convert_bytes(vram_model + vram_activation),
+            'training_total': convert_bytes(vram_model + vram_buffer + vram_optimizer + vram_activation),
+            'model': convert_bytes(vram_model),
+            'buffer': convert_bytes(vram_buffer),
+            'optimizer': convert_bytes(vram_optimizer),
+            'activation': convert_bytes(vram_activation),
+        }
+
+        data.append(row)
+        # dtype_total_size = total_size
+        # dtype_largest_layer = largest_layer[0]
+        # if dtype in ("fp16", "bf16", "float16/bfloat16"):
+        #     dtype_total_size /= 2
+        #     dtype_largest_layer /= 2
+        # elif dtype == "int8":
+        #     dtype_total_size /= 4
+        #     dtype_largest_layer /= 4
+        # elif dtype == "int4":
+        #     dtype_total_size /= 8
+        #     dtype_largest_layer /= 8
+        # dtype_training_size = convert_bytes(dtype_total_size * 4)
+        # dtype_total_size = convert_bytes(dtype_total_size)
+        # dtype_largest_layer = convert_bytes(dtype_largest_layer)
+        # data.append({
+        #     "dtype": dtype,
+        #     "Largest Layer or Residual Group": dtype_largest_layer,
+        #     "Total Size": dtype_total_size,
+        #     "Training using Adam": dtype_training_size,
+        #     "Test": 12345
+        # })
+        # data.append({
+        #     "dtype": dtype,
+        #     "Largest Layer or Residual Group": dtype_largest_layer,
+        #     "Total Size": dtype_total_size,
+        #     "Training using Adam": dtype_training_size,
+        #     "Test": 12345
+        # })
     global HAS_DISCUSSION, MODEL_NAME, LIBRARY
     HAS_DISCUSSION = check_for_discussion(model_name)
     MODEL_NAME = model_name
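The loop above derives each precision's row from the fp32 baseline via `bytes_by_dtype`, which divides the byte count by 2, 4, or 8 for float16/bfloat16, int8, and int4 respectively. A minimal sketch of that scaling, with an assumed fp32 model size:

```python
# Illustrative only: how an fp32 byte count shrinks per dtype, using bytes_by_dtype from above.
fp32_model_bytes = 436_000_000  # assumed size, roughly a bert-base-sized model in fp32

for dtype in ["float32", "float16/bfloat16", "int8", "int4"]:
    print(f"{dtype:>20}: {bytes_by_dtype(fp32_model_bytes, dtype) / 2**20:,.0f} MiB")
```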
@@ -114,7 +253,7 @@ def calculate_memory(model_name:str, library:str, options:list, access_token:str
     results = [
         f'## {title}',
         gr.update(visible=True, value=pd.DataFrame(data)),
-        gr.update(visible=not HAS_DISCUSSION)
+        # gr.update(visible=not HAS_DISCUSSION)
     ]
     return results
 
@@ -122,48 +261,71 @@ with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown(
             """<img src="https://huggingface.co/spaces/hf-accelerate/model-memory-usage/resolve/main/measure_model_size.png" style="float: left;" width="250" height="250"><h1>🤗 Model Memory Calculator</h1>
+This tool is modified from https://huggingface.co/spaces/hf-accelerate/model-memory-usage with the following changes:
+
+- Focus on transformers and gives more detailed estimation based on more configs
+- Will auto-calculate the proper batch size given a VRAM constraint later
+- LoRA/QLoRA etc. will be supported later
 
-This tool will help you calculate how much vRAM is needed to train and perform big model inference
-on a model hosted on the 🤗 Hugging Face Hub. The minimum recommended vRAM needed for a model
-is denoted as the size of the "largest layer", and training of a model is roughly 4x its size (for Adam).
-
-These calculations are accurate within a few percent at most, such as `bert-base-cased` being 413.68 MB and the calculator estimating 413.18 MB.
-
-When performing inference, expect to add up to an additional 20% to this as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/).
-More tests will be performed in the future to get a more accurate benchmark for each model.
+Note:
 
-
+- inference_total = model
+- training_total = model + buffer + optimizer + activation
 
-To use this tool pass in the URL or model name of the model you want to calculate the memory usage for,
-select which framework it originates from ("auto" will try and detect it from the model metadata), and
-what precisions you want to use."""
+"""
         )
         out_text = gr.Markdown()
-        out = gr.DataFrame(
-            headers=["dtype", "Largest Layer or Residual Group", "Total Size", "Training using Adam"],
+        out = gr.DataFrame(headers=[
+            "dtype",
+            'inference_total',
+            'training_total',
+            'model',
+            'buffer',
+            'optimizer',
+            'activation',
+        ],
             interactive=False,
             visible=False,
         )
         with gr.Row():
             inp = gr.Textbox(label="Model Name or URL", value="bert-base-cased")
         with gr.Row():
-            library = gr.Radio(["
-            options = gr.CheckboxGroup(
+            library = gr.Radio(["transformers"], label="Library", value="transformers")
+            dtypes = gr.CheckboxGroup(
                 ["float32", "float16/bfloat16", "int8", "int4"],
-                value="float32",
+                value=["float32", "float16/bfloat16", "int8", "int4"],
                 label="Model Precision",
             )
+            # ADAMW_HF = "adamw_hf"
+            # ADAMW_TORCH = "adamw_torch"
+            # ADAMW_TORCH_FUSED = "adamw_torch_fused"
+            # ADAMW_TORCH_XLA = "adamw_torch_xla"
+            # ADAMW_APEX_FUSED = "adamw_apex_fused"
+            # ADAFACTOR = "adafactor"
+            # ADAMW_ANYPRECISION = "adamw_anyprecision"
+            # SGD = "sgd"
+            # ADAGRAD = "adagrad"
+            # ADAMW_BNB = "adamw_bnb_8bit"
+            # ADAMW_8BIT = "adamw_8bit"  # just an alias for adamw_bnb_8bit
+            # LION_8BIT = "lion_8bit"
+            # LION = "lion_32bit"
+            # PAGED_ADAMW = "paged_adamw_32bit"
+            # PAGED_ADAMW_8BIT = "paged_adamw_8bit"
+            # PAGED_LION = "paged_lion_32bit"
+            # PAGED_LION_8BIT = "paged_lion_8bit"
+            optimizer = gr.Dropdown(choices=["adamw_hf", "adamw_torch", "sgd", "lion_32bit", "adamw_8bit", "lion_8bit", "paged_adamw_8bit", "paged_lion_8bit"],
+                                    value="adamw_hf", label="Optimizer", allow_custom_value=True)
             access_token = gr.Textbox(label="API Token", placeholder="Optional (for gated models)")
         with gr.Row():
             btn = gr.Button("Calculate Memory Usage")
-            post_to_hub = gr.Button(value = "Report results in this model repo's discussions!\n(Will open in a new tab)", visible=False)
+            # post_to_hub = gr.Button(value = "Report results in this model repo's discussions!\n(Will open in a new tab)", visible=False)
         USER_TOKEN = access_token
 
         btn.click(
-            calculate_memory, inputs=[inp, library,
+            calculate_memory, inputs=[inp, library, dtypes, optimizer, access_token], outputs=[out_text, out],
         )
 
-        post_to_hub.click(report_results).then(lambda: gr.Button.update(visible=False), outputs=post_to_hub)
+        # post_to_hub.click(report_results).then(lambda: gr.Button.update(visible=False), outputs=post_to_hub)
 
 
-demo.launch()
+demo.launch(share=True, inline=False, debug=True)
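For reference, a hypothetical direct call that mirrors what the updated `btn.click` wiring passes from the UI into `calculate_memory`; the argument values are assumptions, and the two returned items correspond to the `[out_text, out]` outputs:

```python
# Sketch of the click handler's contract after this change (argument values are assumptions).
title_md, table_update = calculate_memory(
    "bert-base-cased",                  # inp: model name or URL
    "transformers",                     # library radio
    ["float32", "float16/bfloat16"],    # dtypes checkbox group
    "adamw_hf",                         # optimizer dropdown
    "",                                 # access_token (empty for public models)
)
```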