# huckiyang's picture
# [node] estimation
# c358966
# raw
# history blame
# 16.5 kB
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
from typing import Dict, Tuple, List
# Model specifications (approximate parameter counts and memory requirements).
#
# Each entry maps a display name to:
#   params          - approximate parameter count
#   base_memory_gb  - approximate baseline memory footprint in GB. For the
#                     text-only LLM entries this works out to 2 bytes per
#                     parameter; the multimodal and PhysicsNeMo entries carry
#                     larger hand-set figures.
MODEL_SPECS = {
    # LLaMA family
    "LLaMA-2-7B": {"params": 7e9, "base_memory_gb": 14},
    "LLaMA-2-13B": {"params": 13e9, "base_memory_gb": 26},
    "LLaMA-2-70B": {"params": 70e9, "base_memory_gb": 140},
    "LLaMA-3-8B": {"params": 8e9, "base_memory_gb": 16},
    "LLaMA-3-70B": {"params": 70e9, "base_memory_gb": 140},
    "LLaMA-3.1-8B": {"params": 8e9, "base_memory_gb": 16},
    "LLaMA-3.1-70B": {"params": 70e9, "base_memory_gb": 140},
    "LLaMA-3.1-405B": {"params": 405e9, "base_memory_gb": 810},

    # NVIDIA Nemotron
    "Nemotron-4-340B": {"params": 340e9, "base_memory_gb": 680},
    "Nemotron-4-15B": {"params": 15e9, "base_memory_gb": 30},

    # Qwen2 / Qwen2.5 language models
    "Qwen2-0.5B": {"params": 0.5e9, "base_memory_gb": 1},
    "Qwen2-1.5B": {"params": 1.5e9, "base_memory_gb": 3},
    "Qwen2-7B": {"params": 7e9, "base_memory_gb": 14},
    "Qwen2-72B": {"params": 72e9, "base_memory_gb": 144},
    "Qwen2.5-0.5B": {"params": 0.5e9, "base_memory_gb": 1},
    "Qwen2.5-1.5B": {"params": 1.5e9, "base_memory_gb": 3},
    "Qwen2.5-7B": {"params": 7e9, "base_memory_gb": 14},
    "Qwen2.5-14B": {"params": 14e9, "base_memory_gb": 28},
    "Qwen2.5-32B": {"params": 32e9, "base_memory_gb": 64},
    "Qwen2.5-72B": {"params": 72e9, "base_memory_gb": 144},

    # Qwen Vision Language Models
    "Qwen-VL": {"params": 9.6e9, "base_memory_gb": 20},
    "Qwen-VL-Chat": {"params": 9.6e9, "base_memory_gb": 20},
    "Qwen-VL-Plus": {"params": 12e9, "base_memory_gb": 25},
    "Qwen-VL-Max": {"params": 30e9, "base_memory_gb": 65},
    "Qwen2-VL-2B": {"params": 2e9, "base_memory_gb": 5},
    "Qwen2-VL-7B": {"params": 8e9, "base_memory_gb": 18},
    "Qwen2-VL-72B": {"params": 72e9, "base_memory_gb": 150},

    # NVIDIA VILA Series
    "VILA-1.5-3B": {"params": 3e9, "base_memory_gb": 7},
    "VILA-1.5-8B": {"params": 8e9, "base_memory_gb": 18},
    "VILA-1.5-13B": {"params": 13e9, "base_memory_gb": 28},
    "VILA-1.5-40B": {"params": 40e9, "base_memory_gb": 85},

    # Qwen Audio Models
    "Qwen-Audio": {"params": 8e9, "base_memory_gb": 18},
    "Qwen-Audio-Chat": {"params": 8e9, "base_memory_gb": 18},
    "Qwen2-Audio-7B": {"params": 8e9, "base_memory_gb": 18},

    # NVIDIA PhysicsNeMo Models
    "PhysicsNeMo-FNO-Small": {"params": 1e6, "base_memory_gb": 0.5},
    "PhysicsNeMo-FNO-Medium": {"params": 10e6, "base_memory_gb": 2},
    "PhysicsNeMo-FNO-Large": {"params": 50e6, "base_memory_gb": 8},
    "PhysicsNeMo-PINN-Small": {"params": 0.5e6, "base_memory_gb": 0.2},
    "PhysicsNeMo-PINN-Medium": {"params": 5e6, "base_memory_gb": 1},
    "PhysicsNeMo-PINN-Large": {"params": 20e6, "base_memory_gb": 4},
    "PhysicsNeMo-GraphCast-Small": {"params": 50e6, "base_memory_gb": 8},
    "PhysicsNeMo-GraphCast-Medium": {"params": 200e6, "base_memory_gb": 20},
    "PhysicsNeMo-GraphCast-Large": {"params": 1e9, "base_memory_gb": 50},
    "PhysicsNeMo-SFNO-Small": {"params": 25e6, "base_memory_gb": 5},
    "PhysicsNeMo-SFNO-Medium": {"params": 100e6, "base_memory_gb": 15},
    "PhysicsNeMo-SFNO-Large": {"params": 500e6, "base_memory_gb": 35},
}
# H100 specifications: node topology first, then the derived per-node capacity.
H100_GPUS_PER_NODE = 8  # GPUs per node
H100_MEMORY_GB = 80  # Memory per GPU
H100_NODE_MEMORY_GB = H100_GPUS_PER_NODE * H100_MEMORY_GB  # 640GB per node
H100_COMPUTE_CAPABILITY = "9.0"  # reported verbatim in the CUDA recommendation text
# CUDA version recommendations keyed by use case.
# All three use cases currently share the same version triple; each key still
# gets its own dict so a single use case can be tuned independently later.
CUDA_RECOMMENDATIONS = {
    use_case: {
        "recommended": "12.1+",
        "minimum": "11.8",
        "optimal": "12.4",
    }
    for use_case in ("inference", "training", "fine_tuning")
}
def calculate_kv_cache_memory(num_tokens: int, model_params: float, num_layers: int = None) -> float:
    """Estimate the FP16 key/value-cache size in GB for ``num_tokens`` tokens.

    The real architecture is not known here, so it is reconstructed from the
    parameter count alone:

    * depth is picked from a size bucket unless ``num_layers`` is given;
    * width is derived from the usual dense-transformer relation
      ``params ~= 12 * num_layers * hidden_dim**2`` and snapped to a multiple
      of 64.

    The per-token cost is then ``2 (K and V) * 2 bytes (fp16) * hidden_dim *
    num_layers``. This assumes every layer caches full-width keys and values;
    models that share KV heads need less, so treat the result as an upper-ish
    estimate.

    Args:
        num_tokens: Number of cached tokens (for one sequence).
        model_params: Approximate parameter count of the model.
        num_layers: Optional explicit layer count; estimated when ``None``.

    Returns:
        Estimated KV-cache size in GB (1 GB = 1024**3 bytes).

    Raises:
        ValueError: If an explicit ``num_layers`` is not positive.
    """
    if num_layers is None:
        # Estimate layers based on model size
        if model_params < 1e9:
            num_layers = 24
        elif model_params < 10e9:
            num_layers = 32
        elif model_params < 100e9:
            num_layers = 80
        else:
            num_layers = 96
    elif num_layers <= 0:
        raise ValueError("num_layers must be a positive integer")

    # Invert params ~= 12 * L * d^2 to get the hidden size. The previous
    # heuristic multiplied the square root by 64 (instead of rounding to a
    # multiple of 64), which produced hidden sizes in the hundreds of
    # thousands and KV-cache figures roughly 100x too large.
    raw_hidden_dim = (max(model_params, 0.0) / (12 * num_layers)) ** 0.5
    # Snap to the nearest multiple of 64, never below 64 so tiny models do not
    # collapse to a zero-width cache.
    hidden_dim = max(64, int(round(raw_hidden_dim / 64)) * 64)

    # 2 (K + V) * 2 bytes (fp16) * hidden_dim * num_layers
    bytes_per_token = 2 * 2 * hidden_dim * num_layers
    kv_memory_per_token = bytes_per_token / (1024 ** 3)  # GB
    return num_tokens * kv_memory_per_token
def estimate_h100_nodes(
    model_name: str,
    input_tokens: int,
    output_tokens: int,
    batch_size: int,
    use_case: str,
    precision: str
) -> Tuple[int, str, Dict]:
    """
    Estimate the number of H100 nodes required for a model and workload.

    The estimate is purely memory-based:
    ``(weights + KV cache) * use-case overhead`` divided by the usable memory
    of one node (90% of ``H100_NODE_MEMORY_GB``), rounded up, minimum 1.

    Args:
        model_name: Key into ``MODEL_SPECS``.
        input_tokens: Prompt tokens per request.
        output_tokens: Generated tokens per request.
        batch_size: Number of concurrent requests (scales the KV cache).
        use_case: "inference", "training" or "fine_tuning"; anything else is
            treated like inference (1.2x overhead).
        precision: "FP32", "FP16", "BF16", "INT8" or "INT4"; anything else is
            treated like FP16.

    Returns:
        - Number of nodes required
        - Detailed explanation (markdown)
        - Dictionary with breakdown (empty when the model is unknown)
    """
    if model_name not in MODEL_SPECS:
        return 1, f"Model {model_name} not found in specifications", {}

    model_spec = MODEL_SPECS[model_name]
    base_memory = model_spec["base_memory_gb"]

    # Adjust memory based on precision.
    # base_memory_gb in MODEL_SPECS is a 2-bytes-per-parameter figure
    # (e.g. 7e9 params -> 14 GB), i.e. an FP16/BF16 footprint, so the scale
    # factors are expressed relative to FP16. Treating the table as FP32
    # halved every estimate (a 7B FP16 model showed up as 7 GB).
    precision_multiplier = {
        "FP32": 2.0,
        "FP16": 1.0,
        "BF16": 1.0,
        "INT8": 0.5,
        "INT4": 0.25,
    }
    model_memory = base_memory * precision_multiplier.get(precision, 1.0)

    # Calculate KV cache memory (per sequence, then scaled by batch size)
    total_tokens = input_tokens + output_tokens
    kv_cache_memory = calculate_kv_cache_memory(total_tokens, model_spec["params"]) * batch_size

    # Use case specific memory overhead
    overhead_multiplier = {
        "inference": 1.2,  # 20% overhead
        "training": 3.0,  # 3x for gradients, optimizer states
        "fine_tuning": 2.5,  # 2.5x for fine-tuning
    }
    overhead = overhead_multiplier.get(use_case, 1.2)
    total_memory_per_instance = (model_memory + kv_cache_memory) * overhead

    # Calculate nodes needed
    memory_per_node = H100_NODE_MEMORY_GB * 0.9  # Reserve 10% for system (576GB usable per node)
    # No separate "weights must fit" check is needed: every overhead factor is
    # greater than 1, so the total is always at least the weight memory and
    # this ceiling already covers the nodes needed to hold the model.
    nodes_needed = max(1, int(np.ceil(total_memory_per_instance / memory_per_node)))

    # Generate explanation
    explanation = f"""
**Estimation Breakdown:**
• **Model**: {model_name} ({model_spec['params']/1e9:.1f}B parameters)
• **Precision**: {precision}
• **Model Memory**: {model_memory:.1f} GB
• **KV Cache Memory**: {kv_cache_memory:.1f} GB (for {total_tokens:,} tokens × {batch_size} batch size)
• **Use Case Overhead**: {overhead:.1f}x ({use_case})
• **Total Memory Required**: {total_memory_per_instance:.1f} GB
• **H100 Node Specs**: {H100_GPUS_PER_NODE} × {H100_MEMORY_GB}GB = {H100_NODE_MEMORY_GB}GB per node
• **Usable Memory**: {memory_per_node:.1f} GB per node (10% reserved)
**Recommendation**: {nodes_needed} H100 node(s) ({nodes_needed * H100_GPUS_PER_NODE} H100 GPUs total)
"""
    breakdown = {
        "model_memory_gb": model_memory,
        "kv_cache_memory_gb": kv_cache_memory,
        "total_memory_gb": total_memory_per_instance,
        "h100_memory_per_node_gb": memory_per_node,
        "nodes_required": nodes_needed
    }
    return nodes_needed, explanation, breakdown
def get_cuda_recommendation(use_case: str) -> str:
    """Build the markdown CUDA/cuDNN/driver recommendation for a use case.

    Args:
        use_case: Key into ``CUDA_RECOMMENDATIONS``; unknown values fall back
            to the "inference" entry (the heading still shows the given name).

    Returns:
        A markdown-formatted recommendation string.
    """
    if use_case in CUDA_RECOMMENDATIONS:
        versions = CUDA_RECOMMENDATIONS[use_case]
    else:
        versions = CUDA_RECOMMENDATIONS["inference"]

    return f"""
**CUDA Version Recommendations for {use_case.title()}:**
• **Optimal**: CUDA {versions['optimal']} + cuDNN 8.9+
• **Recommended**: CUDA {versions['recommended']} + cuDNN 8.7+
• **Minimum**: CUDA {versions['minimum']} + cuDNN 8.5+
**Additional Requirements:**
• **Driver Version**: 525.60.13+ (Linux) / 527.41+ (Windows)
• **Compute Capability**: {H100_COMPUTE_CAPABILITY} (H100 native)
• **Node Configuration**: {H100_GPUS_PER_NODE} × H100 GPUs per node ({H100_NODE_MEMORY_GB}GB total)
• **Memory**: ECC enabled recommended for production
"""
def create_performance_chart(breakdown: Dict) -> plt.Figure:
    """Render the memory breakdown as a two-panel matplotlib figure.

    Left panel: pie chart of model weights, KV cache and overhead.
    Right panel: per-node memory utilization in percent, where every node
    except the last is shown full and the last one holds the remainder.

    Args:
        breakdown: Dict with the keys ``model_memory_gb``,
            ``kv_cache_memory_gb``, ``total_memory_gb``,
            ``h100_memory_per_node_gb`` and ``nodes_required``. An empty dict
            yields a placeholder figure.

    Returns:
        The matplotlib figure.
    """
    # Nothing to plot: return a placeholder with a centered message.
    if not breakdown:
        placeholder_fig, placeholder_ax = plt.subplots(figsize=(8, 6))
        placeholder_ax.text(0.5, 0.5, 'No data to display', ha='center', va='center')
        placeholder_ax.set_xlim(0, 1)
        placeholder_ax.set_ylim(0, 1)
        return placeholder_fig

    fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # --- Memory breakdown pie chart ---
    weights_gb = breakdown['model_memory_gb']
    cache_gb = breakdown['kv_cache_memory_gb']
    # Overhead is whatever the total adds on top of weights + cache.
    overhead_gb = breakdown['total_memory_gb'] - weights_gb - cache_gb
    pie_ax.pie(
        [weights_gb, cache_gb, overhead_gb],
        labels=['Model Memory', 'KV Cache', 'Overhead'],
        colors=['#ff9999', '#66b3ff', '#99ff99'],
        autopct='%1.1f%%',
        startangle=90,
    )
    pie_ax.set_title('Memory Breakdown')

    # --- Node utilization bar chart ---
    node_count = breakdown['nodes_required']
    required_gb = breakdown['total_memory_gb']
    capacity_gb = breakdown['h100_memory_per_node_gb']

    node_labels = [f'Node {idx + 1}' for idx in range(node_count)]
    last_idx = node_count - 1
    # All nodes but the last are filled to capacity; the last takes the rest.
    used_gb = [
        capacity_gb if idx < last_idx else required_gb - last_idx * capacity_gb
        for idx in range(node_count)
    ]
    utilization_pct = [gb / capacity_gb * 100 for gb in used_gb]

    bars = bar_ax.bar(node_labels, utilization_pct, color='skyblue', alpha=0.7)
    bar_ax.axhline(y=100, color='red', linestyle='--', alpha=0.7, label='Max Capacity')
    bar_ax.set_ylabel('Memory Utilization (%)')
    bar_ax.set_title('H100 Node Memory Utilization')
    bar_ax.set_ylim(0, 110)
    bar_ax.legend()

    # Add value labels on bars
    for bar, pct in zip(bars, utilization_pct):
        label_x = bar.get_x() + bar.get_width()/2
        label_y = bar.get_height() + 1
        bar_ax.text(label_x, label_y, f'{pct:.1f}%', ha='center', va='bottom')

    plt.tight_layout()
    return fig
def estimate_nodes_interface(
    model_name: str,
    input_tokens: int,
    output_tokens: int,
    batch_size: int,
    use_case: str,
    precision: str
):
    """Main interface function: validate UI inputs and assemble all outputs.

    Args:
        model_name: Selected model name.
        input_tokens: Input tokens per request (``None`` if the field is empty).
        output_tokens: Output tokens per request (``None`` if the field is empty).
        batch_size: Concurrent requests (``None`` if the field is empty).
        use_case: Selected use case.
        precision: Selected precision.

    Returns:
        A 4-tuple ``(explanation_markdown, cuda_markdown, figure, headline)``.
        On invalid input the first element carries the error message and the
        remaining elements are ``""``, ``None`` and ``""``.
    """
    # Validate inputs. A cleared numeric field arrives as None, which must be
    # rejected explicitly: comparing None with 0 raises TypeError instead of
    # showing the validation message.
    if (
        input_tokens is None
        or output_tokens is None
        or input_tokens <= 0
        or output_tokens <= 0
    ):
        return "Please enter valid token counts (> 0)", "", None, ""
    if batch_size is None or batch_size <= 0:
        return "Please enter a valid batch size (> 0)", "", None, ""

    # Calculate node requirements
    nodes_needed, explanation, breakdown = estimate_h100_nodes(
        model_name, input_tokens, output_tokens, batch_size, use_case, precision
    )

    # Get CUDA recommendations
    cuda_rec = get_cuda_recommendation(use_case)

    # Create performance chart
    fig = create_performance_chart(breakdown)

    return explanation, cuda_rec, fig, f"**Estimated H100 Nodes Required: {nodes_needed}**"
# Create Gradio interface
def create_interface():
    """Assemble the Gradio Blocks app and return it (without launching).

    Layout: a header, then one row with the input controls on the left and the
    result tabs on the right, followed by clickable example scenarios and a
    notes section. The estimate button and the examples both target
    ``estimate_nodes_interface`` with the same inputs and outputs.
    """
    # Rows follow the input order: model, input tokens, output tokens,
    # batch size, use case, precision.
    example_rows = [
        ["LLaMA-3-8B", 2048, 512, 1, "inference", "FP16"],
        ["LLaMA-3-70B", 4096, 1024, 4, "inference", "FP16"],
        ["Qwen2.5-72B", 8192, 2048, 2, "fine_tuning", "BF16"],
        ["Nemotron-4-340B", 2048, 1024, 1, "inference", "INT8"],
        ["Qwen2-VL-7B", 1024, 256, 1, "inference", "FP16"],
        ["VILA-1.5-13B", 2048, 512, 2, "inference", "BF16"],
        ["Qwen2-Audio-7B", 1024, 256, 1, "inference", "FP16"],
        ["PhysicsNeMo-FNO-Large", 512, 128, 8, "training", "FP32"],
        ["PhysicsNeMo-GraphCast-Medium", 1024, 256, 4, "training", "FP16"],
    ]

    with gr.Blocks(title="H100 Node Estimator", theme=gr.themes.Soft()) as demo:
        # Page header
        gr.Markdown("# 🚀 H100 Node & CUDA Version Estimator")
        gr.Markdown("Get recommendations for H100 node count and CUDA version based on your model and workload requirements.")
        gr.Markdown("**Comprehensive Model Support**: LLaMA, Nemotron, Qwen2/2.5, Qwen-VL, VILA, Qwen-Audio, and **NVIDIA PhysicsNeMo** series!")

        with gr.Row():
            # Left column: workload parameters
            with gr.Column(scale=1):
                gr.Markdown("## Input Parameters")
                model_dropdown = gr.Dropdown(
                    choices=list(MODEL_SPECS.keys()),
                    value="LLaMA-3-8B",
                    label="Model",
                    info="Select the model you want to run (includes LLMs, multimodal, and physics-ML models)"
                )
                input_tokens = gr.Number(
                    value=2048,
                    label="Input Tokens",
                    info="Number of input tokens per request"
                )
                output_tokens = gr.Number(
                    value=512,
                    label="Output Tokens",
                    info="Number of output tokens per request"
                )
                batch_size = gr.Number(
                    value=1,
                    label="Batch Size",
                    info="Number of concurrent requests"
                )
                use_case = gr.Dropdown(
                    choices=["inference", "training", "fine_tuning"],
                    value="inference",
                    label="Use Case",
                    info="What will you use the model for?"
                )
                precision = gr.Dropdown(
                    choices=["FP32", "FP16", "BF16", "INT8", "INT4"],
                    value="FP16",
                    label="Precision",
                    info="Model precision/quantization"
                )
                run_button = gr.Button("💡 Estimate Requirements", variant="primary")

            # Right column: headline result plus one tab per output
            with gr.Column(scale=2):
                gr.Markdown("## Results")
                node_count = gr.Markdown("**Ready to estimate...**")
                with gr.Tab("📊 Detailed Analysis"):
                    detailed_output = gr.Markdown()
                with gr.Tab("🔧 CUDA Recommendations"):
                    cuda_output = gr.Markdown()
                with gr.Tab("📈 Memory Utilization"):
                    chart_output = gr.Plot()

        # The button and the examples share the same wiring.
        io_inputs = [model_dropdown, input_tokens, output_tokens, batch_size, use_case, precision]
        io_outputs = [detailed_output, cuda_output, chart_output, node_count]

        # Connect the interface
        run_button.click(
            fn=estimate_nodes_interface,
            inputs=io_inputs,
            outputs=io_outputs
        )

        # Add examples
        gr.Markdown("## 💡 Example Scenarios")
        gr.Examples(
            examples=example_rows,
            inputs=io_inputs,
            outputs=io_outputs,
            fn=estimate_nodes_interface,
            cache_examples=False
        )

        gr.Markdown("""
## ℹ️ Notes
- **Multimodal Models**: Vision-language and audio models may require additional memory for image/audio processing
- **PhysicsNeMo Models**: Physics-ML models (FNO, PINN, GraphCast, SFNO) typically require higher batch sizes for training
- **Token Estimation**: For multimodal models, consider image patches (~256-1024 tokens per image) and audio frames
- **Physics Simulations**: PhysicsNeMo models often work with spatial/temporal grids rather than tokens
- Estimates are approximate and may vary based on actual implementation details
- Memory calculations include model weights, KV cache, and operational overhead
- Consider network bandwidth and storage requirements for multi-node setups
- For production deployments, add 10-20% buffer for optimal performance
""")

    return demo
if __name__ == "__main__":
    # Build the app, then serve it on port 7860.
    demo = create_interface()
    # NOTE(review): server_name="0.0.0.0" listens on all interfaces and
    # share=True asks Gradio for a share link - confirm this exposure is
    # intended for the deployment target.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
    )