README.md CHANGED

@@ -69,7 +69,7 @@ python app.py
 - Fine-tuning: 2.5x (moderate overhead)
 
 ### Node Calculation
-- **H100
+- **H100 Node**: 8 × H100 GPUs per node = 640GB HBM3 total (576GB usable per node)
 - **Model Parallelism**: Automatic consideration for large models
 - **Memory Efficiency**: Optimal distribution across nodes
 
@@ -78,9 +78,9 @@ python app.py
 | Model | Tokens (In/Out) | Batch Size | Use Case | Precision | Estimated Nodes |
 |-------|----------------|------------|----------|-----------|----------------|
 | LLaMA-3-8B | 2048/512 | 1 | Inference | FP16 | 1 |
-| LLaMA-3-70B | 4096/1024 | 4 | Inference | FP16 |
-| Qwen2.5-72B | 8192/2048 | 2 | Fine-tuning | BF16 |
-| Nemotron-4-340B | 2048/1024 | 1 | Inference | INT8 |
+| LLaMA-3-70B | 4096/1024 | 4 | Inference | FP16 | 1 |
+| Qwen2.5-72B | 8192/2048 | 2 | Fine-tuning | BF16 | 1 |
+| Nemotron-4-340B | 2048/1024 | 1 | Inference | INT8 | 1-2 |
 
 ## CUDA Recommendations
 
@@ -139,6 +139,8 @@ This project is licensed under the MIT License - see the LICENSE file for detail
 
 ## Notes
 
+- **Node Configuration**: Each H100 node contains 8 × H100 GPUs (640GB total memory)
 - For production deployments, consider adding a 10-20% buffer to estimates
 - Network bandwidth and storage requirements are not included in calculations
 - Estimates assume optimal memory layout and efficient implementations
+- Multi-node setups require high-speed interconnects (InfiniBand/NVLink) for optimal performance
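The estimated-node column can be sanity-checked against the sizing formula app.py applies (model weights plus KV cache, scaled by a use-case overhead, divided by the usable memory per node). Below is a minimal sketch of that arithmetic for the LLaMA-3-70B row; the per-token KV-cache size is an assumption, since app.py's KV-cache formula is not visible in this diff, so the numbers are illustrative only.

```python
import math

# Constants mirrored from the app.py hunks below
H100_MEMORY_GB = 80                                          # memory per GPU
H100_GPUS_PER_NODE = 8                                       # GPUs per node
H100_NODE_MEMORY_GB = H100_MEMORY_GB * H100_GPUS_PER_NODE    # 640 GB per node
USABLE_PER_NODE_GB = H100_NODE_MEMORY_GB * 0.9               # 576 GB usable (10% reserved)

# LLaMA-3-70B row from the table; the KV-cache size per token is an
# assumption, not a value taken from app.py
params_billion = 70
bytes_per_param = 2                  # FP16 weights
kv_gb_per_token = 0.00032            # ~0.32 MB/token, rough FP16 KV-cache figure for a 70B model
tokens = 4096 + 1024                 # input + output tokens
batch_size = 4
overhead = 1.2                       # default inference overhead from app.py

model_memory_gb = params_billion * bytes_per_param     # ~140 GB of weights
kv_cache_gb = kv_gb_per_token * tokens * batch_size    # ~6.6 GB of KV cache
total_gb = (model_memory_gb + kv_cache_gb) * overhead  # ~176 GB

nodes_needed = max(1, math.ceil(total_gb / USABLE_PER_NODE_GB))
print(f"{total_gb:.0f} GB required -> {nodes_needed} H100 node(s)")  # ~176 GB -> 1 node
```

At FP16 the 70B weights alone are roughly 140 GB, comfortably inside the 576 GB usable per node, which is why the table lists a single node even at batch size 4.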
app.py CHANGED

@@ -30,7 +30,9 @@ MODEL_SPECS = {
 }
 
 # H100 specifications
-H100_MEMORY_GB = 80
+H100_MEMORY_GB = 80  # Memory per GPU
+H100_GPUS_PER_NODE = 8  # GPUs per node
+H100_NODE_MEMORY_GB = H100_MEMORY_GB * H100_GPUS_PER_NODE  # 640GB per node
 H100_COMPUTE_CAPABILITY = "9.0"
 
 # CUDA version recommendations based on model and use case
@@ -120,7 +122,7 @@ def estimate_h100_nodes(
     total_memory_per_instance = (model_memory + kv_cache_memory) * overhead_multiplier.get(use_case, 1.2)
 
     # Calculate nodes needed
-    memory_per_node =
+    memory_per_node = H100_NODE_MEMORY_GB * 0.9  # Reserve 10% for system (576GB usable per node)
     nodes_needed = max(1, int(np.ceil(total_memory_per_instance / memory_per_node)))
 
     # For very large models, consider model parallelism
@@ -138,9 +140,10 @@ def estimate_h100_nodes(
 • **KV Cache Memory**: {kv_cache_memory:.1f} GB (for {total_tokens:,} tokens × {batch_size} batch size)
 • **Use Case Overhead**: {overhead_multiplier.get(use_case, 1.2):.1f}x ({use_case})
 • **Total Memory Required**: {total_memory_per_instance:.1f} GB
-• **H100
+• **H100 Node Specs**: {H100_GPUS_PER_NODE} × {H100_MEMORY_GB}GB = {H100_NODE_MEMORY_GB}GB per node
+• **Usable Memory**: {memory_per_node:.1f} GB per node (10% reserved)
 
-**Recommendation**: {nodes_needed} H100 node(s)
+**Recommendation**: {nodes_needed} H100 node(s) ({nodes_needed * H100_GPUS_PER_NODE} H100 GPUs total)
 """
 
     breakdown = {
@@ -167,6 +170,7 @@ def get_cuda_recommendation(use_case: str) -> str:
 **Additional Requirements:**
 • **Driver Version**: 525.60.13+ (Linux) / 527.41+ (Windows)
 • **Compute Capability**: {H100_COMPUTE_CAPABILITY} (H100 native)
+• **Node Configuration**: {H100_GPUS_PER_NODE} × H100 GPUs per node ({H100_NODE_MEMORY_GB}GB total)
 • **Memory**: ECC enabled recommended for production
 """
 
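Taken together, the new constants make the node count a direct function of the memory estimate. The sketch below is a minimal, self-contained rendering of that step; `nodes_for` is a hypothetical helper name, and the totals fed to it are placeholders rather than outputs of app.py.

```python
import numpy as np

# Constants introduced by this change (mirrored from app.py)
H100_MEMORY_GB = 80                                          # memory per GPU
H100_GPUS_PER_NODE = 8                                       # GPUs per node
H100_NODE_MEMORY_GB = H100_MEMORY_GB * H100_GPUS_PER_NODE    # 640 GB per node

def nodes_for(total_memory_gb: float) -> int:
    """Map a total memory requirement (GB) to a whole number of H100 nodes."""
    # Reserve 10% of each node for the system, as in the patched estimate_h100_nodes()
    memory_per_node = H100_NODE_MEMORY_GB * 0.9              # 576 GB usable per node
    return max(1, int(np.ceil(total_memory_gb / memory_per_node)))

# Placeholder totals in GB, illustrative only
for total_gb in (176.0, 700.0):
    nodes = nodes_for(total_gb)
    print(f"{total_gb:.0f} GB -> {nodes} node(s) ({nodes * H100_GPUS_PER_NODE} H100 GPUs)")
```

Anything above 576 GB spills to a second node; the separate model-parallelism adjustment hinted at in estimate_h100_nodes() is not reproduced here.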