import json
from typing import Any

import gradio as gr
import pandas as pd
import requests
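# Effective bits per weight for each llama.cpp GGUF quant type, including the
# overhead of block scales and metadata; used to estimate weight memory.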
quants = {
"Q2_K": 3.35,
"Q3_K_S": 3.5,
"Q3_K_M": 3.91,
"Q3_K_L": 4.27,
"Q4_0": 4.55,
"Q4_K_S": 4.58,
"Q4_K_M": 4.85,
"Q5_0": 5.54,
"Q5_K_S": 5.54,
"Q5_K_M": 5.69,
"Q6_K": 6.59,
"Q8_0": 8.5,
}
def calc_model_size(parameters: int, quant: float) -> float:
    # weight bytes = parameter count * bits per weight / 8
    return parameters * quant // 8
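# Sanity check (illustrative, assumes a ~7.24B-parameter model such as Mistral-7B):
# 7.24e9 * 4.58 / 8 ≈ 4.14e9 bytes ≈ 4.14 GB of weights at Q4_K_S.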
def get_model_config(hf_model: str) -> dict[str, Any]:
    config = requests.get(
        f"https://huggingface.co/{hf_model}/raw/main/config.json"
    ).json()
    model_size = 0
    try:
        # safetensors models record the total weight size in the index metadata
        model_size = requests.get(
            f"https://huggingface.co/{hf_model}/raw/main/model.safetensors.index.json"
        ).json()["metadata"]["total_size"]
    except (requests.RequestException, KeyError, ValueError):
        try:
            # fall back to the PyTorch weight index for older repos
            model_size = requests.get(
                f"https://huggingface.co/{hf_model}/raw/main/pytorch_model.bin.index.json"
            ).json()["metadata"]["total_size"]
        except (requests.RequestException, KeyError, ValueError):
            # last resort: scrape the parameter count embedded in the model page
            model_page = requests.get(f"https://huggingface.co/{hf_model}").text
            param_props_idx = model_page.find('data-target="ModelSafetensorsParams"')
            param_props_start = model_page.rfind("<div", 0, param_props_idx)
            model_size = (
                json.loads(
                    [
                        prop
                        for prop in model_page[param_props_start:param_props_idx].split(" ")
                        if prop.startswith("data-props=")
                    ][0]
                    .split("=", 1)[1]
                    .replace('"', "")
                    .replace("&quot;", '"')
                )["safetensors"]["total"]
                * 2  # parameter count -> bytes, assuming fp16
            )
    # assume fp16 weights: 2 bytes per parameter
    config["parameters"] = model_size / 2
    return config
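# Note: both weight index files fetched above share the same JSON shape, e.g.
#   {"metadata": {"total_size": <total checkpoint bytes>}, "weight_map": {...}}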
def calc_input_buffer_size(model_config, context: int) -> float:
    # rough estimate of llama.cpp's input buffers, in bytes
    return 4096 + 2048 * model_config["hidden_size"] + context * 4 + context * 2048
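# e.g. hidden_size 4096 at 8192 tokens:
# 4096 + 2048 * 4096 + 8192 * 4 + 8192 * 2048 = 25,202,688 bytes ≈ 0.025 GB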
def calc_compute_buffer_size(model_config, context: int) -> float:
    # rough estimate of llama.cpp's compute (scratch) buffer, in bytes
    return (
        (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
    )
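# e.g. 32 attention heads at 8192 tokens:
# (8192 / 1024 * 2 + 0.75) * 32 * 1024 * 1024 = 562,036,736 bytes ≈ 0.56 GB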
def calc_context_size(model_config, context: int) -> float:
    # KV cache size in bytes: two tensors (K and V) of fp16 elements (2 bytes each),
    # shrunk by the grouped-query attention ratio n_gqa
    n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
    n_embd_gqa = model_config["hidden_size"] / n_gqa
    n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
    return 2 * n_elements * 2
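# Worked example (Mistral-7B: hidden_size 4096, 32 heads, 8 KV heads, 32 layers)
# at 8192 tokens: n_embd_gqa = 4096 / (32 / 8) = 1024, so the KV cache is
# 2 * (1024 * 32 * 8192) * 2 bytes ≈ 1.07 GB.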
def calc(model_base, context, quant_size):
    model_config = get_model_config(model_base)
    try:
        # a raw bits-per-weight value was entered (e.g. "4.5" for exl2)
        quant_bpw = float(quant_size)
    except ValueError:
        # otherwise look up the effective bpw of the named GGUF quant
        quant_bpw = quants[quant_size]
model_size = round(
calc_model_size(model_config["parameters"], quant_bpw) / 1000 / 1000 / 1000, 2
)
context_size = round(
(
calc_input_buffer_size(model_config, context)
+ calc_context_size(model_config, context)
+ calc_compute_buffer_size(model_config, context)
)
/ 1000
/ 1000
/ 1000,
2,
)
return model_size, context_size, round(model_size + context_size, 2)
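# Illustrative usage (actual numbers depend on the model's live config.json):
#   calc("mistralai/Mistral-7B-v0.1", 8192, "Q4_K_S") -> (model_gb, context_gb, total_gb)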
title = "GGUF VRAM Calculator"
with gr.Blocks(title=title, theme=gr.themes.Monochrome()) as app:
default_model = "mistralai/Mistral-7B-v0.1"
default_quant = "Q4_K_S"
default_context = 8192
default_size = calc(default_model, default_context, default_quant)
default_model_size = default_size[0]
default_context_size = default_size[1]
    gr.Markdown(
        f"# {app.title}\nThis is meant only as a guide and will not be 100% accurate. It also does not account for anything that might be running in the background on your system, or for CUDA system memory fallback on Windows."
    )
model = gr.Textbox(
value=default_model,
label="Enter Unquantized HF Model Name (e.g. mistralai/Mistral-7B-v0.1)",
)
context = gr.Number(
minimum=1, value=default_context, label="Desired Context Size (Tokens)"
)
quant = gr.Dropdown(
choices=list(quants.keys()),
value=default_quant,
allow_custom_value=True,
label="Enter GGUF Quant (e.g. Q4_K_S) or the specific BPW for other quantization schemes such as exl2 (e.g. 4.5)",
)
btn = gr.Button(value="Submit", variant="primary")
btn.click(
calc,
inputs=[
model,
context,
quant,
],
outputs=[
gr.Number(
label="Model Size (GB)",
value=default_size[0],
),
gr.Number(
label="Context Size (GB)",
value=default_size[1],
),
gr.Number(
label="Total Size (GB)",
value=default_size[2],
),
],
)
app.launch()