import streamlit as st
from streamlit_datalist import stDatalist
import pandas as pd
from utils import extract_from_url, get_model, calculate_memory
import plotly.express as px
import numpy as np
import gc
from huggingface_hub import login
st.set_page_config(page_title='Can you run it? LLM version', layout="wide", initial_sidebar_state="expanded")
model_list = [
    "NousResearch/Meta-Llama-3-8B-Instruct",
    "NousResearch/Meta-Llama-3-70B-Instruct",
    "mistral-community/Mistral-7B-v0.2",
    # "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "mistral-community/Mixtral-8x22B-v0.1",
    "HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1",
    # "CohereForAI/c4ai-command-r-plus",
    # "CohereForAI/c4ai-command-r-v01",
    "hpcai-tech/grok-1",
    "NexaAIDev/Octopus-v2",
    "HuggingFaceH4/zephyr-7b-gemma-v0.1",
    "HuggingFaceH4/starchat2-15b-v0.1",
    "deepseek-ai/deepseek-coder-6.7b-instruct",
    "deepseek-ai/deepseek-coder-1.3b-base",
    "microsoft/phi-2",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "codellama/CodeLlama-7b-hf",
    "codellama/CodeLlama-13b-hf",
    "codellama/CodeLlama-34b-hf",
    "Phind/Phind-CodeLlama-34B-v2",
    "tiiuae/falcon-40b",
    "tiiuae/falcon-40b-instruct",
    "tiiuae/falcon-180B",
    "tiiuae/falcon-180B-chat",
]
st.title("Can you run it? LLM version")
percentage_width_main = 80
st.markdown(
    f"""<style>
    .appview-container .main .block-container{{
        max-width: {percentage_width_main}%;}}
    </style>
    """,
    unsafe_allow_html=True,
)
@st.cache_resource()
def cache_model_list():
    model_list_info = {}
    for model_name in model_list:
        if "tiiuae/falcon" not in model_name:  # Exclude Falcon models from the pre-cached list
            model = get_model(model_name, library="transformers", access_token="")
            model_list_info[model_name] = calculate_memory(model, ["float32", "float16/bfloat16", "int8", "int4"])
            del model
            gc.collect()
    return model_list_info
@st.cache_resource
def get_gpu_specs():
    return pd.read_csv("data/gpu_specs.csv")
# @st.cache_resource
# def get_mistralai_table():
# model = get_model("mistralai/Mistral-7B-v0.1", library="transformers", access_token="")
# return calculate_memory(model, ["float32", "float16/bfloat16", "int8", "int4"])
def show_gpu_info(info, trainable_params=0, vendor=""):
    for var in ['Inference', 'Full Training Adam', 'LoRa Fine-tuning']:
        _info = info.loc[var]
        if vendor != "Apple":
            if _info['Number of GPUs'] >= 3:
                func = st.error
                icon = "⛔"
            elif _info['Number of GPUs'] == 2:
                func = st.warning
                icon = "⚠️"
            else:
                func = st.success
                icon = "✅"
            msg = f"You require **{_info['Number of GPUs']}** GPUs for **{var}**"
            if var == 'LoRa Fine-tuning':
                msg += f" ({trainable_params}%)"
        else:
            if _info['Number of GPUs'] == 1:
                msg = f"You can run **{var}**"
                func = st.success
                icon = "✅"
            else:
                msg = f"You cannot run **{var}**"
                func = st.error
                icon = "⛔"
        func(msg, icon=icon)
def get_name(index):
    row = gpu_specs.iloc[index]
    return f"{row['Product Name']} ({row['RAM (GB)']} GB, {row['Year']})"

def custom_ceil(a, precision=0):
    return np.round(a + 0.5 * 10**(-precision), precision)
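# e.g. custom_ceil(7.02, 1) -> 7.1: adding 0.5 * 10**(-precision) before rounding
# approximates a ceiling at the requested number of decimals.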
gpu_specs = get_gpu_specs()
model_list_info = cache_model_list()
_, col, _ = st.columns([1, 3, 1])
with col.expander("Information", expanded=True):
    st.markdown("""- GPU information comes from [TechPowerUp GPU Specs](https://www.techpowerup.com/gpu-specs/)
- Mainly based on [Model Memory Calculator by hf-accelerate](https://huggingface.co/spaces/hf-accelerate/model-memory-usage)
using the `transformers` library
- Inference memory is estimated following [EleutherAI Transformer Math 101](https://blog.eleuther.ai/transformer-math/) as""")
    st.latex(r"""\text{Memory}_\text{Inference} \approx \text{Model Size} \times 1.2""")
    st.markdown("""- For LoRa Fine-tuning, I'm assuming a **16-bit** dtype for the trainable parameters. The formula (in GB) is""")
    st.latex(r"\text{Memory}_\text{LoRa} \approx \left(\text{Model Size} + \text{\# trainable Params}_\text{Billions}\times\frac{16}{8} \times 4\right) \times 1.2")
access_token = st.sidebar.text_input("Access token")
if access_token:
    login(token=access_token)
#model_name = st.sidebar.text_input("Model name", value="mistralai/Mistral-7B-v0.1")
with st.sidebar.container():
    model_name = stDatalist("Model name (Press Enter to apply)", model_list, index=0)
if not model_name:
    st.info("Please enter a model name")
    st.stop()
model_name = extract_from_url(model_name)
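# Keep only the currently selected model's memory table in session state; when the model
# changes, drop the previous entry so its data can be garbage-collected.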
if model_name not in st.session_state:
    if 'actual_model' in st.session_state:
        del st.session_state[st.session_state['actual_model']]
        del st.session_state['actual_model']
        gc.collect()
    if model_name in model_list_info:
        st.session_state[model_name] = model_list_info[model_name]
    else:
        model = get_model(model_name, library="transformers", access_token=access_token)
        st.session_state[model_name] = calculate_memory(model, ["float32", "float16/bfloat16", "int8", "int4"])
        del model
        gc.collect()
    st.session_state['actual_model'] = model_name
gpu_vendor = st.sidebar.selectbox("GPU Vendor", ["NVIDIA", "AMD", "Intel", "Apple"])
# year = st.sidebar.selectbox("Filter by Release Year", list(range(2014, 2024))[::-1], index=None)
gpu_info = gpu_specs[gpu_specs['Vendor'] == gpu_vendor].sort_values('Product Name')
# if year:
# gpu_info = gpu_info[gpu_info['Year'] == year]
min_ram = gpu_info['RAM (GB)'].min()
max_ram = gpu_info['RAM (GB)'].max()
ram = st.sidebar.slider("Filter by RAM (GB)", min_ram, max_ram, (10.0, 40.0), step=0.5)
gpu_info = gpu_info[gpu_info["RAM (GB)"].between(ram[0], ram[1])]
if len(gpu_info) == 0:
    st.sidebar.error(f"**{gpu_vendor}** has no GPU in that RAM range")
    st.stop()
gpu = st.sidebar.selectbox("GPU", gpu_info['Product Name'].index.tolist(), format_func=lambda x : gpu_specs.iloc[x]['Product Name'])
gpu_spec = gpu_specs.iloc[gpu]
gpu_spec.name = 'INFO'
lora_pct = st.sidebar.slider("LoRa % trainable parameters", 0.1, 100.0, 2.0, step=0.1)
st.sidebar.dataframe(gpu_spec.T.astype(str))
memory_table = pd.DataFrame(st.session_state[model_name]).set_index('dtype')
memory_table['LoRA Fine-Tuning (GB)'] = (memory_table["Total Size (GB)"] +
                                         (memory_table["Parameters (Billion)"] * lora_pct / 100 * (16 / 8) * 4)) * 1.2
_memory_table = memory_table.copy()
memory_table = memory_table.round(2).T
_memory_table /= gpu_spec['RAM (GB)']
_memory_table = _memory_table.apply(np.ceil).astype(int).drop(columns=['Parameters (Billion)', 'Total Size (GB)'])
_memory_table.columns = ['Inference', 'Full Training Adam', 'LoRa Fine-tuning']
_memory_table = _memory_table.stack().reset_index()
_memory_table.columns = ['dtype', 'Variable', 'Number of GPUs']
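# _memory_table is now in long format: one row per (dtype, variable) combination with the
# number of GPUs required, which feeds both show_gpu_info and the bar chart below.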
col1, col2 = st.columns([1,1.3])
if gpu_vendor == "Apple":
col.warning("""For M1/M2/M3 Apple chips, PyTorch uses [Metal Performance Shaders (MPS)](https://huggingface.co/docs/accelerate/usage_guides/mps) as backend.\\
Remember that Apple M1/M2/M3 chips share memory between CPU and GPU.""", icon="⚠️")
with col1:
    st.write(f"#### [{model_name}](https://huggingface.co/{model_name}) ({custom_ceil(memory_table.iloc[3, 0], 1):.1f}B)")
    dtypes = memory_table.columns.tolist()[::-1]
    tabs = st.tabs(dtypes)
    for dtype, tab in zip(dtypes, tabs):
        with tab:
            if dtype in ["int4", "int8"]:
                _dtype = dtype.replace("int", "")
                st.markdown(f"`int{_dtype}` refers to models quantized as `GPTQ-{_dtype}bit`, `AWQ-{_dtype}bit` or `Q{_dtype}_0 GGUF/GGML`")
            info = _memory_table[_memory_table['dtype'] == dtype].set_index('Variable')
            show_gpu_info(info, lora_pct, gpu_vendor)
    st.write(memory_table.iloc[[0, 1, 2, 4]])
with col2:
    extra = ""
    if gpu_vendor == "Apple":
        st.warning("This graph is not relevant for Apple M1/M2/M3 chips, since multiple chips cannot be used in parallel.", icon="⚠️")
        extra = "⚠️"
    num_colors = 4
    colors = [px.colors.sequential.RdBu[int(i * (len(px.colors.sequential.RdBu) - 1) / (num_colors - 1))] for i in range(num_colors)]
    fig = px.bar(_memory_table, x='Variable', y='Number of GPUs', color='dtype', barmode='group', color_discrete_sequence=colors)
    fig.update_layout(title=dict(text=f"{extra} Number of GPUs required for<br> {get_name(gpu)}", font=dict(size=25)),
                      xaxis_tickfont_size=14, yaxis_tickfont_size=16, yaxis_dtick=1)
    st.plotly_chart(fig, use_container_width=True)