Spaces:
MekkCyber committed
Commit e5bb0c6
1 Parent(s): 3d2f5ba
AutoModel
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 import torch
-from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel
 import tempfile
 from huggingface_hub import HfApi
 from huggingface_hub import list_models
@@ -59,14 +59,14 @@ model = AutoModel.from_pretrained("{model_name}")"""
 
     return model_card
 
-def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None):
+def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None, device="cuda"):
     print(f"Quantizing model: {quantization_type}")
     if quantization_type == "int4_weight_only" :
         quantization_config = TorchAoConfig(quantization_type, group_size=group_size)
-        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
     else :
         quantization_config = TorchAoConfig(quantization_type)
-        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
+
+    model = AutoModel.from_pretrained(model_name, device_map=device, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
 
     return model
 
@@ -97,7 +97,7 @@ def save_model(model, model_name, quantization_type, group_size=128, username=No
 
     return f"https://huggingface.co/{repo_name}"
 
-def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name):
+def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name, device):
     if oauth_token is None :
         return "Error : Please Sign In to your HuggingFace account to use the quantizer"
     if not profile:
@@ -105,14 +105,16 @@ def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToke
     exists_message = check_model_exists(oauth_token, profile.username, quantization_type, group_size, model_name, quantized_model_name)
     if exists_message :
         return exists_message
-    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username)
+    if quantization_type == "int4_weight_only" and device == "cpu" :
+        return "int4_weight_only not supported on cpu"
+    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username, device)
     return save_model(quantized_model, model_name, quantization_type, group_size, profile.username, oauth_token, quantized_model_name)
 
 
 with gr.Blocks(theme=gr.themes.Soft()) as app:
     gr.Markdown(
         """
-        # 🚀 Model Quantization App
+        # 🚀 LLM Model Quantization App
 
         Quantize your favorite Hugging Face models and save them to your profile!
         """
@@ -141,6 +143,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             value=128,
             interactive=True
         )
+        device = gr.Dropdown(
+            label="Device (int4 only works with cuda)",
+            choices=["cuda", "cpu"],
+            value="cuda"
+        )
         quantized_model_name = gr.Textbox(
             label="Model Name (optional : to override default)",
             value="",
@@ -162,7 +169,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
         """
         ## Instructions
         1. Login to your HuggingFace account
-        2. Enter the name of the Hugging Face model you want to quantize (Make sure you have access to it)
+        2. Enter the name of the Hugging Face LLM model you want to quantize (Make sure you have access to it)
         3. Choose the quantization type.
         4. Optionally, specify the group size.
         5. Optionally, choose a custom name for the quantized model
@@ -193,7 +200,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
 
     quantize_button.click(
         fn=quantize_and_save,
-        inputs=[model_name, quantization_type, group_size, quantized_model_name],
+        inputs=[model_name, quantization_type, group_size, quantized_model_name, device],
         outputs=[output_link]
     )
 
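For reference, the functional core of this commit is that the from_pretrained call moves out of the if/else branches and switches from AutoModelForCausalLM to AutoModel, with the target device passed in from the new UI dropdown. Below is a minimal standalone sketch of that load path; the model id, quantization type, and device values are hypothetical stand-ins for what the Space collects through its Gradio inputs, and the auth token is omitted on the assumption of a public model.

import torch
from transformers import TorchAoConfig, AutoModel

# Hypothetical inputs for illustration; the Space reads these from its Gradio UI.
model_name = "facebook/opt-125m"
quantization_type = "int8_weight_only"
device = "cpu"  # the app rejects int4_weight_only on cpu before reaching this point

# Only int4 takes a group_size; other TorchAo quant types are constructed bare,
# mirroring the branch in quantize_model.
if quantization_type == "int4_weight_only":
    quantization_config = TorchAoConfig(quantization_type, group_size=128)
else:
    quantization_config = TorchAoConfig(quantization_type)

# Single load site after the branch, as introduced by this commit.
model = AutoModel.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
)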