import os
import tempfile

import gradio as gr
import torch
from huggingface_hub import HfApi, list_models
from transformers import AutoModelForCausalLM, TorchAoConfig
def hello(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
    # Gradio injects a gr.OAuthProfile for the logged-in user;
    # if the user is not logged in, profile is None
    if profile is None:
        return "Hello!"
    return f"Hello {profile.name}!"
def check_model_exists(oauth_token: gr.OAuthToken | None, username, quantization_type, group_size, model_name, quantized_model_name):
    """Check if a model exists in the user's Hugging Face repository."""
    try:
        models = list_models(author=username, token=oauth_token.token)
        model_names = [model.id for model in models]
        if quantized_model_name:
            repo_name = f"{username}/{quantized_model_name}"
        elif quantization_type == "int4_weight_only":
            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-gs_{group_size}"
        else:
            repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
        if repo_name in model_names:
            return f"Model '{repo_name}' already exists in your repository."
        return None  # model does not exist yet
    except Exception as e:
        return f"Error checking model existence: {str(e)}"
def create_model_card(model_name, quantization_type, group_size, repo_name):
    model_card = f"""---
base_model:
- {model_name}
---

# {model_name} (Quantized)

## Description
This model is a quantized version of the original model `{model_name}`. It was quantized with torchao using {quantization_type} quantization.

## Quantization Details
- **Quantization Type**: {quantization_type}
- **Group Size**: {group_size if quantization_type == "int4_weight_only" else "N/A"}

## Usage
You can use this model in your applications by loading it directly from the Hugging Face Hub:

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("{repo_name}")
```"""
    return model_card
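# Illustrative rendering (not executed anywhere in this app): for
# model_name="meta-llama/Meta-Llama-3-8B" and quantization_type="int8_weight_only",
# the generated card starts with:
#
#   ---
#   base_model:
#   - meta-llama/Meta-Llama-3-8B
#   ---
#
#   # meta-llama/Meta-Llama-3-8B (Quantized)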
def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None):
    print(f"Quantizing model: {quantization_type}")
    # group_size is only meaningful for int4 weight-only quantization
    if quantization_type == "int4_weight_only":
        quantization_config = TorchAoConfig(quantization_type, group_size=group_size)
    else:
        quantization_config = TorchAoConfig(quantization_type)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cpu",
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config,
        token=auth_token.token,
    )
    return model
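# A minimal sketch of loading a checkpoint produced by this app back from the Hub.
# The repo id is a placeholder for whatever save_model below pushes; since the
# weights are saved with safe_serialization=False, this assumes a standard
# (non-safetensors) transformers checkpoint load.
#
#   from transformers import AutoModelForCausalLM
#   reloaded = AutoModelForCausalLM.from_pretrained(
#       "your-username/Meta-Llama-3-8B-torchao-int8_weight_only",  # placeholder
#       device_map="cpu",
#       torch_dtype=torch.bfloat16,
#   )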
def save_model(model, model_name, quantization_type, group_size=128, username=None, auth_token=None, quantized_model_name=None):
    print("Saving quantized model")
    # Work out the target repo name first so the model card can reference it
    if quantized_model_name:
        repo_name = f"{username}/{quantized_model_name}"
    elif quantization_type == "int4_weight_only":
        repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-gs_{group_size}"
    else:
        repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
    with tempfile.TemporaryDirectory() as tmpdirname:
        model_card = create_model_card(model_name, quantization_type, group_size, repo_name)
        with open(os.path.join(tmpdirname, "README.md"), "w") as f:
            f.write(model_card)
        # Saving locally needs no token; torchao tensors are not safetensors-
        # compatible, hence safe_serialization=False
        model.save_pretrained(tmpdirname, safe_serialization=False)
        # Push to Hub
        api = HfApi(token=auth_token.token)
        api.create_repo(repo_name, exist_ok=True)
        api.upload_folder(
            folder_path=tmpdirname,
            repo_id=repo_name,
            repo_type="model",
        )
    return f"https://huggingface.co/{repo_name}"
def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name):
    # profile and oauth_token are injected by Gradio for the logged-in user;
    # they are not part of the `inputs` list of the click handler below
    if oauth_token is None or profile is None:
        return "Error: Please sign in to your Hugging Face account to use the quantizer"
    exists_message = check_model_exists(oauth_token, profile.username, quantization_type, group_size, model_name, quantized_model_name)
    if exists_message:
        return exists_message
    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username)
    return save_model(quantized_model, model_name, quantization_type, group_size, profile.username, oauth_token, quantized_model_name)
# Custom CSS is passed to gr.Blocks at construction time so a single
# string holds all rules (successive app.css assignments would overwrite
# each other)
css = """
.center-button {
    display: flex;
    justify-content: center;
    align-items: center;
    margin: 0 auto; /* center the login button horizontally */
}
"""

with gr.Blocks(theme=gr.themes.Soft(), css=css) as app:
    gr.Markdown(
        """
        # 🚀 Model Quantization App
        Quantize your favorite Hugging Face models and save them to your profile!
        """
    )

    gr.LoginButton(elem_id="login-button", elem_classes="center-button")
    m1 = gr.Markdown()
    app.load(hello, inputs=None, outputs=m1)
    with gr.Row():
        with gr.Column():
            model_name = gr.Textbox(
                label="Model Name",
                placeholder="e.g., meta-llama/Meta-Llama-3-8B",
                value="meta-llama/Meta-Llama-3-8B",
            )
            quantization_type = gr.Dropdown(
                label="Quantization Type",
                choices=["int4_weight_only", "int8_weight_only", "int8_dynamic_activation_int8_weight"],
                value="int8_weight_only",
            )
            group_size = gr.Number(
                label="Group Size (only for int4_weight_only)",
                value=128,
                interactive=True,
            )
            quantized_model_name = gr.Textbox(
                label="Model Name (optional: overrides the default)",
                value="",
                interactive=True,
            )
        with gr.Column():
            quantize_button = gr.Button("Quantize and Save Model", variant="primary")
            output_link = gr.Textbox(label="Quantized Model Link")

    gr.Markdown(
        """
        ## Instructions
        1. Enter the name of the Hugging Face model you want to quantize.
        2. Choose the quantization type.
        3. Optionally, specify the group size (used only for int4_weight_only).
        4. Click "Quantize and Save Model" to start the process.
        5. Once complete, you'll receive a link to the quantized model on Hugging Face.

        Note: This process may take some time depending on the model size and your hardware.
        """
    )
    quantize_button.click(
        fn=quantize_and_save,
        inputs=[model_name, quantization_type, group_size, quantized_model_name],
        outputs=[output_link],
    )

# Launch the app
app.launch(share=True)