Spaces:

medmekk
/

TorchAO_Quantization

Running on A100

App Files Files Community

MekkCyber committed on Oct 17, 2024

Commit

9b71f2b

1 Parent(s): 677834b

Add app file

Browse files

Files changed (3) hide show

README.md +15 -6
app.py +327 -0
requirements.txt +4 -0

README.md CHANGED Viewed

@@ -1,13 +1,22 @@
 ---
-title: TorchAO Quantization
-emoji: 🏃
-colorFrom: red
-colorTo: green
 sdk: gradio
-sdk_version: 5.1.0
 app_file: app.py
 pinned: false
-short_description: The Go To space to quantize your models using Torchao simply
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: QuantizationTorchAODraft
+emoji: 💻
+colorFrom: blue
+colorTo: red
 sdk: gradio
+sdk_version: 5.0.1
 app_file: app.py
 pinned: false
+hf_oauth: true
+# optional, default duration is 8 hours/480 minutes. Max duration is 30 days/43200 minutes.
+hf_oauth_expiration_minutes: 480
+# optional, see "Scopes" below. "openid profile" is always included.
+hf_oauth_scopes:
+ - read-repos
+ - write-repos
+ - manage-repos
+ - inference-api
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,327 @@

+import gradio as gr
+import torch
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+import tempfile
+from huggingface_hub import HfApi
+from huggingface_hub import list_models
+from packaging import version
+import os
+def hello(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
+    # ^ expect a gr.OAuthProfile object as input to get the user's profile
+    # if the user is not logged in, profile will be None
+    if profile is None:
+        return "Hello !"
+    return f"Hello {profile.name} !"
+def check_model_exists(oauth_token: gr.OAuthToken | None, username, quantization_type, group_size, model_name, quantized_model_name):
+    """Check if a model exists in the user's Hugging Face repository."""
+    try:
+        models = list_models(author=username, token=oauth_token.token)
+        model_names = [model.id for model in models]
+        if quantized_model_name :
+            repo_name = f"{username}/{quantized_model_name}"
+        else :
+            if quantization_type == "int4_weight_only" :
+                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-gs_{group_size}"
+            else :
+                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
+        if repo_name in model_names:
+            return f"Model '{repo_name}' already exists in your repository."
+        else:
+            return None  # Model does not exist
+    except Exception as e:
+        return f"Error checking model existence: {str(e)}"
+def create_model_card(model_name, quantization_type, group_size):
+    model_card = f"""---
+base_model:
+- {model_name}
+---
+# {model_name} (Quantized)
+## Description
+This model is a quantized version of the original model `{model_name}`. It has been quantized using {quantization_type} quantization with torchao.
+## Quantization Details
+- **Quantization Type**: {quantization_type}
+- **Group Size**: {group_size if quantization_type == "int4_weight_only" else None}
+## Usage
+You can use this model in your applications by loading it directly from the Hugging Face Hub:
+```python
+from transformers import AutoModel
+model = AutoModel.from_pretrained("{model_name}")"""
+    return model_card
+def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None):
+    print(f"Quantizing model: {quantization_type}")
+    if quantization_type == "int4_weight_only" :
+        quantization_config = TorchAoConfig(quantization_type, group_size=group_size)
+    else :
+        quantization_config = TorchAoConfig(quantization_type)
+    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
+    return model
+def save_model(model, model_name, quantization_type, group_size=128, username=None, auth_token=None, quantized_model_name=None):
+    print("Saving quantized model")
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        model_card = create_model_card(model_name, quantization_type, group_size)
+        with open(os.path.join(tmpdirname, "README.md"), "w") as f:
+            f.write(model_card)
+        model.save_pretrained(tmpdirname, safe_serialization=False, use_auth_token=auth_token.token)
+        if quantized_model_name :
+            repo_name = f"{username}/{quantized_model_name}"
+        else :
+            if quantization_type == "int4_weight_only" :
+                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-gs_{group_size}"
+            else :
+                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
+        # Push to Hub
+        api = HfApi()
+        api.create_repo(repo_name, exist_ok=True)
+        api.upload_folder(
+            folder_path=tmpdirname,
+            repo_id=repo_name,
+            repo_type="model",
+        )
+    return f"https://huggingface.co/{repo_name}"
+def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name):
+    if oauth_token is None :
+        return "Error : Please Sign In to your HuggingFace account to use the quantizer"
+    if not profile:
+        return "Error: Please Sign In to your HuggingFace account to use the quantizer"
+    exists_message = check_model_exists(oauth_token, profile.username, quantization_type, group_size, model_name, quantized_model_name)
+    if exists_message :
+        return exists_message
+    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username)
+    return save_model(quantized_model, model_name, quantization_type, group_size, profile.username, oauth_token, quantized_model_name)
+with gr.Blocks(theme=gr.themes.Soft()) as app:
+    gr.Markdown(
+        """
+        # 🚀 Model Quantization App
+        Quantize your favorite Hugging Face models and save them to your profile!
+        """
+    )
+    gr.LoginButton(elem_id="login-button", elem_classes="center-button")
+    m1 = gr.Markdown()
+    app.load(hello, inputs=None, outputs=m1)
+    with gr.Row():
+        with gr.Column():
+            model_name = gr.Textbox(
+                label="Model Name",
+                placeholder="e.g., meta-llama/Meta-Llama-3-8B",
+                value="meta-llama/Meta-Llama-3-8B"
+            )
+            quantization_type = gr.Dropdown(
+                label="Quantization Type",
+                choices=["int4_weight_only", "int8_weight_only", "int8_dynamic_activation_int8_weight"],
+                value="int8_weight_only"
+            )
+            group_size = gr.Number(
+                label="Group Size (only for int4_weight_only)",
+                value=128,
+                interactive=True
+            )
+            quantized_model_name = gr.Textbox(
+                label="Model Name (optional : to override default)",
+                value="",
+                interactive=True
+            )
+            # with gr.Row():
+            #     username = gr.Textbox(
+            #         label="Hugging Face Username",
+            #         placeholder="Enter your Hugging Face username",
+            #         value="",
+            #         interactive=True,
+            #         elem_id="username-box"
+            #     )
+        with gr.Column():
+            quantize_button = gr.Button("Quantize and Save Model", variant="primary")
+            output_link = gr.Textbox(label="Quantized Model Link")
+    gr.Markdown(
+        """
+        ## Instructions
+        1. Enter the name of the Hugging Face model you want to quantize.
+        2. Choose the quantization type.
+        3. Optionally, specify the group size.
+        4. Click "Quantize and Save Model" to start the process.
+        5. Once complete, you'll receive a link to the quantized model on Hugging Face.
+        Note: This process may take some time depending on the model size and your hardware.
+        """
+    )
+    # Adding CSS styles for the username box
+    app.css = """
+    #username-box {
+        background-color: #f0f8ff; /* Light color */
+        border-radius: 8px;
+        padding: 10px;
+    }
+    """
+    app.css = """
+    .center-button {
+        display: flex;
+        justify-content: center;
+        align-items: center;
+        margin: 0 auto; /* Center horizontally */
+    }
+    """
+    quantize_button.click(
+        fn=quantize_and_save,
+        inputs=[model_name, quantization_type, group_size, quantized_model_name],
+        outputs=[output_link]
+    )
+# Launch the app
+app.launch(share=True)
+from torchao.quantization import (
+                int4_weight_only,
+                int8_dynamic_activation_int8_weight,
+                int8_weight_only,
+            )
+# import gradio as gr
+# import torch
+# from transformers import AutoModelForCausalLM, AutoTokenizer
+# import torch.ao.quantization as quant
+# import os
+# from huggingface_hub import HfApi
+# import tempfile
+# import torch.utils.data as data
+# from torchao.quantization import quantize_
+# def load_calibration_dataset(tokenizer, num_samples=100):
+#     # This is a placeholder. In a real scenario, you'd load actual data.
+#     dummy_texts = ["This is a sample text" for _ in range(num_samples)]
+#     encodings = tokenizer(dummy_texts, truncation=True, padding=True, return_tensors="pt")
+#     dataset = data.TensorDataset(encodings['input_ids'], encodings['attention_mask'])
+#     return data.DataLoader(dataset, batch_size=1)
+# def load_model(model_name):
+#     print(f"Loading model: {model_name}")
+#     model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto")
+#     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+#     return model, tokenizer
+# def quantize_model(model, quant_type, dtype):
+#     print(f"Quantizing model: {quant_type} - {dtype}")
+#     quantize_(model, _STR_TO_METHOD[dtype](group_size=128))
+# def save_model(model, model_name, quant_type, dtype):
+#     print("Saving quantized model")
+#     model.save_pretrained("medmekk/model_llama", safe_serialization=False)
+#     with tempfile.TemporaryDirectory() as tmpdirname:
+#         model.save_pretrained(tmpdirname)
+#         # Create a new repo name
+#         repo_name = f"{model_name.split('/')[-1]}-quantized-{quant_type.lower()}-{dtype}bit"
+#         # Push to Hub
+#         api = HfApi()
+#         api.create_repo(repo_name, exist_ok=True)
+#         api.upload_folder(
+#             folder_path=tmpdirname,
+#             repo_id=repo_name,
+#             repo_type="model",
+#         )
+#     return f"https://huggingface.co/{repo_name}"
+# _STR_TO_METHOD = {
+#     "int4_weight_only": int4_weight_only,
+#     "int8_weight_only": int8_weight_only,
+#     "int8_dynamic_activation_int8_weight": int8_dynamic_activation_int8_weight,
+# }
+# def quantize_and_save(model_name, quant_type, dtype):
+#     model, tokenizer = load_model(model_name)
+#     quantize_model(model, quant_type, dtype)
+#     print(model.device)
+#     return save_model(model, model_name, quant_type, dtype)
+# # Gradio interface
+# with gr.Blocks(theme=gr.themes.Soft()) as app:
+#     gr.Markdown(
+#         """
+#         # 🚀 Model Quantization App
+#         Quantize your favorite Hugging Face models and save them to your profile!
+#         """
+#     )
+#     with gr.Row():
+#         with gr.Column():
+#             model_name = gr.Textbox(
+#                 label="Model Name",
+#                 placeholder="e.g., gpt2, distilgpt2",
+#                 value="meta-llama/Meta-Llama-3-8B-Instruct"
+#             )
+#             quant_type = gr.Dropdown(
+#                 label="Quantization Type",
+#                 choices=["Dynamic", "Static"],
+#                 value="Dynamic"
+#             )
+#             dtype = gr.Dropdown(
+#                 label="Data Type",
+#                 choices=["int4_weight_only", "int8_weight_only", "int8_dynamic_activation_int8_weight"],
+#                 value="int4_weight_only"
+#             )
+#         with gr.Column():
+#             quantize_button = gr.Button("Quantize and Save Model", variant="primary")
+#             output_link = gr.Textbox(label="Output", interactive=False)
+#     gr.Markdown(
+#         """
+#         ## Instructions
+#         1. Enter the name of the Hugging Face model you want to quantize.
+#         2. Choose the quantization type.
+#         3. If using Weight Only quantization, select the number of bits.
+#         4. Click "Quantize and Save Model" to start the process.
+#         5. Once complete, you'll receive a link to the quantized model on Hugging Face.
+#         Note: This process may take some time depending on the model size and your hardware.
+#         """
+#     )
+#     quantize_button.click(
+#         fn=quantize_and_save,
+#         inputs=[model_name, quant_type, dtype],
+#         outputs=[output_link]
+#     )
+# # Launch the app
+# app.launch(share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+git+https://github.com/huggingface/transformers.git@main#egg=transformers
+accelerate
+torchao
+huggingface-hub