MekkCyber committed
Commit 63d14c6 · 1 Parent(s): ce0c4f3

add app logic
Files changed (3):
  1. README.md +15 -6
  2. app.py +221 -4
  3. requirement.txt +6 -0
README.md CHANGED
@@ -1,13 +1,22 @@
 ---
-title: TorchAO
-emoji: 🚀
-colorFrom: gray
-colorTo: blue
+title: QuantizationTorchAODraft
+emoji: 💻
+colorFrom: blue
+colorTo: red
 sdk: gradio
-sdk_version: 5.10.0
+sdk_version: 4.27.0
 app_file: app.py
 pinned: false
-short_description: TorchAO Quantization
+
+hf_oauth: true
+# optional, default duration is 8 hours/480 minutes. Max duration is 30 days/43200 minutes.
+hf_oauth_expiration_minutes: 480
+# optional, see "Scopes" below. "openid profile" is always included.
+hf_oauth_scopes:
+  - read-repos
+  - write-repos
+  - manage-repos
+  - inference-api
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
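
The `hf_oauth` block added above is what powers the sign-in flow in `app.py`: with OAuth enabled on a Space, Gradio injects the logged-in user's identity and token into any event handler that declares `gr.OAuthProfile` / `gr.OAuthToken` parameters (both are `None` for anonymous visitors). A minimal sketch of that pattern, reduced from the `hello` handler in the app.py diff below:

```python
import gradio as gr

def whoami(profile: gr.OAuthProfile | None) -> str:
    # Gradio fills this parameter automatically when hf_oauth is enabled;
    # it is None when the visitor has not signed in.
    if profile is None:
        return "Not signed in."
    return f"Signed in as {profile.username}."

with gr.Blocks() as demo:
    gr.LoginButton()
    status = gr.Markdown()
    demo.load(whoami, inputs=None, outputs=status)

demo.launch()
```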
app.py CHANGED
@@ -1,7 +1,224 @@
 import gradio as gr
-
-def greet(name):
-    return "Hello " + name + "!!"
-
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
+import torch
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel
+import tempfile
+from huggingface_hub import HfApi
+from huggingface_hub import list_models
+from gradio_huggingfacehub_search import HuggingfaceHubSearch
+from packaging import version
+import os
+import spaces
+
+def hello(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
+    # ^ expect a gr.OAuthProfile object as input to get the user's profile;
+    # if the user is not logged in, profile will be None
+    if profile is None:
+        return "Hello !"
+    return f"Hello {profile.name} !"
+
+def check_model_exists(oauth_token: gr.OAuthToken | None, username, quantization_type, group_size, model_name, quantized_model_name):
+    """Check if a model already exists in the user's Hugging Face repositories."""
+    try:
+        models = list_models(author=username, token=oauth_token.token)
+        model_names = [model.id for model in models]
+        if quantized_model_name:
+            repo_name = f"{username}/{quantized_model_name}"
+        else:
+            if quantization_type == "int4_weight_only":
+                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-gs_{group_size}"
+            else:
+                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
+
+        if repo_name in model_names:
+            return f"Model '{repo_name}' already exists in your repository."
+        else:
+            return None  # Model does not exist
+    except Exception as e:
+        return f"Error checking model existence: {str(e)}"
+
+def create_model_card(model_name, quantization_type, group_size):
+    model_card = f"""---
+base_model:
+- {model_name}
+---
+
+# {model_name} (Quantized)
+
+## Description
+This model is a quantized version of the original model `{model_name}`. It was quantized using {quantization_type} quantization with torchao.
+
+## Quantization Details
+- **Quantization Type**: {quantization_type}
+- **Group Size**: {group_size if quantization_type == "int4_weight_only" else None}
+
+## Usage
+You can use this model in your applications by loading it directly from the Hugging Face Hub:
+
+```python
+from transformers import AutoModel
+
+model = AutoModel.from_pretrained("{model_name}")
+```"""
+
+    return model_card
+
+@spaces.GPU
+def load_model_gpu(model_name, quantization_config, auth_token):
+    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
+
+def load_model_cpu(model_name, quantization_config, auth_token):
+    return AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
+
+def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None, device="cuda"):
+    print(f"Quantizing model: {quantization_type}")
+    if quantization_type == "int4_weight_only":
+        quantization_config = TorchAoConfig(quantization_type, group_size=group_size)
+    else:
+        quantization_config = TorchAoConfig(quantization_type)
+    if device == "cuda":
+        model = load_model_gpu(model_name, quantization_config=quantization_config, auth_token=auth_token)
+    else:
+        model = load_model_cpu(model_name, quantization_config=quantization_config, auth_token=auth_token)
+
+    return model
+
+def save_model(model, model_name, quantization_type, group_size=128, username=None, auth_token=None, quantized_model_name=None):
+    print("Saving quantized model")
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        model.save_pretrained(tmpdirname, safe_serialization=False, use_auth_token=auth_token.token)
+        if quantized_model_name:
+            repo_name = f"{username}/{quantized_model_name}"
+        else:
+            if quantization_type == "int4_weight_only":
+                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-gs_{group_size}"
+            else:
+                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
+
+        model_card = create_model_card(repo_name, quantization_type, group_size)
+        with open(os.path.join(tmpdirname, "README.md"), "w") as f:
+            f.write(model_card)
+        # Push to Hub
+        api = HfApi(token=auth_token.token)
+        api.create_repo(repo_name, exist_ok=True)
+        api.upload_folder(
+            folder_path=tmpdirname,
+            repo_id=repo_name,
+            repo_type="model",
+        )
+
+    return f"https://huggingface.co/{repo_name}"
+
+def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name, device):
+    if oauth_token is None:
+        return "Error: Please sign in to your HuggingFace account to use the quantizer"
+    if not profile:
+        return "Error: Please sign in to your HuggingFace account to use the quantizer"
+    exists_message = check_model_exists(oauth_token, profile.username, quantization_type, group_size, model_name, quantized_model_name)
+    if exists_message:
+        return exists_message
+    if quantization_type == "int4_weight_only" and device == "cpu":
+        return "int4_weight_only is not supported on cpu"
+    # try:
+    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username, device)
+    return save_model(quantized_model, model_name, quantization_type, group_size, profile.username, oauth_token, quantized_model_name)
+    # except Exception as e:
+    #     return e
+
+
+with gr.Blocks(theme=gr.themes.Soft()) as app:
+    gr.Markdown(
+        """
+        # 🚀 LLM Model Quantization App
+
+        Quantize your favorite Hugging Face models and save them to your profile!
+        """
+    )
+
+    gr.LoginButton(elem_id="login-button", elem_classes="center-button")
+
+    m1 = gr.Markdown()
+    app.load(hello, inputs=None, outputs=m1)
+
+    with gr.Row():
+        with gr.Column():
+            model_name = HuggingfaceHubSearch(
+                label="Hub Model ID",
+                placeholder="Search for model id on Huggingface",
+                search_type="model",
+            )
+            quantization_type = gr.Dropdown(
+                label="Quantization Type",
+                choices=["int4_weight_only", "int8_weight_only", "int8_dynamic_activation_int8_weight"],
+                value="int8_weight_only"
+            )
+            group_size = gr.Number(
+                label="Group Size (only for int4_weight_only)",
+                value=128,
+                interactive=True
+            )
+            device = gr.Dropdown(
+                label="Device (int4 only works with cuda)",
+                choices=["cuda", "cpu"],
+                value="cuda"
+            )
+            quantized_model_name = gr.Textbox(
+                label="Model Name (optional: overrides the default)",
+                value="",
+                interactive=True
+            )
+            # with gr.Row():
+            #     username = gr.Textbox(
+            #         label="Hugging Face Username",
+            #         placeholder="Enter your Hugging Face username",
+            #         value="",
+            #         interactive=True,
+            #         elem_id="username-box"
+            #     )
+        with gr.Column():
+            quantize_button = gr.Button("Quantize and Save Model", variant="primary")
+            output_link = gr.Textbox(label="Quantized Model Link")
+
+    gr.Markdown(
+        """
+        ## Instructions
+        1. Log in to your HuggingFace account.
+        2. Enter the name of the Hugging Face LLM model you want to quantize (make sure you have access to it).
+        3. Choose the quantization type.
+        4. Optionally, specify the group size.
+        5. Optionally, choose a custom name for the quantized model.
+        6. Click "Quantize and Save Model" to start the process.
+        7. Once complete, you'll receive a link to the quantized model on Hugging Face.
+
+        Note: This process may take some time depending on the model size and your hardware. You can check the container logs to see how far along the process is!
+        """
+    )
+
+    # CSS for the username box and the centered login button.
+    # Assigned once: a second assignment to app.css would overwrite the first.
+    app.css = """
+    #username-box {
+        background-color: #f0f8ff; /* Light color */
+        border-radius: 8px;
+        padding: 10px;
+    }
+    .center-button {
+        display: flex;
+        justify-content: center;
+        align-items: center;
+        margin: 0 auto; /* Center horizontally */
+    }
+    """
+
+    quantize_button.click(
+        fn=quantize_and_save,
+        inputs=[model_name, quantization_type, group_size, quantized_model_name, device],
+        outputs=[output_link]
+    )
+
+
+# Launch the app
+app.launch()
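
For reference, a repo produced by this app should load back like any other Hub model, since `save_model` pushes the quantization settings along with the weights. A hedged sketch of the consumer side (the repo id below is a hypothetical example of the app's default `{username}/{model}-torchao-{type}` naming, and it assumes the torchao-serialized weights load through the normal `from_pretrained` path):

```python
import torch
from transformers import AutoModel

# Hypothetical repo id following the app's default naming scheme.
repo_id = "your-username/opt-125m-torchao-int8_weight_only"

# The checkpoint was saved with safe_serialization=False, so this loads
# the .bin weights; the quantization config travels with the repo.
model = AutoModel.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map="auto")
```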
requirement.txt ADDED
@@ -0,0 +1,6 @@
+git+https://github.com/huggingface/transformers.git@main#egg=transformers
+accelerate
+torchao
+huggingface-hub
+https://gradio-builds.s3.amazonaws.com/4485dd46a8e4b3f5b35e42d52f291b72fdc1a952/gradio-4.39.0-py3-none-any.whl
+gradio-huggingfacehub-search
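
Because transformers is installed from `main` and gradio from a pinned build wheel, the versions the Space actually resolves can drift between rebuilds. A small optional sanity check (not part of the commit) that prints them at startup:

```python
# Optional sanity check: print the dependency versions the Space resolved.
from importlib.metadata import version

for pkg in ("transformers", "accelerate", "torchao", "huggingface_hub", "gradio"):
    print(f"{pkg}=={version(pkg)}")
```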