MekkCyber committed
Commit 9b71f2b · 1 Parent(s): 677834b

Add app file

Files changed (3):
  1. README.md +15 -6
  2. app.py +327 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -1,13 +1,22 @@
  ---
- title: TorchAO Quantization
- emoji: 🏃
- colorFrom: red
- colorTo: green
+ title: QuantizationTorchAODraft
+ emoji: 💻
+ colorFrom: blue
+ colorTo: red
  sdk: gradio
- sdk_version: 5.1.0
+ sdk_version: 5.0.1
  app_file: app.py
  pinned: false
- short_description: The Go To space to quantize your models using Torchao simply
+
+ hf_oauth: true
+ # optional, default duration is 8 hours/480 minutes. Max duration is 30 days/43200 minutes.
+ hf_oauth_expiration_minutes: 480
+ # optional, see "Scopes" below. "openid profile" is always included.
+ hf_oauth_scopes:
+ - read-repos
+ - write-repos
+ - manage-repos
+ - inference-api
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
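
The `hf_oauth` block above turns on "Sign in with Hugging Face" for the Space; Gradio then surfaces the signed-in user through `gr.LoginButton`, `gr.OAuthProfile`, and `gr.OAuthToken`, which is how app.py below consumes it. A minimal sketch of that pattern (the greeting text is illustrative, not the app's exact wording):

```python
import gradio as gr

def greet(profile: gr.OAuthProfile | None) -> str:
    # Gradio injects the signed-in user's profile based on the type hint;
    # profile is None when nobody is logged in.
    if profile is None:
        return "Please sign in."
    return f"Hello {profile.name}!"

with gr.Blocks() as demo:
    gr.LoginButton()  # renders the "Sign in with Hugging Face" button
    out = gr.Markdown()
    demo.load(greet, inputs=None, outputs=out)

demo.launch()
```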
app.py ADDED
@@ -0,0 +1,327 @@
+ import gradio as gr
+ import torch
+ from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+ import tempfile
+ from huggingface_hub import HfApi
+ from huggingface_hub import list_models
+ from packaging import version
+ import os
+
+ def hello(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
+     # ^ expect a gr.OAuthProfile object as input to get the user's profile
+     # if the user is not logged in, profile will be None
+     if profile is None:
+         return "Hello!"
+     return f"Hello {profile.name}!"
+
+ def check_model_exists(oauth_token: gr.OAuthToken | None, username, quantization_type, group_size, model_name, quantized_model_name):
+     """Check if a model exists in the user's Hugging Face repository."""
+     try:
+         models = list_models(author=username, token=oauth_token.token)
+         model_names = [model.id for model in models]
+         if quantized_model_name:
+             repo_name = f"{username}/{quantized_model_name}"
+         else:
+             if quantization_type == "int4_weight_only":
+                 repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-gs_{group_size}"
+             else:
+                 repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
+
+         if repo_name in model_names:
+             return f"Model '{repo_name}' already exists in your repository."
+         else:
+             return None  # Model does not exist
+     except Exception as e:
+         return f"Error checking model existence: {str(e)}"
+
+ def create_model_card(model_name, quantization_type, group_size):
+     # NOTE: the generated card's usage snippet points at the base model id;
+     # the quantized repo id is only computed later, in save_model.
+     model_card = f"""---
+ base_model:
+ - {model_name}
+ ---
+
+ # {model_name} (Quantized)
+
+ ## Description
+ This model is a quantized version of the original model `{model_name}`. It has been quantized using {quantization_type} quantization with torchao.
+
+ ## Quantization Details
+ - **Quantization Type**: {quantization_type}
+ - **Group Size**: {group_size if quantization_type == "int4_weight_only" else None}
+
+ ## Usage
+ You can use this model in your applications by loading it directly from the Hugging Face Hub:
+
+ ```python
+ from transformers import AutoModel
+
+ model = AutoModel.from_pretrained("{model_name}")
+ ```"""
+
+     return model_card
+
+ def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None):
+     print(f"Quantizing model: {quantization_type}")
+     if quantization_type == "int4_weight_only":
+         quantization_config = TorchAoConfig(quantization_type, group_size=group_size)
+     else:
+         quantization_config = TorchAoConfig(quantization_type)
+
+     # `token` replaces the deprecated `use_auth_token` argument
+     model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", torch_dtype=torch.bfloat16, quantization_config=quantization_config, token=auth_token.token)
+     return model
+
+ def save_model(model, model_name, quantization_type, group_size=128, username=None, auth_token=None, quantized_model_name=None):
+     print("Saving quantized model")
+     with tempfile.TemporaryDirectory() as tmpdirname:
+         model_card = create_model_card(model_name, quantization_type, group_size)
+         with open(os.path.join(tmpdirname, "README.md"), "w") as f:
+             f.write(model_card)
+
+         # Saving to a local temporary directory needs no token
+         model.save_pretrained(tmpdirname, safe_serialization=False)
+         if quantized_model_name:
+             repo_name = f"{username}/{quantized_model_name}"
+         else:
+             if quantization_type == "int4_weight_only":
+                 repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-gs_{group_size}"
+             else:
+                 repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
+
+         # Push to Hub with the signed-in user's OAuth token
+         api = HfApi(token=auth_token.token)
+         api.create_repo(repo_name, exist_ok=True)
+         api.upload_folder(
+             folder_path=tmpdirname,
+             repo_id=repo_name,
+             repo_type="model",
+         )
+
+     return f"https://huggingface.co/{repo_name}"
+
+ def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name):
+     if oauth_token is None or profile is None:
+         return "Error: Please sign in to your Hugging Face account to use the quantizer"
+     exists_message = check_model_exists(oauth_token, profile.username, quantization_type, group_size, model_name, quantized_model_name)
+     if exists_message:
+         return exists_message
+     quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username)
+     return save_model(quantized_model, model_name, quantization_type, group_size, profile.username, oauth_token, quantized_model_name)
+
+
+ with gr.Blocks(theme=gr.themes.Soft()) as app:
+     gr.Markdown(
+         """
+         # 🚀 Model Quantization App
+
+         Quantize your favorite Hugging Face models and save them to your profile!
+         """
+     )
+
+     gr.LoginButton(elem_id="login-button", elem_classes="center-button")
+
+     m1 = gr.Markdown()
+     app.load(hello, inputs=None, outputs=m1)
+
+     with gr.Row():
+         with gr.Column():
+             model_name = gr.Textbox(
+                 label="Model Name",
+                 placeholder="e.g., meta-llama/Meta-Llama-3-8B",
+                 value="meta-llama/Meta-Llama-3-8B"
+             )
+             quantization_type = gr.Dropdown(
+                 label="Quantization Type",
+                 choices=["int4_weight_only", "int8_weight_only", "int8_dynamic_activation_int8_weight"],
+                 value="int8_weight_only"
+             )
+             group_size = gr.Number(
+                 label="Group Size (only for int4_weight_only)",
+                 value=128,
+                 interactive=True
+             )
+             quantized_model_name = gr.Textbox(
+                 label="Quantized Model Name (optional: overrides the default name)",
+                 value="",
+                 interactive=True
+             )
+             # with gr.Row():
+             #     username = gr.Textbox(
+             #         label="Hugging Face Username",
+             #         placeholder="Enter your Hugging Face username",
+             #         value="",
+             #         interactive=True,
+             #         elem_id="username-box"
+             #     )
+         with gr.Column():
+             quantize_button = gr.Button("Quantize and Save Model", variant="primary")
+             output_link = gr.Textbox(label="Quantized Model Link")
+
+     gr.Markdown(
+         """
+         ## Instructions
+         1. Enter the name of the Hugging Face model you want to quantize.
+         2. Choose the quantization type.
+         3. Optionally, specify the group size (used only for int4_weight_only).
+         4. Click "Quantize and Save Model" to start the process.
+         5. Once complete, you'll receive a link to the quantized model on Hugging Face.
+
+         Note: This process may take some time, depending on the model size and your hardware.
+         """
+     )
+
+     # CSS styles for the username box and the centered login button.
+     # Both rules live in a single assignment; a second `app.css = ...`
+     # would overwrite the first.
+     app.css = """
+     #username-box {
+         background-color: #f0f8ff; /* Light color */
+         border-radius: 8px;
+         padding: 10px;
+     }
+     .center-button {
+         display: flex;
+         justify-content: center;
+         align-items: center;
+         margin: 0 auto; /* Center horizontally */
+     }
+     """
+
+     # The gr.OAuthProfile / gr.OAuthToken parameters of `quantize_and_save`
+     # are injected automatically by Gradio from their type hints, so only
+     # the form fields are listed as inputs here.
+     quantize_button.click(
+         fn=quantize_and_save,
+         inputs=[model_name, quantization_type, group_size, quantized_model_name],
+         outputs=[output_link]
+     )
+
+ # Launch the app
+ app.launch(share=True)
+
+
+ # The import below is only used by the commented-out draft kept for reference
+ from torchao.quantization import (
+     int4_weight_only,
+     int8_dynamic_activation_int8_weight,
+     int8_weight_only,
+ )
+
+ # import gradio as gr
+ # import torch
+ # from transformers import AutoModelForCausalLM, AutoTokenizer
+ # import torch.ao.quantization as quant
+ # import os
+ # from huggingface_hub import HfApi
+ # import tempfile
+ # import torch.utils.data as data
+ # from torchao.quantization import quantize_
+
+ # def load_calibration_dataset(tokenizer, num_samples=100):
+ #     # This is a placeholder. In a real scenario, you'd load actual data.
+ #     dummy_texts = ["This is a sample text" for _ in range(num_samples)]
+ #     encodings = tokenizer(dummy_texts, truncation=True, padding=True, return_tensors="pt")
+ #     dataset = data.TensorDataset(encodings['input_ids'], encodings['attention_mask'])
+ #     return data.DataLoader(dataset, batch_size=1)
+
+ # def load_model(model_name):
+ #     print(f"Loading model: {model_name}")
+ #     model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto")
+ #     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ #     return model, tokenizer
+
+ # def quantize_model(model, quant_type, dtype):
+ #     print(f"Quantizing model: {quant_type} - {dtype}")
+ #     quantize_(model, _STR_TO_METHOD[dtype](group_size=128))
+
+ # def save_model(model, model_name, quant_type, dtype):
+ #     print("Saving quantized model")
+ #     model.save_pretrained("medmekk/model_llama", safe_serialization=False)
+ #     with tempfile.TemporaryDirectory() as tmpdirname:
+ #         model.save_pretrained(tmpdirname)
+
+ #         # Create a new repo name
+ #         repo_name = f"{model_name.split('/')[-1]}-quantized-{quant_type.lower()}-{dtype}bit"
+
+ #         # Push to Hub
+ #         api = HfApi()
+ #         api.create_repo(repo_name, exist_ok=True)
+ #         api.upload_folder(
+ #             folder_path=tmpdirname,
+ #             repo_id=repo_name,
+ #             repo_type="model",
+ #         )
+
+ #     return f"https://huggingface.co/{repo_name}"
+
+ # _STR_TO_METHOD = {
+ #     "int4_weight_only": int4_weight_only,
+ #     "int8_weight_only": int8_weight_only,
+ #     "int8_dynamic_activation_int8_weight": int8_dynamic_activation_int8_weight,
+ # }
+
+ # def quantize_and_save(model_name, quant_type, dtype):
+ #     model, tokenizer = load_model(model_name)
+ #     quantize_model(model, quant_type, dtype)
+ #     print(model.device)
+ #     return save_model(model, model_name, quant_type, dtype)
+
+
+ # # Gradio interface
+ # with gr.Blocks(theme=gr.themes.Soft()) as app:
+ #     gr.Markdown(
+ #         """
+ #         # 🚀 Model Quantization App
+
+ #         Quantize your favorite Hugging Face models and save them to your profile!
+ #         """
+ #     )
+
+ #     with gr.Row():
+ #         with gr.Column():
+ #             model_name = gr.Textbox(
+ #                 label="Model Name",
+ #                 placeholder="e.g., gpt2, distilgpt2",
+ #                 value="meta-llama/Meta-Llama-3-8B-Instruct"
+ #             )
+ #             quant_type = gr.Dropdown(
+ #                 label="Quantization Type",
+ #                 choices=["Dynamic", "Static"],
+ #                 value="Dynamic"
+ #             )
+ #             dtype = gr.Dropdown(
+ #                 label="Data Type",
+ #                 choices=["int4_weight_only", "int8_weight_only", "int8_dynamic_activation_int8_weight"],
+ #                 value="int4_weight_only"
+ #             )
+
+ #         with gr.Column():
+ #             quantize_button = gr.Button("Quantize and Save Model", variant="primary")
+ #             output_link = gr.Textbox(label="Output", interactive=False)
+
+ #     gr.Markdown(
+ #         """
+ #         ## Instructions
+ #         1. Enter the name of the Hugging Face model you want to quantize.
+ #         2. Choose the quantization type.
+ #         3. If using Weight Only quantization, select the number of bits.
+ #         4. Click "Quantize and Save Model" to start the process.
+ #         5. Once complete, you'll receive a link to the quantized model on Hugging Face.
+
+ #         Note: This process may take some time depending on the model size and your hardware.
+ #         """
+ #     )
+
+ #     quantize_button.click(
+ #         fn=quantize_and_save,
+ #         inputs=[model_name, quant_type, dtype],
+ #         outputs=[output_link]
+ #     )
+
+ # # Launch the app
+ # app.launch(share=True)
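
A model pushed by this Space can be loaded back like any Hub model, as the generated model card suggests. A minimal sketch, assuming a hypothetical repo id that follows the app's `{username}/{model}-torchao-{quantization_type}` naming scheme:

```python
import torch
from transformers import AutoModelForCausalLM

# "your-username/..." is a placeholder; substitute a repo actually produced by the Space.
repo_id = "your-username/Meta-Llama-3-8B-torchao-int8_weight_only"

quantized_model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map="cpu",
    torch_dtype=torch.bfloat16,  # matches the dtype used at quantization time
)
```

Since the app saves with `safe_serialization=False`, the checkpoint is a pickle-based `pytorch_model.bin`, so only load repos you trust.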
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ git+https://github.com/huggingface/transformers.git@main#egg=transformers
+ accelerate
+ torchao
+ huggingface-hub
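
transformers is pinned to the main branch, presumably because `TorchAoConfig` support was too recent for the stable release at the time. A quick sanity check for the environment (a sketch; it only verifies the imports app.py relies on):

```python
# Verify that the installed stack exposes what app.py needs.
import torchao
from transformers import TorchAoConfig

config = TorchAoConfig("int8_weight_only")  # same quant type as the app's default
print("torchao", torchao.__version__)
print(config)
```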