Ffftdtd5dtft committed on
Commit
9f74a4f
·
verified ·
1 Parent(s): 1bb6979

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -63
app.py CHANGED
@@ -196,73 +196,34 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
196
  ```
197
  cd llama.cpp && LLAMA_CURL=1 make
198
  ```
199
- Step 3: Run inference through the main binary.
200
  ```
201
- ./llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
202
  ```
203
- """,
204
  )
205
- if split_model:
206
- split_upload_model(quantized_gguf_path, new_repo_id, oauth_token, split_max_tensors, split_max_size)
207
- else:
208
- api.upload_file(
209
- path_or_fileobj=quantized_gguf_path,
210
- path_in_repo=quantized_gguf_name,
211
- repo_id=new_repo_id,
212
- )
213
- card.push_to_hub(repo_id=new_repo_id, token=oauth_token.token)
214
- print("Quantized model uploaded and model card created successfully!")
215
- return f"Quantized model uploaded to: {new_repo_url}"
216
 
 
 
 
 
 
 
 
 
 
 
217
  except Exception as e:
218
- print(f"Error: {str(e)}")
219
- raise
 
 
 
 
 
 
220
 
221
- with gr.Blocks() as demo:
222
- hf_token_input = gr.Textbox(label="HF Token", type="password", value=HF_TOKEN, visible=False, interactive=False)
223
- hf_token = gr.oauth.OAuth(hf_token_input)
224
-
225
- model_id = HuggingfaceHubSearch(label="Select a model from HuggingFace Hub")
226
-
227
- quantization_method = gr.Dropdown(
228
- ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"], label="Select quantization method")
229
- imatrix_quantization_method = gr.Dropdown(
230
- ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"], label="Select imatrix quantization method", visible=False)
231
- use_imatrix_checkbox = gr.Checkbox(label="Use imatrix")
232
- private_repo_checkbox = gr.Checkbox(label="Create a private repo")
233
- train_data_upload = gr.File(label="Upload train data for imatrix (optional)", visible=False)
234
- split_model_checkbox = gr.Checkbox(label="Split model", visible=False)
235
- split_max_tensors = gr.Number(label="Split Max Tensors", visible=False)
236
- split_max_size = gr.Number(label="Split Max Size (MB)", visible=False)
237
-
238
- quantized_model_output = gr.Textbox(label="Output")
239
-
240
- use_imatrix_checkbox.change(fn=lambda x: [
241
- imatrix_quantization_method.update(visible=x),
242
- train_data_upload.update(visible=x),
243
- split_model_checkbox.update(visible=x),
244
- split_max_tensors.update(visible=x),
245
- split_max_size.update(visible=x)
246
- ], inputs=use_imatrix_checkbox, outputs=[imatrix_quantization_method, train_data_upload, split_model_checkbox, split_max_tensors, split_max_size])
247
-
248
- process_button = gr.Button(label="Quantize and Upload")
249
-
250
- process_button.click(
251
- process_model,
252
- inputs=[
253
- model_id,
254
- quantization_method,
255
- use_imatrix_checkbox,
256
- imatrix_quantization_method,
257
- private_repo_checkbox,
258
- train_data_upload,
259
- split_model_checkbox,
260
- split_max_tensors,
261
- split_max_size,
262
- hf_token
263
- ],
264
- outputs=[quantized_model_output],
265
- )
266
 
267
- if __name__ == "__main__":
268
- demo.launch()
 
196
  ```
197
  cd llama.cpp && LLAMA_CURL=1 make
198
  ```
199
+ Step 3: Fetch model weights from HF using curl command and use them with the above `llama_cli` or `llama_server`.
200
  ```
201
+ curl -L {new_repo_id} > .gguf/{quantized_gguf_name}
202
  ```
203
+ """
204
  )
205
+ if use_imatrix:
206
+ card.text += "\nNote: This model was quantized using imatrix."
 
 
 
 
 
 
 
 
 
207
 
208
+ card.push_to_hub(repo_id=new_repo_id, token=oauth_token.token)
209
+ api.upload_file(
210
+ path_or_fileobj=quantized_gguf_path,
211
+ path_in_repo=quantized_gguf_name,
212
+ repo_id=new_repo_id,
213
+ token=oauth_token.token,
214
+ )
215
+ print("Pushed model to the hub!")
216
+ if split_model:
217
+ split_upload_model(quantized_gguf_name, new_repo_id, oauth_token, split_max_tensors=split_max_tensors, split_max_size=split_max_size)
218
  except Exception as e:
219
+ print("Error in process_model:", e)
220
+ raise e
221
+ finally:
222
+ os.makedirs("model_cache", exist_ok=True)
223
+ shutil.move(model_name, f"model_cache/{model_name}")
224
+ shutil.move(fp16, f"model_cache/{fp16}")
225
+ shutil.move(quantized_gguf_path, f"model_cache/{quantized_gguf_path}")
226
+ print("Moved model files to model_cache.")
227
 
228
+ print("Process completed successfully!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229