Ffftdtd5dtft committed
Commit e188c9c · verified · 1 Parent(s): 9f74a4f

Update app.py

Files changed (1)
  1. app.py +60 -17
app.py CHANGED
@@ -18,7 +18,7 @@ def generate_importance_matrix(model_path, train_data_path):
 18   print(f"Current working directory: {os.getcwd()}")
 19   print(f"Files in the current directory: {os.listdir('.')}")
 20
 21 - if not os.path.isfile(f"../{model_path}"):
 22       raise Exception(f"Model file not found: {model_path}")
 23
 24   print("Running imatrix command...")
@@ -196,34 +196,77 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
196   ```
197   cd llama.cpp && LLAMA_CURL=1 make
198   ```
199 - Step 3: Fetch model weights from HF using curl command and use them with the above `llama_cli` or `llama_server`.
200   ```
201 - curl -L {new_repo_id} > .gguf/{quantized_gguf_name}
202   ```
203   """
204   )
205 - if use_imatrix:
206 -     card.text += "\nNote: This model was quantized using imatrix."
207 -
208   card.push_to_hub(repo_id=new_repo_id, token=oauth_token.token)
209   api.upload_file(
210       path_or_fileobj=quantized_gguf_path,
211       path_in_repo=quantized_gguf_name,
212       repo_id=new_repo_id,
213 -     token=oauth_token.token,
214   )
215 - print("Pushed model to the hub!")
216   if split_model:
217 -     split_upload_model(quantized_gguf_name, new_repo_id, oauth_token, split_max_tensors=split_max_tensors, split_max_size=split_max_size)
218   except Exception as e:
219 -     print("Error in process_model:", e)
220 -     raise e
221   finally:
222 -     os.makedirs("model_cache", exist_ok=True)
223 -     shutil.move(model_name, f"model_cache/{model_name}")
224 -     shutil.move(fp16, f"model_cache/{fp16}")
225 -     shutil.move(quantized_gguf_path, f"model_cache/{quantized_gguf_path}")
226 -     print("Moved model files to model_cache.")
227
228 - print("Process completed successfully!")
229
 18   print(f"Current working directory: {os.getcwd()}")
 19   print(f"Files in the current directory: {os.listdir('.')}")
 20
 21 + if not os.path.isfile(f"../{model_path}")):
 22       raise Exception(f"Model file not found: {model_path}")
 23
 24   print("Running imatrix command...")
 
196   ```
197   cd llama.cpp && LLAMA_CURL=1 make
198   ```
199 + Step 3: Fetch the model weights from HF with curl and run the model directly:
200   ```
201 + curl -L https://huggingface.co/{new_repo_id}/resolve/main/{quantized_gguf_name} -o ./models/{quantized_gguf_name}
202 + ./llama -m ./models/{quantized_gguf_name} -p "Hello, world!"
203   ```
204 +
205 + ## Additional Notes:
206 + For higher performance, follow llama.cpp's threading tips: keep the CPU fully utilized and set the thread count via `OMP_NUM_THREADS`.
207   """
208   )
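As an illustration of the Step 3 flow added above, a Python equivalent of the curl-and-run commands, assuming `huggingface_hub` is installed and that the llama.cpp CLI built in Step 2 is available as `./llama.cpp/llama-cli` (the binary name varies across llama.cpp versions; the repo and file names below are placeholders for `{new_repo_id}` and `{quantized_gguf_name}`):

```python
import subprocess
from huggingface_hub import hf_hub_download

# Placeholders standing in for {new_repo_id} and {quantized_gguf_name}.
new_repo_id = "your-username/your-model-GGUF"
quantized_gguf_name = "your-model-q4_0.gguf"

# Download the quantized GGUF from the Hub (equivalent to the curl command above).
local_path = hf_hub_download(repo_id=new_repo_id, filename=quantized_gguf_name)

# Run a quick prompt through the llama.cpp CLI (equivalent to ./llama -m ... -p ...).
subprocess.run(["./llama.cpp/llama-cli", "-m", local_path, "-p", "Hello, world!"], check=True)
```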
 
 
 
209   card.push_to_hub(repo_id=new_repo_id, token=oauth_token.token)
210   api.upload_file(
211       path_or_fileobj=quantized_gguf_path,
212       path_in_repo=quantized_gguf_name,
213       repo_id=new_repo_id,
214   )
215   if split_model:
216 +     split_upload_model(quantized_gguf_path, new_repo_id, oauth_token, split_max_tensors=split_max_tensors, split_max_size=split_max_size)
217 + else:
218 +     print("Model split skipped by user.")
219 +
220 + print("Model has been uploaded successfully!")
221   except Exception as e:
222 +     print(f"An error occurred: {str(e)}")
223 +     return False, str(e)
224   finally:
225 +     if os.path.exists(fp16):
226 +         os.remove(fp16)
227 +     if os.path.exists(quantized_gguf_path):
228 +         os.remove(quantized_gguf_path)
229 +     shutil.rmtree(model_name)
230 +     print(f"Removed temporary files for model {model_name}")
231 +
232 + return True, None
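With this change, `process_model` reports its outcome as a `(success, error)` tuple instead of re-raising. A minimal sketch of how a caller might consume that convention; every argument value below is a placeholder, listed in the same order as the click() inputs further down, and `oauth_token` stands for the OAuth token object collected from the UI:

```python
# Placeholder call: model_id, q_method, use_imatrix, imatrix_q_method, private_repo,
# train_data_file, split_model, split_max_tensors, split_max_size, oauth_token
ok, err = process_model("some-org/some-model", "q4_0", False, "q4_0",
                        False, None, False, 256, None, oauth_token)
if ok:
    print("Quantization and upload finished.")
else:
    print(f"Quantization failed: {err}")
```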
233 +
234 + def app_interface():
235 +     with gr.Blocks() as demo:
236 +         gr.Markdown("## GGUF Model Processing")
237 +
238 +         with gr.Row():
239 +             with gr.Column():
240 +                 repo_id = gr.Textbox(label="HuggingFace Repo ID")
241 +                 model_id = gr.Textbox(label="Model ID")
242 +                 q_method = gr.Dropdown(["q4_0", "q4_1", "q5_0", "q5_1", "q8_0"], label="Quantization Method")
243 +                 imatrix_q_method = gr.Dropdown(["q4_0", "q4_1", "q5_0", "q5_1", "q8_0"], label="Imatrix Quantization Method")
244 +                 use_imatrix = gr.Checkbox(label="Use Importance Matrix")
245 +                 private_repo = gr.Checkbox(label="Private Repo")
246 +                 train_data_file = gr.File(label="Training Data File (Optional)")
247 +                 split_model = gr.Checkbox(label="Split Model")
248 +                 split_max_tensors = gr.Number(label="Max Tensors per Shard", value=256)
249 +                 split_max_size = gr.Number(label="Max Shard Size (MB)", value=None)
250 +             with gr.Column():
251 +                 oauth_token = gr.oauth.HuggingFace(
252 +                     "Gradio OAuth Authentication",
253 +                     token=HF_TOKEN,
254 +                 )
255 +
256 +         process_btn = gr.Button("Process Model")
257 +         process_btn.click(
258 +             process_model,
259 +             [model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token],
260 +             outputs=["status_text"]
261 +         )
262 +
263 +     return demo
264 +
265 + if __name__ == "__main__":
266 +     scheduler = BackgroundScheduler(daemon=True)
267 +     scheduler.start()
268
269 +     demo = app_interface()
270 +     demo.launch()
271
272 + signal.signal(signal.SIGINT, signal.SIG_DFL)
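For reference, a minimal, self-contained sketch of the Blocks wiring pattern that `app_interface` above relies on, with an explicit `status_text` component as the click output. The handler and labels are placeholders, and `gr.LoginButton` is shown only as the stock way a Space obtains a Hugging Face OAuth token; it is an assumption, not what this commit calls:

```python
import gradio as gr

def process_stub(model_id: str) -> str:
    # Placeholder handler standing in for process_model.
    return f"Processed {model_id}"

with gr.Blocks() as demo:
    gr.LoginButton()  # typical HF OAuth entry point in a Space (assumption)
    model_id = gr.Textbox(label="Model ID")
    status_text = gr.Textbox(label="Status", interactive=False)
    process_btn = gr.Button("Process Model")
    # Outputs are component objects, so the handler's return value lands in status_text.
    process_btn.click(process_stub, inputs=[model_id], outputs=[status_text])

if __name__ == "__main__":
    demo.launch()
```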