Ffftdtd5dtft committed
Commit 36f5cda · verified · 1 Parent(s): 5110965

Update app.py

Files changed (1)
  1. app.py +73 -134
app.py CHANGED
@@ -2,6 +2,8 @@ import os
 import shutil
 import subprocess
 import signal
+import re
+
 os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
 import gradio as gr

@@ -9,6 +11,7 @@ from huggingface_hub import create_repo, HfApi
 from huggingface_hub import snapshot_download
 from huggingface_hub import whoami
 from huggingface_hub import ModelCard
+from huggingface_hub.utils import RepositoryNotFoundError

 from gradio_huggingfacehub_search import HuggingfaceHubSearch

@@ -20,59 +23,35 @@ HF_TOKEN = os.environ.get("HF_TOKEN")

 def generate_importance_matrix(model_path, train_data_path):
     imatrix_command = f"./llama-imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
-
     os.chdir("llama.cpp")
-
-    print(f"Current working directory: {os.getcwd()}")
-    print(f"Files in the current directory: {os.listdir('.')}")
-
     if not os.path.isfile(f"../{model_path}"):
         raise Exception(f"Model file not found: {model_path}")
-
-    print("Running imatrix command...")
     process = subprocess.Popen(imatrix_command, shell=True)
-
     try:
-        process.wait(timeout=60) # added wait
+        process.wait(timeout=60)
     except subprocess.TimeoutExpired:
-        print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
         process.send_signal(signal.SIGINT)
         try:
-            process.wait(timeout=5) # grace period
+            process.wait(timeout=5)
         except subprocess.TimeoutExpired:
-            print("Imatrix proc still didn't term. Forecfully terming process...")
             process.kill()
-
     os.chdir("..")

-    print("Importance matrix generation completed.")
-
-def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
+def split_upload_model(model_path, repo_id, oauth_token, split_max_tensors=256, split_max_size=None):
     if oauth_token.token is None:
         raise ValueError("You have to be logged in.")
-
     split_cmd = f"llama.cpp/llama-gguf-split --split --split-max-tensors {split_max_tensors}"
     if split_max_size:
         split_cmd += f" --split-max-size {split_max_size}"
     split_cmd += f" {model_path} {model_path.split('.')[0]}"
-
-    print(f"Split command: {split_cmd}")
-
     result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
-    print(f"Split command stdout: {result.stdout}")
-    print(f"Split command stderr: {result.stderr}")
-
     if result.returncode != 0:
         raise Exception(f"Error splitting the model: {result.stderr}")
-    print("Model split successfully!")
-
     sharded_model_files = [f for f in os.listdir('.') if f.startswith(model_path.split('.')[0])]
     if sharded_model_files:
-        print(f"Sharded model files: {sharded_model_files}")
         api = HfApi(token=oauth_token.token)
         for file in sharded_model_files:
             file_path = os.path.join('.', file)
-            print(f"Uploading file: {file_path}")
             try:
                 api.upload_file(
                     path_or_fileobj=file_path,
@@ -83,123 +62,97 @@ def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, s
                 raise Exception(f"Error uploading file {file_path}: {e}")
     else:
         raise Exception("No sharded files found.")
-
-    print("Sharded model has been uploaded successfully!")

-def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
+def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token):
     if oauth_token.token is None:
         raise ValueError("You must be logged in to use GGUF-my-repo")
     model_name = model_id.split('/')[-1]
-    fp16 = f"{model_name}.fp16.gguf"
-
     try:
         api = HfApi(token=oauth_token.token)
+        try:
+            # Attempt to download using the model ID directly
+            snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False)
+        except RepositoryNotFoundError:
+            # If the model ID is not found, search for it
+            print(f"Model ID not found directly. Searching for: {model_id}")
+            search_results = api.list_models(search=model_id, limit=1)
+            if search_results:
+                found_model_id = search_results[0].modelId
+                print(f"Found model ID: {found_model_id}")
+                snapshot_download(repo_id=found_model_id, local_dir=model_name, local_dir_use_symlinks=False)
+            else:
+                raise ValueError(f"Model not found: {model_id}")

-        dl_pattern = [
-            "*.safetensors", "*.bin", "*.pt", "*.onnx", "*.h5", "*.tflite",
-            "*.ckpt", "*.pb", "*.tar", "*.xml", "*.caffemodel",
-            "*.md", "*.json", "*.model"
-        ]
-
-        pattern = (
-            "*.safetensors"
-            if any(
-                file.path.endswith(".safetensors")
-                for file in api.list_repo_tree(
-                    repo_id=model_id,
-                    recursive=True,
-                )
-            )
-            else "*.bin"
-        )
-
-        dl_pattern += pattern
-
-        api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
-        print("Model downloaded successfully!")
-        print(f"Current working directory: {os.getcwd()}")
-        print(f"Model directory contents: {os.listdir(model_name)}")
+        # Find the model file
+        for filename in os.listdir(model_name):
+            if filename.endswith((".bin", ".pt", ".safetensors")):
+                model_file = os.path.join(model_name, filename)
+                break
+        else:
+            raise ValueError("No model file found in the downloaded files.")

+        # Convert to fp16
+        fp16 = f"{model_name}.fp16.gguf"
         conversion_script = "convert_hf_to_gguf.py"
-        fp16_conversion = f"python llama.cpp/{conversion_script} {model_name} --outtype f16 --outfile {fp16}"
+        fp16_conversion = f"python llama.cpp/{conversion_script} {model_file} --outtype f16 --outfile {fp16}"
         result = subprocess.run(fp16_conversion, shell=True, capture_output=True)
-        print(result)
         if result.returncode != 0:
             raise Exception(f"Error converting to fp16: {result.stderr}")
-        print("Model converted to fp16 successfully!")
-        print(f"Converted model path: {fp16}")

+        # Quantization
         imatrix_path = "llama.cpp/imatrix.dat"
-
         if use_imatrix:
             if train_data_file:
                 train_data_path = train_data_file.name
             else:
-                train_data_path = "groups_merged.txt" # fallback calibration dataset
-
-            print(f"Training data file path: {train_data_path}")
-
+                train_data_path = "groups_merged.txt"
             if not os.path.isfile(train_data_path):
                 raise Exception(f"Training data file not found: {train_data_path}")
-
             generate_importance_matrix(fp16, train_data_path)
-        else:
-            print("Not using imatrix quantization.")
-
+        quant_method = imatrix_q_method if use_imatrix else q_method
         username = whoami(oauth_token.token)["name"]
-        quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
+        quantized_gguf_name = f"{model_name.lower()}-{quant_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{quant_method.lower()}.gguf"
         quantized_gguf_path = quantized_gguf_name
-
-        if use_imatrix:
-            quantise_ggml = f"./llama.cpp/llama-quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
-        else:
-            quantise_ggml = f"./llama.cpp/llama-quantize {fp16} {quantized_gguf_path} {q_method}"
-
+        quantise_ggml = f"./llama.cpp/llama-quantize {'--imatrix' if use_imatrix else ''} {imatrix_path if use_imatrix else ''} {fp16} {quantized_gguf_path} {quant_method}"
         result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
         if result.returncode != 0:
             raise Exception(f"Error quantizing: {result.stderr}")
-        print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
-        print(f"Quantized model path: {quantized_gguf_path}")

-        # Create empty repo
-        new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
+        # Repo creation and upload
+        new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{quant_method}-GGUF", exist_ok=True, private=private_repo)
         new_repo_id = new_repo_url.repo_id
-        print("Repo created successfully!", new_repo_url)
-
         try:
             card = ModelCard.load(model_id, token=oauth_token.token)
-        except:
+        except Exception:
             card = ModelCard("")
         if card.data.tags is None:
             card.data.tags = []
-        card.data.tags.append("llama-cpp")
-        card.data.tags.append("gguf-my-repo")
+        card.data.tags.extend(["llama-cpp", "gguf-my-repo"])
         card.data.base_model = model_id
         card.text = dedent(
             f"""
 # {new_repo_id}
 This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
 Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
-
+
 ## Use with llama.cpp
 Install llama.cpp through brew (works on Mac and Linux)
-
+
 ```bash
 brew install llama.cpp
-
 ```
 Invoke the llama.cpp server or the CLI.
-
+
 ### CLI:
 ```bash
 llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
 ```
-
+
 ### Server:
 ```bash
 llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
 ```
-
+
 Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
 Step 1: Clone llama.cpp from GitHub.
 ```
@@ -224,35 +177,22 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
         if split_model:
             split_upload_model(quantized_gguf_path, new_repo_id, oauth_token, split_max_tensors, split_max_size)
         else:
-            try:
-                print(f"Uploading quantized model: {quantized_gguf_path}")
-                api.upload_file(
-                    path_or_fileobj=quantized_gguf_path,
-                    path_in_repo=quantized_gguf_name,
-                    repo_id=new_repo_id,
-                )
-            except Exception as e:
-                raise Exception(f"Error uploading quantized model: {e}")
-
-        imatrix_path = "llama.cpp/imatrix.dat"
+            api.upload_file(
+                path_or_fileobj=quantized_gguf_path,
+                path_in_repo=quantized_gguf_name,
+                repo_id=new_repo_id,
+            )
         if os.path.isfile(imatrix_path):
-            try:
-                print(f"Uploading imatrix.dat: {imatrix_path}")
-                api.upload_file(
-                    path_or_fileobj=imatrix_path,
-                    path_in_repo="imatrix.dat",
-                    repo_id=new_repo_id,
-                )
-            except Exception as e:
-                raise Exception(f"Error uploading imatrix.dat: {e}")
-
+            api.upload_file(
+                path_or_fileobj=imatrix_path,
+                path_in_repo="imatrix.dat",
+                repo_id=new_repo_id,
+            )
         api.upload_file(
             path_or_fileobj=f"README.md",
             path_in_repo=f"README.md",
             repo_id=new_repo_id,
         )
-        print(f"Uploaded successfully with {imatrix_q_method if use_imatrix else q_method} option!")
-
         return (
             f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>',
             "llama.png",
@@ -261,13 +201,12 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
         return (f"Error: {e}", "error.png")
     finally:
         shutil.rmtree(model_name, ignore_errors=True)
-        print("Folder cleaned up successfully!")

-css="""/* Custom CSS to allow scrolling */
+css = """/* Custom CSS to allow scrolling */
 .gradio-container {overflow-y: auto;}
 """
-# Create Gradio interface
-with gr.Blocks(css=css) as demo:
+
+with gr.Blocks(css=css) as demo:
     gr.Markdown("You must be logged in to use GGUF-my-repo.")
     gr.LoginButton(min_width=250)

@@ -290,7 +229,7 @@ with gr.Blocks(css=css) as demo:
         ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
         label="Imatrix Quantization Method",
         info="GGML imatrix quants type",
-        value="IQ4_NL",
+        value="IQ4_NL",
         filterable=False,
         visible=False
     )
@@ -332,15 +271,25 @@ with gr.Blocks(css=css) as demo:
         visible=False
     )

-    def update_visibility(use_imatrix):
-        return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)
-
     use_imatrix.change(
-        fn=update_visibility,
+        fn=lambda use_imatrix: {
+            q_method: gr.update(visible=not use_imatrix),
+            imatrix_q_method: gr.update(visible=use_imatrix),
+            train_data_file: gr.update(visible=use_imatrix),
+        },
         inputs=use_imatrix,
         outputs=[q_method, imatrix_q_method, train_data_file]
     )

+    split_model.change(
+        fn=lambda split_model: {
+            split_max_tensors: gr.update(visible=split_model),
+            split_max_size: gr.update(visible=split_model),
+        },
+        inputs=split_model,
+        outputs=[split_max_tensors, split_max_size]
+    )
+
     iface = gr.Interface(
         fn=process_model,
         inputs=[
@@ -363,15 +312,6 @@ with gr.Blocks(css=css) as demo:
         api_name=False
     )

-    def update_split_visibility(split_model):
-        return gr.update(visible=split_model), gr.update(visible=split_model)
-
-    split_model.change(
-        fn=update_split_visibility,
-        inputs=split_model,
-        outputs=[split_max_tensors, split_max_size]
-    )
-
 def restart_space():
     HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)

@@ -379,5 +319,4 @@ scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=21600)
 scheduler.start()

-# Launch the interface
-demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)
+demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)
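
For orientation, the main functional addition in this commit is the repo-resolution fallback at the top of `process_model`: the model ID is tried as-is, and only if that repo does not exist does the Space fall back to a Hub search. The snippet below is a minimal, self-contained sketch of that flow using the `huggingface_hub` calls the new code already imports (`snapshot_download`, `HfApi.list_models`, `RepositoryNotFoundError`); the helper name, token handling, and return value are illustrative and not the literal diff code.

```python
# Sketch only: mirrors the new fallback logic in process_model, not a verbatim copy.
from huggingface_hub import HfApi, snapshot_download
from huggingface_hub.utils import RepositoryNotFoundError


def resolve_and_download(model_id: str, local_dir: str, token: str | None = None) -> str:
    """Try the given repo ID first; if it does not exist, fall back to a Hub search."""
    api = HfApi(token=token)
    try:
        # Happy path: the ID names an existing repo.
        snapshot_download(repo_id=model_id, local_dir=local_dir, token=token)
        return model_id
    except RepositoryNotFoundError:
        # Fallback: take the first search hit for the given string, if any.
        hit = next(iter(api.list_models(search=model_id, limit=1)), None)
        if hit is None:
            raise ValueError(f"Model not found: {model_id}")
        snapshot_download(repo_id=hit.id, local_dir=local_dir, token=token)
        return hit.id
```

As in the diff, the first search hit is taken as-is, so passing an exact repo ID remains the more predictable input.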