Ffftdtd5dtft committed
Commit 7c74789 · verified · 1 Parent(s): 6617ac5

Update app.py

Files changed (1):
  app.py +73 -54

app.py CHANGED
@@ -4,53 +4,70 @@ import subprocess
  import signal
  os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
  import gradio as gr
  from huggingface_hub import create_repo, HfApi
  from huggingface_hub import snapshot_download
  from huggingface_hub import whoami
  from huggingface_hub import ModelCard
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
  from apscheduler.schedulers.background import BackgroundScheduler
  from textwrap import dedent

  HF_TOKEN = os.environ.get("HF_TOKEN")

  def generate_importance_matrix(model_path, train_data_path):
      imatrix_command = f"./llama-imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
      os.chdir("llama.cpp")
      print(f"Current working directory: {os.getcwd()}")
      print(f"Files in the current directory: {os.listdir('.')}")
      if not os.path.isfile(f"../{model_path}"):
          raise Exception(f"Model file not found: {model_path}")
      print("Running imatrix command...")
      process = subprocess.Popen(imatrix_command, shell=True)
      try:
-         process.wait(timeout=60)
      except subprocess.TimeoutExpired:
          print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
          process.send_signal(signal.SIGINT)
          try:
-             process.wait(timeout=5)
          except subprocess.TimeoutExpired:
              print("Imatrix proc still didn't term. Forecfully terming process...")
              process.kill()
      os.chdir("..")
      print("Importance matrix generation completed.")

  def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
      if oauth_token.token is None:
          raise ValueError("You have to be logged in.")
      split_cmd = f"llama.cpp/llama-gguf-split --split --split-max-tensors {split_max_tensors}"
      if split_max_size:
          split_cmd += f" --split-max-size {split_max_size}"
-     split_cmd += f" {model_path} {os.path.splitext(model_path)[0]}"
-     print(f"Split command: {split_cmd}")
      result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
-     print(f"Split command stdout: {result.stdout}")
-     print(f"Split command stderr: {result.stderr}")
      if result.returncode != 0:
          raise Exception(f"Error splitting the model: {result.stderr}")
      print("Model split successfully!")
-     sharded_model_files = [f for f in os.listdir('.') if f.startswith(os.path.splitext(model_path)[0])]
      if sharded_model_files:
          print(f"Sharded model files: {sharded_model_files}")
          api = HfApi(token=oauth_token.token)
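The timeout, then SIGINT, then kill sequence in `generate_importance_matrix` is a general pattern for stopping a long-running subprocess; a minimal standalone sketch of that pattern (the command and timeout values here are placeholders, not the Space's actual settings):

```python
import signal
import subprocess

def run_with_graceful_timeout(cmd: str, soft_timeout: int = 60, grace: int = 5) -> int:
    """Run `cmd`, wait up to `soft_timeout` seconds, then escalate: SIGINT, then kill."""
    process = subprocess.Popen(cmd, shell=True)
    try:
        process.wait(timeout=soft_timeout)   # normal completion path
    except subprocess.TimeoutExpired:
        process.send_signal(signal.SIGINT)   # ask the process to stop cleanly
        try:
            process.wait(timeout=grace)      # short grace period after SIGINT
        except subprocess.TimeoutExpired:
            process.kill()                   # last resort: force-kill
            process.wait()
    return process.returncode

# Example with a hypothetical command:
# rc = run_with_graceful_timeout("./llama-imatrix -m model.gguf -f data.txt", soft_timeout=60)
```

Sending SIGINT before the hard kill gives the tool a chance to flush partial output to disk.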
@@ -67,45 +84,47 @@ def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, s
                  raise Exception(f"Error uploading file {file_path}: {e}")
      else:
          raise Exception("No sharded files found.")
      print("Sharded model has been uploaded successfully!")

  def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
      if oauth_token.token is None:
          raise ValueError("You must be logged in to use GGUF-my-repo")
      model_name = model_id.split('/')[-1]
-     model_file = None

      try:
          api = HfApi(token=oauth_token.token)

-         # Download only necessary files based on model format
-         dl_pattern = ["*.md", "*.json", "*.safetensors", "*.bin", "*.pkl", "*.ckp", "*.pth", "*.gguf"]
          api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
          print("Model downloaded successfully!")
          print(f"Current working directory: {os.getcwd()}")
          print(f"Model directory contents: {os.listdir(model_name)}")

-         # Find downloaded model file
-         supported_extensions = (".bin", ".safetensors", ".pkl", ".ckp", ".pth", ".gguf")
-         for filename in os.listdir(model_name):
-             if any(filename.endswith(ext) for ext in supported_extensions):
-                 model_file = os.path.join(model_name, filename)
-                 break
-
-         if model_file is None:
-             raise FileNotFoundError(f"No supported model file found in the downloaded files. Supported formats: {', '.join(supported_extensions)}")
-
-         # If the model is not already in GGUF format, convert it
-         if not model_file.endswith(".gguf"):
-             gguf_model_file = f"{os.path.splitext(model_file)[0]}.gguf"
-             conversion_command = f"python llama.cpp/convert_hf_to_gguf.py {model_file} --outfile {gguf_model_file}"
-             result = subprocess.run(conversion_command, shell=True, capture_output=True)
-             if result.returncode != 0:
-                 raise Exception(f"Error converting to GGUF: {result.stderr}")
-             print("Model converted to GGUF successfully!")
-             print(f"Converted model path: {gguf_model_file}")
-         else:
-             gguf_model_file = model_file  # If already GGUF, use the original file

          imatrix_path = "llama.cpp/imatrix.dat"

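The middle of this hunk (elided by the diff viewer) uploads each shard with `HfApi.upload_file`; a rough sketch of what such a loop can look like, with hypothetical file names, repo id, and helper name:

```python
import os
from huggingface_hub import HfApi

def upload_shards(shard_files: list[str], repo_id: str, token: str) -> None:
    """Upload each sharded GGUF file into the target model repo."""
    api = HfApi(token=token)
    for file_name in shard_files:
        try:
            api.upload_file(
                path_or_fileobj=file_name,                  # local shard on disk
                path_in_repo=os.path.basename(file_name),   # keep the same name in the repo
                repo_id=repo_id,
            )
        except Exception as e:
            raise Exception(f"Error uploading file {file_name}: {e}")

# upload_shards(["model-00001-of-00002.gguf", "model-00002-of-00002.gguf"],
#               "username/model-Q4_K_M-GGUF", token="hf_...")
```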
@@ -120,27 +139,23 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
              if not os.path.isfile(train_data_path):
                  raise Exception(f"Training data file not found: {train_data_path}")

-             generate_importance_matrix(gguf_model_file, train_data_path)
          else:
              print("Not using imatrix quantization.")
-
          username = whoami(oauth_token.token)["name"]
          quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
          quantized_gguf_path = quantized_gguf_name
-
-         os.chdir("llama.cpp")
          if use_imatrix:
-             quantise_ggml = f"./llama-quantize --imatrix {imatrix_path} ../{gguf_model_file} ../{quantized_gguf_path} {imatrix_q_method}"
          else:
-             quantise_ggml = f"./llama-quantize ../{gguf_model_file} ../{quantized_gguf_path} {q_method}"
          result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
-         os.chdir("..")
-
          if result.returncode != 0:
              raise Exception(f"Error quantizing: {result.stderr}")
          print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
          print(f"Quantized model path: {quantized_gguf_path}")

          new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
          new_repo_id = new_repo_url.repo_id
          print("Repo created successfully!", new_repo_url)
@@ -159,26 +174,26 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
  # {new_repo_id}
  This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
  Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
-
  ## Use with llama.cpp
  Install llama.cpp through brew (works on Mac and Linux)
-
  ```bash
  brew install llama.cpp
-
  ```
  Invoke the llama.cpp server or the CLI.
-
  ### CLI:
  ```bash
  llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
  ```
-
  ### Server:
  ```bash
  llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
  ```
-
  Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
  Step 1: Clone llama.cpp from GitHub.
  ```
@@ -192,7 +207,7 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
  ```
  ./llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
  ```
- or
  ```
  ./llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
  ```
@@ -212,8 +227,10 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
              )
          except Exception as e:
              raise Exception(f"Error uploading quantized model: {e}")
-
-         if use_imatrix and os.path.isfile(imatrix_path):
              try:
                  print(f"Uploading imatrix.dat: {imatrix_path}")
                  api.upload_file(
@@ -244,7 +261,8 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
  css="""/* Custom CSS to allow scrolling */
  .gradio-container {overflow-y: auto;}
  """
- with gr.Blocks(css=css) as demo:
      gr.Markdown("You must be logged in to use GGUF-my-repo.")
      gr.LoginButton(min_width=250)

@@ -267,7 +285,7 @@ with gr.Blocks(css=css) as demo:
              ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
              label="Imatrix Quantization Method",
              info="GGML imatrix quants type",
-             value="IQ4_NL",
              filterable=False,
              visible=False
          )
@@ -311,7 +329,7 @@ with gr.Blocks(css=css) as demo:

      def update_visibility(use_imatrix):
          return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)
-
      use_imatrix.change(
          fn=update_visibility,
          inputs=use_imatrix,
@@ -335,7 +353,7 @@ with gr.Blocks(css=css) as demo:
              gr.Markdown(label="output"),
              gr.Image(show_label=False),
          ],
-         title="Create your own GGUF Quants, blazingly fast !",
          description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
          api_name=False
      )
@@ -350,10 +368,11 @@ with gr.Blocks(css=css) as demo:
      )

  def restart_space():
-     HfApi().restart_space(repo_id="Ffftdtd5dtft/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)

  scheduler = BackgroundScheduler()
  scheduler.add_job(restart_space, "interval", seconds=21600)
  scheduler.start()

- demo.queue(default_concurrency_limit=2, max_size=5).launch(debug=True, show_api=False)

  import signal
  os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
  import gradio as gr
+
  from huggingface_hub import create_repo, HfApi
  from huggingface_hub import snapshot_download
  from huggingface_hub import whoami
  from huggingface_hub import ModelCard
+
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
+
  from apscheduler.schedulers.background import BackgroundScheduler
+
  from textwrap import dedent

  HF_TOKEN = os.environ.get("HF_TOKEN")

  def generate_importance_matrix(model_path, train_data_path):
      imatrix_command = f"./llama-imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
+
      os.chdir("llama.cpp")
+
      print(f"Current working directory: {os.getcwd()}")
      print(f"Files in the current directory: {os.listdir('.')}")
+
      if not os.path.isfile(f"../{model_path}"):
          raise Exception(f"Model file not found: {model_path}")
+
      print("Running imatrix command...")
      process = subprocess.Popen(imatrix_command, shell=True)
+
      try:
+         process.wait(timeout=60)  # added wait
      except subprocess.TimeoutExpired:
          print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
          process.send_signal(signal.SIGINT)
          try:
+             process.wait(timeout=5)  # grace period
          except subprocess.TimeoutExpired:
              print("Imatrix proc still didn't term. Forecfully terming process...")
              process.kill()
+
      os.chdir("..")
+
      print("Importance matrix generation completed.")

  def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
      if oauth_token.token is None:
          raise ValueError("You have to be logged in.")
+
      split_cmd = f"llama.cpp/llama-gguf-split --split --split-max-tensors {split_max_tensors}"
      if split_max_size:
          split_cmd += f" --split-max-size {split_max_size}"
+     split_cmd += f" {model_path} {model_path.split('.')[0]}"
+
+     print(f"Split command: {split_cmd}")
+
      result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
+     print(f"Split command stdout: {result.stdout}")
+     print(f"Split command stderr: {result.stderr}")
+
      if result.returncode != 0:
          raise Exception(f"Error splitting the model: {result.stderr}")
      print("Model split successfully!")
+
+
+     sharded_model_files = [f for f in os.listdir('.') if f.startswith(model_path.split('.')[0])]
      if sharded_model_files:
          print(f"Sharded model files: {sharded_model_files}")
          api = HfApi(token=oauth_token.token)
 
                  raise Exception(f"Error uploading file {file_path}: {e}")
      else:
          raise Exception("No sharded files found.")
+
      print("Sharded model has been uploaded successfully!")

  def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
      if oauth_token.token is None:
          raise ValueError("You must be logged in to use GGUF-my-repo")
      model_name = model_id.split('/')[-1]
+     fp16 = f"{model_name}.fp16.gguf"

      try:
          api = HfApi(token=oauth_token.token)

+         dl_pattern = ["*.md", "*.json", "*.model"]
+
+         pattern = (
+             "*.safetensors"
+             if any(
+                 file.path.endswith(".safetensors")
+                 for file in api.list_repo_tree(
+                     repo_id=model_id,
+                     recursive=True,
+                 )
+             )
+             else "*.bin"
+         )
+
+         dl_pattern += pattern
+
          api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
          print("Model downloaded successfully!")
          print(f"Current working directory: {os.getcwd()}")
          print(f"Model directory contents: {os.listdir(model_name)}")

+         conversion_script = "convert_hf_to_gguf.py"
+         fp16_conversion = f"python llama.cpp/{conversion_script} {model_name} --outtype f16 --outfile {fp16}"
+         result = subprocess.run(fp16_conversion, shell=True, capture_output=True)
+         print(result)
+         if result.returncode != 0:
+             raise Exception(f"Error converting to fp16: {result.stderr}")
+         print("Model converted to fp16 successfully!")
+         print(f"Converted model path: {fp16}")

          imatrix_path = "llama.cpp/imatrix.dat"

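The added download logic inspects the repo tree and only pulls the weight format that is actually present; a compact sketch of the same idea, with a hypothetical repo id and helper name (here the chosen pattern is appended as a single list item):

```python
from huggingface_hub import HfApi, snapshot_download

def download_weights(model_id: str, local_dir: str, token: str | None = None) -> str:
    """Download config/tokenizer files plus whichever weight format the repo actually has."""
    api = HfApi(token=token)
    has_safetensors = any(
        entry.path.endswith(".safetensors")
        for entry in api.list_repo_tree(repo_id=model_id, recursive=True)
    )
    patterns = ["*.md", "*.json", "*.model"]
    patterns.append("*.safetensors" if has_safetensors else "*.bin")
    return snapshot_download(
        repo_id=model_id,
        local_dir=local_dir,
        allow_patterns=patterns,  # skip the weight format the repo does not use
        token=token,
    )

# download_weights("some-org/some-model", "some-model")  # hypothetical repo id
```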
 
              if not os.path.isfile(train_data_path):
                  raise Exception(f"Training data file not found: {train_data_path}")

+             generate_importance_matrix(fp16, train_data_path)
          else:
              print("Not using imatrix quantization.")
          username = whoami(oauth_token.token)["name"]
          quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
          quantized_gguf_path = quantized_gguf_name
          if use_imatrix:
+             quantise_ggml = f"./llama.cpp/llama-quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
          else:
+             quantise_ggml = f"./llama.cpp/llama-quantize {fp16} {quantized_gguf_path} {q_method}"
          result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
          if result.returncode != 0:
              raise Exception(f"Error quantizing: {result.stderr}")
          print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
          print(f"Quantized model path: {quantized_gguf_path}")

+         # Create empty repo
          new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
          new_repo_id = new_repo_url.repo_id
          print("Repo created successfully!", new_repo_url)
 
  # {new_repo_id}
  This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
  Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
+
  ## Use with llama.cpp
  Install llama.cpp through brew (works on Mac and Linux)
+
  ```bash
  brew install llama.cpp
+
  ```
  Invoke the llama.cpp server or the CLI.
+
  ### CLI:
  ```bash
  llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
  ```
+
  ### Server:
  ```bash
  llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
  ```
+
  Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
  Step 1: Clone llama.cpp from GitHub.
  ```

  ```
  ./llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
  ```
+ or
  ```
  ./llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
  ```
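The README template in this hunk is presumably rendered through the `ModelCard` and `dedent` imports at the top of the file; a rough sketch of how such a card can be assembled and saved (content abbreviated, helper and names hypothetical):

```python
from textwrap import dedent
from huggingface_hub import ModelCard

def build_card(model_id: str, new_repo_id: str) -> ModelCard:
    """Start from the original model card (if any) and replace the body with GGUF usage notes."""
    try:
        card = ModelCard.load(model_id)      # reuse original metadata when available
    except Exception:
        card = ModelCard(content="")         # fall back to an empty card
    card.data.tags = (card.data.tags or []) + ["llama-cpp", "gguf-my-repo"]
    card.text = dedent(f"""
        # {new_repo_id}
        This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id})
        with llama.cpp. Refer to the original model card for details.
        """)
    return card

# build_card("org/model", "user/model-Q4_K_M-GGUF").save("README.md")
```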
 
              )
          except Exception as e:
              raise Exception(f"Error uploading quantized model: {e}")
+
+
+         imatrix_path = "llama.cpp/imatrix.dat"
+         if os.path.isfile(imatrix_path):
              try:
                  print(f"Uploading imatrix.dat: {imatrix_path}")
                  api.upload_file(
 
  css="""/* Custom CSS to allow scrolling */
  .gradio-container {overflow-y: auto;}
  """
+ # Create Gradio interface
+ with gr.Blocks(css=css) as demo:
      gr.Markdown("You must be logged in to use GGUF-my-repo.")
      gr.LoginButton(min_width=250)

 
              ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
              label="Imatrix Quantization Method",
              info="GGML imatrix quants type",
+             value="IQ4_NL",
              filterable=False,
              visible=False
          )
 

      def update_visibility(use_imatrix):
          return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)
+
      use_imatrix.change(
          fn=update_visibility,
          inputs=use_imatrix,
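The `use_imatrix.change(...)` wiring follows the usual Gradio pattern of returning `gr.update(visible=...)` objects from the callback; a self-contained sketch, with component names and choices that are illustrative rather than the Space's exact UI:

```python
import gradio as gr

with gr.Blocks() as toggle_demo:
    use_imatrix = gr.Checkbox(label="Use imatrix quantization", value=False)
    q_method = gr.Dropdown(["Q4_K_M", "Q5_K_M"], label="Quantization Method", visible=True)
    imatrix_q_method = gr.Dropdown(["IQ4_NL", "IQ4_XS"], label="Imatrix Method", visible=False)

    def update_visibility(use_imatrix):
        # Show exactly one of the two dropdowns, depending on the checkbox state.
        return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix)

    use_imatrix.change(
        fn=update_visibility,
        inputs=use_imatrix,
        outputs=[q_method, imatrix_q_method],
    )

# toggle_demo.launch()
```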
 
              gr.Markdown(label="output"),
              gr.Image(show_label=False),
          ],
+         title="Create your own GGUF Quants, blazingly fast ⚡!",
          description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
          api_name=False
      )
 
      )

  def restart_space():
+     HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)

  scheduler = BackgroundScheduler()
  scheduler.add_job(restart_space, "interval", seconds=21600)
  scheduler.start()

+ # Launch the interface
+ demo.queue(default_concurrency_limit=999, max_size=5).launch(debug=True, show_api=False)
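The scheduled restart at the end of the file is the standard APScheduler interval-job pattern; a minimal sketch of it in isolation, where the repo id and token are placeholders and the atexit hook is an addition for clean shutdown:

```python
import atexit
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

def restart_space() -> None:
    # factory_reboot rebuilds the Space image instead of only restarting the container
    HfApi().restart_space(repo_id="user/some-space", token="hf_...", factory_reboot=True)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=6 * 60 * 60)  # run every 6 hours
scheduler.start()
atexit.register(scheduler.shutdown)  # stop the scheduler thread cleanly on exit
```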