reach-vb (HF staff) and SixOpen committed
Commit 3ad22ce
1 Parent(s): 5b0a252

Split/shard support (#65)

- Split/shard support (78ee58d5a3e5dc560e44a5aedc8f2a1ff9d61610)


Co-authored-by: E <[email protected]>
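
For reviewers who want to try the sharding step outside the Space: the new `split_upload_model` helper shells out to llama.cpp's `gguf-split` binary and then uploads every produced shard with `HfApi.upload_file`. Below is a minimal standalone sketch of that flow, not the Space's code; it assumes llama.cpp is checked out and built in `./llama.cpp`, and the model file name and repo id are illustrative placeholders.

```python
# Minimal sketch (not the Space's code): split a local GGUF with llama.cpp's
# gguf-split, then upload each resulting shard to an existing Hub repo.
import os
import subprocess

from huggingface_hub import HfApi

model_path = "my-model-q4_k_m.gguf"        # hypothetical quantized model file
out_prefix = model_path.split(".")[0]      # same output-prefix rule as the diff
repo_id = "username/my-model-Q4_K_M-GGUF"  # hypothetical target repo

# Same flags the new helper builds: cap tensors per shard (size cap is optional).
split_cmd = f"llama.cpp/gguf-split --split --split-max-tensors 256 {model_path} {out_prefix}"
result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
if result.returncode != 0:
    raise RuntimeError(f"gguf-split failed: {result.stderr}")

# Upload every file that starts with the shard prefix, as the helper does.
api = HfApi()  # expects a token from HF_TOKEN or a prior `huggingface-cli login`
for shard in sorted(f for f in os.listdir(".") if f.startswith(out_prefix)):
    api.upload_file(path_or_fileobj=shard, path_in_repo=shard, repo_id=repo_id)
```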

Files changed (1)
  1. app.py +143 -61
app.py CHANGED
@@ -28,11 +28,51 @@ def script_to_use(model_id, api):
     arch = arch[0]
     return "convert.py" if arch in LLAMA_LIKE_ARCHS else "convert-hf-to-gguf.py"
 
-def process_model(model_id, q_method, private_repo, oauth_token: gr.OAuthToken | None):
+def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
+    if oauth_token.token is None:
+        raise ValueError("You have to be logged in.")
+
+    split_cmd = f"llama.cpp/gguf-split --split --split-max-tensors {split_max_tensors}"
+    if split_max_size:
+        split_cmd += f" --split-max-size {split_max_size}"
+    split_cmd += f" {model_path} {model_path.split('.')[0]}"
+
+    print(f"Split command: {split_cmd}")
+
+    result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
+    print(f"Split command stdout: {result.stdout}")
+    print(f"Split command stderr: {result.stderr}")
+
+    if result.returncode != 0:
+        raise Exception(f"Error splitting the model: {result.stderr}")
+    print("Model split successfully!")
+
+
+    sharded_model_files = [f for f in os.listdir('.') if f.startswith(model_path.split('.')[0])]
+    if sharded_model_files:
+        print(f"Sharded model files: {sharded_model_files}")
+        api = HfApi(token=oauth_token.token)
+        for file in sharded_model_files:
+            file_path = os.path.join('.', file)
+            print(f"Uploading file: {file_path}")
+            try:
+                api.upload_file(
+                    path_or_fileobj=file_path,
+                    path_in_repo=file,
+                    repo_id=repo_id,
+                )
+            except Exception as e:
+                raise Exception(f"Error uploading file {file_path}: {e}")
+    else:
+        raise Exception("No sharded files found.")
+
+    print("Sharded model has been uploaded successfully!")
+
+def process_model(model_id, q_method, private_repo, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
     if oauth_token.token is None:
         raise ValueError("You must be logged in to use GGUF-my-repo")
     model_name = model_id.split('/')[-1]
-    fp16 = f"{model_name}/{model_name.lower()}.fp16.bin"
+    fp16 = f"{model_name}.fp16.gguf"
 
     try:
         api = HfApi(token=oauth_token.token)
@@ -54,7 +94,9 @@ def process_model(model_id, q_method, private_repo, oauth_token: gr.OAuthToken |
         dl_pattern += pattern
 
         api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
-        print("Model downloaded successully!")
+        print("Model downloaded successfully!")
+        print(f"Current working directory: {os.getcwd()}")
+        print(f"Model directory contents: {os.listdir(model_name)}")
 
         conversion_script = script_to_use(model_id, api)
         fp16_conversion = f"python llama.cpp/{conversion_script} {model_name} --outtype f16 --outfile {fp16}"
@@ -62,17 +104,21 @@ def process_model(model_id, q_method, private_repo, oauth_token: gr.OAuthToken |
         print(result)
         if result.returncode != 0:
             raise Exception(f"Error converting to fp16: {result.stderr}")
-        print("Model converted to fp16 successully!")
+        print("Model converted to fp16 successfully!")
+        print(f"Converted model path: {fp16}")
 
-        qtype = f"{model_name}/{model_name.lower()}.{q_method.upper()}.gguf"
-        quantise_ggml = f"./llama.cpp/quantize {fp16} {qtype} {q_method}"
+        username = whoami(oauth_token.token)["name"]
+        quantized_gguf_name = f"{model_name.lower()}-{q_method.lower()}.gguf"
+        quantized_gguf_path = quantized_gguf_name
+        quantise_ggml = f"./llama.cpp/quantize {fp16} {quantized_gguf_path} {q_method}"
         result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
         if result.returncode != 0:
             raise Exception(f"Error quantizing: {result.stderr}")
-        print("Quantised successfully!")
+        print(f"Quantized successfully with {q_method} option!")
+        print(f"Quantized model path: {quantized_gguf_path}")
 
         # Create empty repo
-        new_repo_url = api.create_repo(repo_id=f"{model_name}-{q_method}-GGUF", exist_ok=True, private=private_repo)
+        new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{q_method}-GGUF", exist_ok=True, private=private_repo)
         new_repo_id = new_repo_url.repo_id
         print("Repo created successfully!", new_repo_url)
 
@@ -90,50 +136,49 @@ def process_model(model_id, q_method, private_repo, oauth_token: gr.OAuthToken |
             This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
             Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
             ## Use with llama.cpp
-
             Install llama.cpp through brew.
-
             ```bash
             brew install ggerganov/ggerganov/llama.cpp
             ```
             Invoke the llama.cpp server or the CLI.
-
             CLI:
-
             ```bash
-            llama-cli --hf-repo {new_repo_id} --model {qtype.split("/")[-1]} -p "The meaning to life and the universe is"
+            llama-cli --hf-repo {new_repo_id} --model {quantized_gguf_name} -p "The meaning to life and the universe is"
             ```
-
             Server:
-
             ```bash
-            llama-server --hf-repo {new_repo_id} --model {qtype.split("/")[-1]} -c 2048
+            llama-server --hf-repo {new_repo_id} --model {quantized_gguf_name} -c 2048
             ```
-
             Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
-
             ```
-            git clone https://github.com/ggerganov/llama.cpp && \
-            cd llama.cpp && \
-            make && \
-            ./main -m {qtype.split("/")[-1]} -n 128
+            git clone https://github.com/ggerganov/llama.cpp && \\
+            cd llama.cpp && \\
+            make && \\
+            ./main -m {quantized_gguf_name} -n 128
             ```
             """
         )
-        card.save(os.path.join(model_name, "README-new.md"))
-
-        api.upload_file(
-            path_or_fileobj=qtype,
-            path_in_repo=qtype.split("/")[-1],
-            repo_id=new_repo_id,
-        )
+        card.save(f"README.md")
+
+        if split_model:
+            split_upload_model(quantized_gguf_path, new_repo_id, oauth_token, split_max_tensors, split_max_size)
+        else:
+            try:
+                print(f"Uploading quantized model: {quantized_gguf_path}")
+                api.upload_file(
+                    path_or_fileobj=quantized_gguf_path,
+                    path_in_repo=quantized_gguf_name,
+                    repo_id=new_repo_id,
+                )
+            except Exception as e:
+                raise Exception(f"Error uploading quantized model: {e}")
 
         api.upload_file(
-            path_or_fileobj=f"{model_name}/README-new.md",
-            path_in_repo="README.md",
+            path_or_fileobj=f"README.md",
+            path_in_repo=f"README.md",
             repo_id=new_repo_id,
         )
-        print("Uploaded successfully!")
+        print(f"Uploaded successfully with {q_method} option!")
 
         return (
             f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>',
@@ -147,38 +192,75 @@ def process_model(model_id, q_method, private_repo, oauth_token: gr.OAuthToken |
 
 
 # Create Gradio interface
-iface = gr.Interface(
-    fn=process_model,
-    inputs=[
-        HuggingfaceHubSearch(
-            label="Hub Model ID",
-            placeholder="Search for model id on Huggingface",
-            search_type="model",
-        ),
-        gr.Dropdown(
-            ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
-            label="Quantization Method",
-            info="GGML quantisation type",
-            value="Q4_K_M",
-            filterable=False
-        ),
-        gr.Checkbox(
-            value=False,
-            label="Private Repo",
-            info="Create a private repo under your username."
-        ),
-    ],
-    outputs=[
-        gr.Markdown(label="output"),
-        gr.Image(show_label=False),
-    ],
-    title="Create your own GGUF Quants, blazingly fast ⚡!",
-    description="The space takes an HF repo as an input, quantises it and creates a Public repo containing the selected quant under your HF user namespace.",
-)
 with gr.Blocks() as demo:
     gr.Markdown("You must be logged in to use GGUF-my-repo.")
     gr.LoginButton(min_width=250)
-    iface.render()
+
+    model_id_input = HuggingfaceHubSearch(
+        label="Hub Model ID",
+        placeholder="Search for model id on Huggingface",
+        search_type="model",
+    )
+
+    q_method_input = gr.Dropdown(
+        ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
+        label="Quantization Method",
+        info="GGML quantization type",
+        value="Q4_K_M",
+        filterable=False
+    )
+
+    private_repo_input = gr.Checkbox(
+        value=False,
+        label="Private Repo",
+        info="Create a private repo under your username."
+    )
+
+    split_model_input = gr.Checkbox(
+        value=False,
+        label="Split Model",
+        info="Shard the model using gguf-split."
+    )
+
+    split_max_tensors_input = gr.Number(
+        value=256,
+        label="Max Tensors per File",
+        info="Maximum number of tensors per file when splitting model.",
+        visible=False
+    )
+
+    split_max_size_input = gr.Textbox(
+        label="Max File Size",
+        info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default.",
+        visible=False
+    )
+
+    iface = gr.Interface(
+        fn=process_model,
+        inputs=[
+            model_id_input,
+            q_method_input,
+            private_repo_input,
+            split_model_input,
+            split_max_tensors_input,
+            split_max_size_input,
+        ],
+        outputs=[
+            gr.Markdown(label="output"),
+            gr.Image(show_label=False),
+        ],
+        title="Create your own GGUF Quants, blazingly fast ⚡!",
+        description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
+    )
+
+    def update_visibility(split_model):
+        return gr.update(visible=split_model), gr.update(visible=split_model)
+
+    split_model_input.change(
+        fn=update_visibility,
+        inputs=split_model_input,
+        outputs=[split_max_tensors_input, split_max_size_input]
+    )
 
 def restart_space():
     HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
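
One UI detail from the last hunk: the two split parameters stay hidden until the "Split Model" checkbox is ticked, via a `change` event whose handler returns one `gr.update(visible=...)` per dependent control. A minimal sketch of that Gradio pattern in isolation follows; component names and labels here are illustrative, not taken from the Space.

```python
# Minimal sketch of the show/hide pattern used for the split options:
# a checkbox's change event toggles the visibility of dependent inputs.
import gradio as gr

with gr.Blocks() as demo:
    split_model = gr.Checkbox(value=False, label="Split Model")
    max_tensors = gr.Number(value=256, label="Max Tensors per File", visible=False)
    max_size = gr.Textbox(label="Max File Size", visible=False)

    def toggle(split):
        # Return one gr.update per output component, in the same order as `outputs`.
        return gr.update(visible=split), gr.update(visible=split)

    split_model.change(fn=toggle, inputs=split_model, outputs=[max_tensors, max_size])

if __name__ == "__main__":
    demo.launch()
```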