Ffftdtd5dtft committed
Commit e188c9c · verified · 1 Parent(s): 9f74a4f

Update app.py

Files changed (1)
  1. app.py +60 -17
app.py CHANGED
@@ -18,7 +18,7 @@ def generate_importance_matrix(model_path, train_data_path):
 18   print(f"Current working directory: {os.getcwd()}")
 19   print(f"Files in the current directory: {os.listdir('.')}")
 20
 21 - if not os.path.isfile(f"../{model_path}"):
 22       raise Exception(f"Model file not found: {model_path}")
 23
 24   print("Running imatrix command...")
@@ -196,34 +196,77 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
196   ```
197   cd llama.cpp && LLAMA_CURL=1 make
198   ```
199 - Step 3: Fetch model weights from HF using curl command and use them with the above `llama_cli` or `llama_server`.
200   ```
201 - curl -L {new_repo_id} > .gguf/{quantized_gguf_name}
202   ```
203   """
204   )
205 - if use_imatrix:
206 -     card.text += "\nNote: This model was quantized using imatrix."
207 -
208   card.push_to_hub(repo_id=new_repo_id, token=oauth_token.token)
209   api.upload_file(
210       path_or_fileobj=quantized_gguf_path,
211       path_in_repo=quantized_gguf_name,
212       repo_id=new_repo_id,
213 -     token=oauth_token.token,
214   )
215 - print("Pushed model to the hub!")
216   if split_model:
217 -     split_upload_model(quantized_gguf_name, new_repo_id, oauth_token, split_max_tensors=split_max_tensors, split_max_size=split_max_size)
218   except Exception as e:
219 -     print("Error in process_model:", e)
220 -     raise e
221   finally:
222 -     os.makedirs("model_cache", exist_ok=True)
223 -     shutil.move(model_name, f"model_cache/{model_name}")
224 -     shutil.move(fp16, f"model_cache/{fp16}")
225 -     shutil.move(quantized_gguf_path, f"model_cache/{quantized_gguf_path}")
226 -     print("Moved model files to model_cache.")
227
228 - print("Process completed successfully!")
229
 18   print(f"Current working directory: {os.getcwd()}")
 19   print(f"Files in the current directory: {os.listdir('.')}")
 20
 21 + if not os.path.isfile(f"../{model_path}")):
 22       raise Exception(f"Model file not found: {model_path}")
 23
 24   print("Running imatrix command...")
 
196   ```
197   cd llama.cpp && LLAMA_CURL=1 make
198   ```
199 + Step 3: Fetch the model weights from HF with curl and run the model directly:
200   ```
201 + curl -L https://huggingface.co/{new_repo_id}/resolve/main/{quantized_gguf_name} -o ./models/{quantized_gguf_name}
202 + ./llama -m ./models/{quantized_gguf_name} -p "Hello, world!"
203   ```
204 +
205 + ## Additional Notes:
206 + For higher performance, follow llama.cpp's threading tips: keep the CPU fully utilized and set the thread count via `OMP_NUM_THREADS`.
207   """
208   )
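As an illustration of the Step 3 flow added above, a Python equivalent of the curl-and-run commands, assuming `huggingface_hub` is installed and that the llama.cpp CLI built in Step 2 is available as `./llama.cpp/llama-cli` (the binary name varies across llama.cpp versions; the repo and file names below are placeholders for `{new_repo_id}` and `{quantized_gguf_name}`):

```python
import subprocess
from huggingface_hub import hf_hub_download

# Placeholders standing in for {new_repo_id} and {quantized_gguf_name}.
new_repo_id = "your-username/your-model-GGUF"
quantized_gguf_name = "your-model-q4_0.gguf"

# Download the quantized GGUF from the Hub (equivalent to the curl command above).
local_path = hf_hub_download(repo_id=new_repo_id, filename=quantized_gguf_name)

# Run a quick prompt through the llama.cpp CLI (equivalent to ./llama -m ... -p ...).
subprocess.run(["./llama.cpp/llama-cli", "-m", local_path, "-p", "Hello, world!"], check=True)
```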
 
 
 
209   card.push_to_hub(repo_id=new_repo_id, token=oauth_token.token)
210   api.upload_file(
211       path_or_fileobj=quantized_gguf_path,
212       path_in_repo=quantized_gguf_name,
213       repo_id=new_repo_id,
214   )
215   if split_model:
216 +     split_upload_model(quantized_gguf_path, new_repo_id, oauth_token, split_max_tensors=split_max_tensors, split_max_size=split_max_size)
217 + else:
218 +     print("Model split skipped by user.")
219 +
220 + print("Model has been uploaded successfully!")
221   except Exception as e:
222 +     print(f"An error occurred: {str(e)}")
223 +     return False, str(e)
224   finally:
225 +     if os.path.exists(fp16):
226 +         os.remove(fp16)
227 +     if os.path.exists(quantized_gguf_path):
228 +         os.remove(quantized_gguf_path)
229 +     shutil.rmtree(model_name)
230 +     print(f"Removed temporary files for model {model_name}")
231 +
232 + return True, None
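With this change, `process_model` reports its outcome as a `(success, error)` tuple instead of re-raising. A minimal sketch of how a caller might consume that convention; every argument value below is a placeholder, listed in the same order as the click() inputs further down, and `oauth_token` stands for the OAuth token object collected from the UI:

```python
# Placeholder call: model_id, q_method, use_imatrix, imatrix_q_method, private_repo,
# train_data_file, split_model, split_max_tensors, split_max_size, oauth_token
ok, err = process_model("some-org/some-model", "q4_0", False, "q4_0",
                        False, None, False, 256, None, oauth_token)
if ok:
    print("Quantization and upload finished.")
else:
    print(f"Quantization failed: {err}")
```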
233 +
234 + def app_interface():
235 +     with gr.Blocks() as demo:
236 +         gr.Markdown("## GGUF Model Processing")
237 +
238 +         with gr.Row():
239 +             with gr.Column():
240 +                 repo_id = gr.Textbox(label="HuggingFace Repo ID")
241 +                 model_id = gr.Textbox(label="Model ID")
242 +                 q_method = gr.Dropdown(["q4_0", "q4_1", "q5_0", "q5_1", "q8_0"], label="Quantization Method")
243 +                 imatrix_q_method = gr.Dropdown(["q4_0", "q4_1", "q5_0", "q5_1", "q8_0"], label="Imatrix Quantization Method")
244 +                 use_imatrix = gr.Checkbox(label="Use Importance Matrix")
245 +                 private_repo = gr.Checkbox(label="Private Repo")
246 +                 train_data_file = gr.File(label="Training Data File (Optional)")
247 +                 split_model = gr.Checkbox(label="Split Model")
248 +                 split_max_tensors = gr.Number(label="Max Tensors per Shard", value=256)
249 +                 split_max_size = gr.Number(label="Max Shard Size (MB)", value=None)
250 +             with gr.Column():
251 +                 oauth_token = gr.oauth.HuggingFace(
252 +                     "Gradio OAuth Authentication",
253 +                     token=HF_TOKEN,
254 +                 )
255 +
256 +         process_btn = gr.Button("Process Model")
257 +         process_btn.click(
258 +             process_model,
259 +             [model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token],
260 +             outputs=["status_text"]
261 +         )
262 +
263 +     return demo
264 +
265 + if __name__ == "__main__":
266 +     scheduler = BackgroundScheduler(daemon=True)
267 +     scheduler.start()
268
269 +     demo = app_interface()
270 +     demo.launch()
271
272 + signal.signal(signal.SIGINT, signal.SIG_DFL)
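For reference, a minimal, self-contained sketch of the Blocks wiring pattern that `app_interface` above relies on, with an explicit `status_text` component as the click output. The handler and labels are placeholders, and `gr.LoginButton` is shown only as the stock way a Space obtains a Hugging Face OAuth token; it is an assumption, not what this commit calls:

```python
import gradio as gr

def process_stub(model_id: str) -> str:
    # Placeholder handler standing in for process_model.
    return f"Processed {model_id}"

with gr.Blocks() as demo:
    gr.LoginButton()  # typical HF OAuth entry point in a Space (assumption)
    model_id = gr.Textbox(label="Model ID")
    status_text = gr.Textbox(label="Status", interactive=False)
    process_btn = gr.Button("Process Model")
    # Outputs are component objects, so the handler's return value lands in status_text.
    process_btn.click(process_stub, inputs=[model_id], outputs=[status_text])

if __name__ == "__main__":
    demo.launch()
```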