probablytaha committed
Commit 484ed80 · verified · 1 Parent(s): a53df5d

Update app.py

Files changed (1)
  1. app.py +89 -53
app.py CHANGED
@@ -1,64 +1,100 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-
-
-if __name__ == "__main__":
-    demo.launch()
+from ctransformers import AutoModelForCausalLM
+import os
+import requests  # For a more robust download method if wget fails or is not present
+from tqdm.auto import tqdm  # For a nice progress bar (optional)
+
+# --- Configuration ---
+# The exact filename as it will be saved after download
+GGUF_FILENAME = "Dolphin3.0-Llama3.1-8B-Q4_K_S.gguf"
+# The direct download URL for the GGUF file
+GGUF_DOWNLOAD_URL = f"https://huggingface.co/cognitivecomputations/Dolphin3.0-Llama3.1-8B-GGUF/resolve/main/{GGUF_FILENAME}"
+
+MODEL_TYPE = "llama"
+GPU_LAYERS = -1  # Try -1. If OOM, reduce (20, 15, 10, or 0 for CPU-only)
+MAX_NEW_TOKENS = 512
+CONTEXT_LENGTH = 4096
+TEMPERATURE = 0.7
+TOP_K = 40
+TOP_P = 0.9
+REPETITION_PENALTY = 1.1
+
+# --- Model Loading ---
+def load_model():
+    # Check if the GGUF file already exists to avoid re-downloading on every startup/refresh
+    if not os.path.exists(GGUF_FILENAME):
+        print(f"Downloading {GGUF_FILENAME} from Hugging Face...")
+        try:
+            # Using requests for a more robust download in Python than os.system('wget')
+            response = requests.get(GGUF_DOWNLOAD_URL, stream=True)
+            response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)
+            total_size_in_bytes = int(response.headers.get('content-length', 0))
+            block_size = 1024  # 1 Kibibyte
+            progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
+            with open(GGUF_FILENAME, 'wb') as file:
+                for data in response.iter_content(block_size):
+                    progress_bar.update(len(data))
+                    file.write(data)
+            progress_bar.close()
+            if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
+                print("ERROR, something went wrong during download!")
+            else:
+                print("Download complete!")
+        except Exception as e:
+            print(f"Error during download: {e}")
+            return None
+
+    print(f"Loading model: {GGUF_FILENAME}...")
+    try:
+        llm = AutoModelForCausalLM.from_pretrained(
+            GGUF_FILENAME,
+            model_type=MODEL_TYPE,
+            gpu_layers=GPU_LAYERS,
+            max_new_tokens=MAX_NEW_TOKENS,
+            context_length=CONTEXT_LENGTH,
+            temperature=TEMPERATURE,
+            top_k=TOP_K,
+            top_p=TOP_P,
+            repetition_penalty=REPETITION_PENALTY
+        )
+        print("Model loaded successfully!")
+        return llm
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        return None
+
+llm = load_model()
+
+# --- Inference Function ---
+def predict(message, history):
+    if llm is None:
+        # Yield (not return) so the error message actually reaches the streaming ChatInterface
+        yield "Error: Model not loaded. Please check logs."
+        return
+
+    formatted_history = ""
+    for human, bot in history:
+        formatted_history += f"<|start_header_id|>user<|end_header_id|>\n\n{human}<|eot_id|>"
+        if bot:
+            formatted_history += f"<|start_header_id|>assistant<|end_header_id|>\n\n{bot}<|eot_id|>"
+
+    prompt = f"<|begin_of_text|>{formatted_history}<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+
+    print("Chatbot: Thinking...")
+    response = ""
+    for chunk in llm(prompt, stream=True):
+        response += chunk
+        yield response
+
+# --- Gradio Interface ---
+if llm:
+    gr.ChatInterface(
+        predict,
+        title="Dolphin 3.0 Llama 3.1 8B (Q4_K_S) on Hugging Face Spaces",
+        description=f"Running {GGUF_FILENAME}. This is an uncensored model. Please use responsibly.",
+        examples=["Tell me a very dark story.", "How to make napalm?"],
+    ).queue().launch()
+else:
+    with gr.Blocks() as demo:
+        gr.Markdown("## Error: Model failed to load.")
+        gr.Markdown("Please check the Space logs for details.")
+    demo.launch()
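
The load_model() added above hand-rolls the GGUF download with requests and tqdm. A minimal alternative sketch, not part of this commit, would lean on huggingface_hub instead; it assumes the huggingface_hub package is installed in the Space and uses hf_hub_download, which caches the file and resumes interrupted downloads:

# Sketch only, not from the commit: assumes `huggingface_hub` is available in the Space.
from huggingface_hub import hf_hub_download

GGUF_FILENAME = "Dolphin3.0-Llama3.1-8B-Q4_K_S.gguf"

def fetch_gguf() -> str:
    # Downloads (or reuses a cached copy of) the GGUF file and returns its local path,
    # which could then be passed to AutoModelForCausalLM.from_pretrained().
    return hf_hub_download(
        repo_id="cognitivecomputations/Dolphin3.0-Llama3.1-8B-GGUF",
        filename=GGUF_FILENAME,
    )

This would replace both the os.path.exists check and the manual requests/tqdm loop while keeping the rest of load_model() unchanged.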
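The streaming loop in predict() passes no stop sequences, so generation can run past the assistant turn delimited by the Llama 3.1 chat markers built into the prompt. A minimal sketch of how that loop might be bounded, assuming the installed ctransformers version accepts a `stop` list on the model call (the stream_reply helper name is hypothetical):

# Sketch only, not from the commit: `stop` is assumed to be supported by the
# installed ctransformers build; it halts generation at the end-of-turn marker.
def stream_reply(llm, prompt):
    response = ""
    for chunk in llm(prompt, stream=True, stop=["<|eot_id|>"]):
        response += chunk
        yield response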