Commit: use local server

app.py CHANGED
@@ -2,11 +2,14 @@ from collections.abc import Iterator
 from datetime import datetime
 from pathlib import Path
 from threading import Thread
+from huggingface_hub import hf_hub_download
+from themes.research_monochrome import theme
+from typing import Iterator, List, Dict
 
+import requests
+import json
 import gradio as gr
 
-from themes.research_monochrome import theme
-
 today_date = datetime.today().strftime("%B %-d, %Y")  # noqa: DTZ002
 
 SYS_PROMPT = f"""Knowledge Cutoff Date: April 2024.
@@ -21,6 +24,7 @@ or enter your own. Keep in mind that AI can occasionally make mistakes.
 </span>
 </p>
 """
+LLAMA_CPP_SERVER = "http://127.0.0.1:8081"
 MAX_INPUT_TOKEN_LENGTH = 128_000
 MAX_NEW_TOKENS = 1024
 TEMPERATURE = 0.7
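Note on the new LLAMA_CPP_SERVER constant: the Gradio app can come up before the locally launched llama.cpp server has finished loading the GGUF, so a readiness check before handling the first request is useful. Below is a minimal sketch, assuming a llama.cpp build that exposes a GET /health endpoint; the wait_for_llama_server helper is illustrative and not part of this commit.

import time

import requests

LLAMA_CPP_SERVER = "http://127.0.0.1:8081"


def wait_for_llama_server(timeout_s: float = 120.0) -> None:
    """Block until the local llama.cpp server reports it is ready to serve requests."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            # Recent llama.cpp servers answer GET /health with 200 once the model is loaded.
            if requests.get(f"{LLAMA_CPP_SERVER}/health", timeout=2).status_code == 200:
                return
        except requests.exceptions.RequestException:
            pass  # Server not up yet; keep polling.
        time.sleep(1)
    raise RuntimeError("llama-server did not become ready in time")

Calling wait_for_llama_server() once at startup, right after the server process is launched, avoids the first user request failing with a connection error.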
@@ -29,56 +33,90 @@ TOP_K = 50
 REPETITION_PENALTY = 1.05
 
 # download GGUF into local directory
+gguf_path = hf_hub_download(
+    repo_id="bartowski/granite-3.1-3b-a800m-instruct-GGUF",
+    filename="granite-3.1-3b-a800m-instruct-Q8_0.gguf",
+    local_dir="."
+)
 
-# chmod llama-server
-# start llama-server
+# TODO: chmod llama-server
+# TODO: start llama-server
+# ./llama-server -m granite-3.1-3b-a800m-instruct-Q8_0.gguf -ngl 0 --temp 0.0 -c 2048 -t 8 --port 8081
 
 def generate(
     message: str,
-    chat_history:
+    chat_history: List[Dict],
     temperature: float = TEMPERATURE,
     repetition_penalty: float = REPETITION_PENALTY,
     top_p: float = TOP_P,
     top_k: float = TOP_K,
     max_new_tokens: int = MAX_NEW_TOKENS,
 ) -> Iterator[str]:
-    """Generate function for chat demo."""
+    """Generate function for chat demo using Llama.cpp server."""
+
     # Build messages
     conversation = []
     conversation.append({"role": "system", "content": SYS_PROMPT})
     conversation += chat_history
     conversation.append({"role": "user", "content": message})
 
-    # [old lines 52-81: previous generation code, not shown in this diff view]
+    # Prepare the prompt for the Llama.cpp server
+    prompt = ""
+    for item in conversation:
+        if item["role"] == "system":
+            prompt += f"<|system|>\n{item['content']}\n<|file_separator|>\n"
+        elif item["role"] == "user":
+            prompt += f"<|user|>\n{item['content']}\n<|file_separator|>\n"
+        elif item["role"] == "assistant":
+            prompt += f"<|model|>\n{item['content']}\n<|file_separator|>\n"
+    prompt += "<|model|>\n"  # Add the beginning token for the assistant
+
+
+    # Construct the request payload
+    payload = {
+        "prompt": prompt,
+        "stream": True,  # Enable streaming
+        "max_tokens": max_new_tokens,
+        "temperature": temperature,
+        "repeat_penalty": repetition_penalty,
+        "top_p": top_p,
+        "top_k": top_k,
+        "stop": ["<|file_separator|>"],  # stops after it sees this
+    }
+
+    try:
+        # Make the request to the Llama.cpp server
+        with requests.post(f"{LLAMA_CPP_SERVER}/completion", json=payload, stream=True, timeout=60) as response:
+            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
+
+            # Stream the response from the server
+            outputs = []
+            for line in response.iter_lines():
+                if line:
+                    # Decode the line
+                    decoded_line = line.decode('utf-8')
+                    # Remove 'data: ' prefix if present
+                    if decoded_line.startswith("data: "):
+                        decoded_line = decoded_line[6:]
+
+                    # Handle potential JSON decoding errors
+                    try:
+                        json_data = json.loads(decoded_line)
+                        text = json_data.get("content", "")  # Extract content field. crucial.
+                        if text:
+                            outputs.append(text)
+                            yield "".join(outputs)
+
+                    except json.JSONDecodeError:
+                        print(f"JSONDecodeError: {decoded_line}")
+                        # Handle the error, potentially skipping the line or logging it.
+
+    except requests.exceptions.RequestException as e:
+        print(f"Request failed: {e}")
+        yield f"Error: {e}"  # Yield an error message to the user
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        yield f"Error: {e}"  # Yield error message
 
 
 css_file_path = Path(Path(__file__).parent / "app.css")
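The chmod/start TODOs above are left as comments in this commit, with the intended command shown inline. One way they could be filled in from Python, assuming a llama-server binary is checked into the Space next to app.py (the binary path and the subprocess approach are assumptions, not code from the diff):

import os
import subprocess
from pathlib import Path

LLAMA_SERVER_BIN = Path(__file__).parent / "llama-server"  # assumed location of the bundled binary
GGUF_FILE = "granite-3.1-3b-a800m-instruct-Q8_0.gguf"

# chmod llama-server: make sure the bundled binary is executable.
os.chmod(LLAMA_SERVER_BIN, 0o755)

# start llama-server: mirror the command from the comment in the diff and keep it
# running in the background while the Gradio app serves requests.
llama_process = subprocess.Popen(
    [
        str(LLAMA_SERVER_BIN),
        "-m", GGUF_FILE,
        "-ngl", "0",     # no GPU offload
        "--temp", "0.0",
        "-c", "2048",    # context size
        "-t", "8",       # CPU threads
        "--port", "8081",
    ]
)

Keeping a handle on llama_process also makes it possible to terminate the server cleanly when the Space shuts down.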
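The new generate() flattens the conversation into a single prompt string with hand-written role tokens and posts it to the server's /completion endpoint. As a design note, llama.cpp's server also exposes an OpenAI-compatible /v1/chat/completions endpoint that applies the chat template bundled with the GGUF, so the conversation list could be sent as-is. A rough sketch of that variant, not the code used in this commit:

import json

import requests

LLAMA_CPP_SERVER = "http://127.0.0.1:8081"


def stream_chat(conversation: list[dict], max_new_tokens: int = 1024, temperature: float = 0.7):
    """Yield the growing assistant reply using the OpenAI-compatible endpoint."""
    payload = {
        "messages": conversation,  # [{"role": ..., "content": ...}, ...]
        "stream": True,
        "max_tokens": max_new_tokens,
        "temperature": temperature,
    }
    output = ""
    with requests.post(
        f"{LLAMA_CPP_SERVER}/v1/chat/completions", json=payload, stream=True, timeout=60
    ) as response:
        response.raise_for_status()
        for line in response.iter_lines():
            if not line:
                continue
            decoded = line.decode("utf-8")
            if decoded.startswith("data: "):
                decoded = decoded[6:]
            if decoded.strip() == "[DONE]":  # end-of-stream sentinel
                break
            chunk = json.loads(decoded)
            delta = chunk["choices"][0]["delta"].get("content", "")
            if delta:
                output += delta
                yield output

Whether this is preferable depends on whether the hand-built <|system|>/<|user|>/<|model|> markup matches the template the model expects; the chat-completions route delegates that question to the template shipped inside the GGUF.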