Upload README.md with huggingface_hub
README.md
CHANGED
@@ -55,69 +55,3 @@ or
 ```
 ./llama-server --hf-repo CreitinGameplays/Llama-3.1-8B-R1-experimental-Q4_K_M-GGUF --hf-file llama-3.1-8b-r1-experimental-q4_k_m.gguf -c 2048
 ```
-
-
-Run this model:
-```python
-from llama_cpp import Llama
-
-# Load the model (using the full training context for inference)
-llm = Llama.from_pretrained(
-    repo_id="CreitinGameplays/Llama-3.1-8b-reasoning-test-Q4_K_M-GGUF",
-    filename="*.gguf",
-    verbose=False,
-    n_gpu_layers=0,  # CPU-only; increase if using GPU
-    n_batch=512,
-    n_ctx=8192,
-    n_ctx_per_seq=8192,
-    f16_kv=True
-)
-
-# Set up the initial chat history with a system prompt.
-chat_history = [
-    {"role": "system", "content": """
-You are a helpful assistant named Llama, made by Meta AI.
-You are focused on providing systematic, well-reasoned responses. Response Structure: - Format: <think>{{reasoning}}</think>{{answer}} - Reasoning: Minimum 6 logical steps, only when required, in the <think> block - Process: Think first, then answer.
-"""}
-]
-
-print("Enter 'quit' or 'exit' to stop the conversation.")
-
-while True:
-    # Prompt the user for input
-    user_input = input("\nUser: ")
-    if user_input.lower() in ["quit", "exit"]:
-        break
-
-    # Append the new user message to the chat history.
-    chat_history.append({"role": "user", "content": user_input})
-
-    # Call the chat completion API in streaming mode with the updated conversation.
-    output_stream = llm.create_chat_completion(
-        messages=chat_history,
-        temperature=0.6,
-        top_p=0.95,
-        repeat_penalty=1.08,
-        max_tokens=4096,
-        stream=True
-    )
-
-    collected_reply = ""
-    last_finish_reason = None
-
-    # Process each chunk as it arrives.
-    print("Assistant: ", end="", flush=True)
-    for chunk in output_stream:
-        # Each chunk has a 'choices' list; we read the first choice's delta.
-        delta = chunk["choices"][0].get("delta", {})
-        if "content" in delta:
-            text = delta["content"]
-            print(text, end="", flush=True)
-            collected_reply += text
-        if "finish_reason" in chunk["choices"][0]:
-            last_finish_reason = chunk["choices"][0]["finish_reason"]
-
-    # Add the assistant's reply to the conversation history.
-    chat_history.append({"role": "assistant", "content": collected_reply})
-    # Inform the user if generation stopped due to reaching the token limit.
-    if last_finish_reason == "length":
-        print("\n[Generation stopped: reached max_tokens. Consider increasing max_tokens or continuing the conversation.]")
-```
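The retained `./llama-server` command starts llama.cpp's HTTP server with the downloaded GGUF file. As a minimal sketch of using it from Python (assuming the server is left on its default address, `http://127.0.0.1:8080`, and exposes the OpenAI-compatible `/v1/chat/completions` route that current llama.cpp server builds provide), the removed llama-cpp-python loop could be replaced by a plain HTTP request:

```python
import requests

# Chat with the model served by ./llama-server (assumed default: http://127.0.0.1:8080).
resp = requests.post(
    "http://127.0.0.1:8080/v1/chat/completions",
    json={
        "messages": [
            {"role": "system", "content": "You are a helpful assistant named Llama, made by Meta AI."},
            {"role": "user", "content": "Why is the sky blue?"},
        ],
        # Sampling values mirror the removed example.
        "temperature": 0.6,
        "top_p": 0.95,
        "max_tokens": 1024,
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```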
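The removed system prompt asks the model to reply in the form `<think>{{reasoning}}</think>{{answer}}`. A small, hypothetical helper (only the tag names come from that prompt; the function itself is not part of the model card) can separate the reasoning trace from the final answer:

```python
import re

def split_reasoning(reply: str) -> tuple[str, str]:
    """Split a reply of the form <think>reasoning</think>answer into its two parts."""
    match = re.search(r"<think>(.*?)</think>(.*)", reply, flags=re.DOTALL)
    if match is None:
        # No <think> block was emitted; treat the whole reply as the answer.
        return "", reply.strip()
    return match.group(1).strip(), match.group(2).strip()

reasoning, answer = split_reasoning("<think>Step 1: ...</think>The sky is blue because ...")
print(answer)
```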