Upload README.md with huggingface_hub
README.md
CHANGED
@@ -55,69 +55,3 @@ or
 ```
 ./llama-server --hf-repo CreitinGameplays/Llama-3.1-8B-R1-experimental-Q4_K_M-GGUF --hf-file llama-3.1-8b-r1-experimental-q4_k_m.gguf -c 2048
 ```
-
-
-Run this model:
-```python
-from llama_cpp import Llama
-
-# Load the model (using the full training context for inference)
-llm = Llama.from_pretrained(
-    repo_id="CreitinGameplays/Llama-3.1-8b-reasoning-test-Q4_K_M-GGUF",
-    filename="*.gguf",
-    verbose=False,
-    n_gpu_layers=0,  # CPU-only; increase if using GPU
-    n_batch=512,
-    n_ctx=8192,
-    n_ctx_per_seq=8192,
-    f16_kv=True
-)
-
-# Set up the initial chat history with a system prompt.
-chat_history = [
-    {"role": "system", "content": """
-You are a helpful assistant named Llama, made by Meta AI.
-You are focused on providing systematic, well-reasoned responses. Response Structure: - Format: <think>{{reasoning}}</think>{{answer}} - Reasoning: Minimum 6 logical steps, only when required, in the <think> block - Process: Think first, then answer.
-"""}
-]
-
-print("Enter 'quit' or 'exit' to stop the conversation.")
-
-while True:
-    # Prompt the user for input
-    user_input = input("\nUser: ")
-    if user_input.lower() in ["quit", "exit"]:
-        break
-
-    # Append the new user message to the chat history.
-    chat_history.append({"role": "user", "content": user_input})
-
-    # Call the chat completion API in streaming mode with the updated conversation.
-    output_stream = llm.create_chat_completion(
-        messages=chat_history,
-        temperature=0.6,
-        top_p=0.95,
-        repeat_penalty=1.08,
-        max_tokens=4096,
-        stream=True
-    )
-
-    collected_reply = ""
-    last_finish_reason = None
-
-    # Process each chunk as it arrives.
-    print("Assistant: ", end="", flush=True)
-    for chunk in output_stream:
-        # Each chunk has a 'choices' list; we read the first choice's delta.
-        delta = chunk["choices"][0].get("delta", {})
-        if "content" in delta:
-            text = delta["content"]
-            print(text, end="", flush=True)
-            collected_reply += text
-        if "finish_reason" in chunk["choices"][0]:
-            last_finish_reason = chunk["choices"][0]["finish_reason"]
-
-    # Add the assistant's reply to the conversation history.
-    chat_history.append({"role": "assistant", "content": collected_reply})
-    # Inform the user if generation stopped due to reaching the token limit.
-    if last_finish_reason == "length":
-        print("\n[Generation stopped: reached max_tokens. Consider increasing max_tokens or continuing the conversation.]")
-```
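The retained `./llama-server` command starts llama.cpp's HTTP server with the downloaded GGUF file. As a minimal sketch of using it from Python (assuming the server is left on its default address, `http://127.0.0.1:8080`, and exposes the OpenAI-compatible `/v1/chat/completions` route that current llama.cpp server builds provide), the removed llama-cpp-python loop could be replaced by a plain HTTP request:

```python
import requests

# Chat with the model served by ./llama-server (assumed default: http://127.0.0.1:8080).
resp = requests.post(
    "http://127.0.0.1:8080/v1/chat/completions",
    json={
        "messages": [
            {"role": "system", "content": "You are a helpful assistant named Llama, made by Meta AI."},
            {"role": "user", "content": "Why is the sky blue?"},
        ],
        # Sampling values mirror the removed example.
        "temperature": 0.6,
        "top_p": 0.95,
        "max_tokens": 1024,
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```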
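The removed system prompt asks the model to reply in the form `<think>{{reasoning}}</think>{{answer}}`. A small, hypothetical helper (only the tag names come from that prompt; the function itself is not part of the model card) can separate the reasoning trace from the final answer:

```python
import re

def split_reasoning(reply: str) -> tuple[str, str]:
    """Split a reply of the form <think>reasoning</think>answer into its two parts."""
    match = re.search(r"<think>(.*?)</think>(.*)", reply, flags=re.DOTALL)
    if match is None:
        # No <think> block was emitted; treat the whole reply as the answer.
        return "", reply.strip()
    return match.group(1).strip(), match.group(2).strip()

reasoning, answer = split_reasoning("<think>Step 1: ...</think>The sky is blue because ...")
print(answer)
```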