CreitinGameplays committed on
Commit 5181e48 · verified · 1 Parent(s): 522e902

Upload README.md with huggingface_hub

Files changed (1)
  1. README.md +0 -66
README.md CHANGED
@@ -55,69 +55,3 @@ or
  ```
  ./llama-server --hf-repo CreitinGameplays/Llama-3.1-8B-R1-experimental-Q4_K_M-GGUF --hf-file llama-3.1-8b-r1-experimental-q4_k_m.gguf -c 2048
  ```
-
-
- Run this model:
- ```python
- from llama_cpp import Llama
-
- # Load the model (using the full training context for inference)
- llm = Llama.from_pretrained(
-     repo_id="CreitinGameplays/Llama-3.1-8b-reasoning-test-Q4_K_M-GGUF",
-     filename="*.gguf",
-     verbose=False,
-     n_gpu_layers=0,  # CPU-only; increase if using a GPU
-     n_batch=512,
-     n_ctx=8192,
-     n_ctx_per_seq=8192,
-     f16_kv=True
- )
-
- # Set up the initial chat history with a system prompt.
- chat_history = [
-     {"role": "system", "content": """
- You are a helpful assistant named Llama, made by Meta AI.
- You are focused on providing systematic, well-reasoned responses. Response Structure: - Format: <think>{{reasoning}}</think>{{answer}} - Reasoning: Minimum 6 logical steps only when it required in <think> block - Process: Think first, then answer.
- """}
- ]
-
- print("Enter 'quit' or 'exit' to stop the conversation.")
-
- while True:
-     # Prompt the user for input.
-     user_input = input("\nUser: ")
-     if user_input.lower() in ["quit", "exit"]:
-         break
-
-     # Append the new user message to the chat history.
-     chat_history.append({"role": "user", "content": user_input})
-
-     # Call the chat completion API in streaming mode with the updated conversation.
-     output_stream = llm.create_chat_completion(
-         messages=chat_history,
-         temperature=0.6,
-         top_p=0.95,
-         repeat_penalty=1.08,
-         max_tokens=4096,
-         stream=True
-     )
-
-     collected_reply = ""
-     last_finish_reason = None
-
-     # Process each chunk as it arrives.
-     print("Assistant: ", end="", flush=True)
-     for chunk in output_stream:
-         # Each chunk has a 'choices' list; read the first choice's delta.
-         choice = chunk["choices"][0]
-         delta = choice.get("delta", {})
-         if "content" in delta:
-             text = delta["content"]
-             print(text, end="", flush=True)
-             collected_reply += text
-         # Record the finish reason once the final chunk reports one.
-         if choice.get("finish_reason") is not None:
-             last_finish_reason = choice["finish_reason"]
-
-     # Add the assistant's reply to the conversation history.
-     chat_history.append({"role": "assistant", "content": collected_reply})
-     # Inform the user if generation stopped due to reaching the token limit.
-     if last_finish_reason == "length":
-         print("\n[Generation stopped: reached max_tokens. Consider increasing max_tokens or continuing the conversation.]")
- ```
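Once `llama-server` is running with the command above, the model can also be queried over the server's OpenAI-compatible HTTP API. Below is a minimal sketch, not an official example: it assumes the server's default listen address of `http://localhost:8080` (adjust the URL if you started the server with `--host`/`--port`) and mirrors the sampling settings from the removed library example.

```python
# Minimal sketch of querying a running llama-server instance over its
# OpenAI-compatible /v1/chat/completions endpoint.
# Assumes the default listen address http://localhost:8080; change the URL
# if the server was launched with --host/--port.
import requests

response = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "messages": [
            {"role": "user", "content": "Hello! Who are you?"},
        ],
        # Sampling settings mirroring the Python example above.
        "temperature": 0.6,
        "top_p": 0.95,
        "max_tokens": 1024,
    },
    timeout=300,
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```

Note that the `-c 2048` flag in the server command caps the context window at 2048 tokens, so long conversations will be truncated sooner than with the `n_ctx=8192` setting used in the library example.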
 