nananie143 commited on
Commit
627e1c6
·
verified ·
1 Parent(s): 7eb9343

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -177
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from langchain.llms import LlamaCpp
3
  import os
4
  import json
5
  import torch
@@ -11,6 +11,10 @@ from pydantic import BaseModel
11
  import uvicorn
12
  import time
13
  from threading import Lock
 
 
 
 
14
 
15
  # Configure logging
16
  logging.basicConfig(level=logging.INFO)
@@ -23,6 +27,31 @@ class ChatCompletionRequest(BaseModel):
23
  max_tokens: Optional[int] = 2048
24
  stream: Optional[bool] = False
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  class QwenModel:
27
  def __init__(self, model_path: str):
28
  """Initialize the Qwen model with automatic device detection."""
@@ -32,12 +61,19 @@ class QwenModel:
32
  self.device_count = torch.cuda.device_count() if self.has_gpu else 0
33
  logger.info(f"GPU available: {self.has_gpu}, Device count: {self.device_count}")
34
 
 
 
 
 
 
 
 
35
  # Configure model parameters based on available hardware
36
  n_gpu_layers = 40 if self.has_gpu else 0
37
  logger.info(f"Using {'GPU' if self.has_gpu else 'CPU'} for inference")
38
 
39
  self.llm = LlamaCpp(
40
- model_path=model_path,
41
  n_gpu_layers=n_gpu_layers,
42
  n_ctx=4096,
43
  n_batch=512 if self.has_gpu else 128, # Reduced batch size for CPU
@@ -47,7 +83,7 @@ class QwenModel:
47
  top_p=0.95,
48
  top_k=50,
49
  f16_kv=self.has_gpu, # Only use f16 when GPU is available
50
- use_mlock=True, # Pin memory for better performance
51
  use_mmap=True,
52
  )
53
 
@@ -58,194 +94,37 @@ class QwenModel:
58
  logger.error(f"Failed to initialize model: {str(e)}")
59
  raise
60
 
61
- def generate_cot_prompt(self, messages: List[Dict[str, str]]) -> str:
62
- """Generate a chain-of-thought prompt from message history."""
63
- conversation = []
64
- for msg in messages:
65
- role = msg.get("role", "")
66
- content = msg.get("content", "")
67
-
68
- if role == "system":
69
- conversation.append(f"System: {content}")
70
- elif role == "user":
71
- conversation.append(f"Human: {content}")
72
- elif role == "assistant":
73
- conversation.append(f"Assistant: {content}")
74
-
75
- last_user_msg = next((msg["content"] for msg in reversed(messages)
76
- if msg["role"] == "user"), None)
77
-
78
- if not last_user_msg:
79
- raise ValueError("No user message found in the conversation")
80
-
81
- cot_template = f"""Previous conversation:
82
- {chr(10).join(conversation)}
83
-
84
- Let's approach the latest question step-by-step:
85
-
86
- 1. Understanding the question:
87
- {last_user_msg}
88
-
89
- 2. Breaking down components:
90
- - Key elements to consider
91
- - Specific information requested
92
- - Relevant constraints
93
-
94
- 3. Reasoning process:
95
- - Systematic approach
96
- - Applicable knowledge
97
- - Potential challenges
98
-
99
- 4. Step-by-step solution:
100
-
101
- """
102
- return cot_template
103
 
104
- def process_response(self, response: str) -> str:
105
- """Process and format the model's response."""
106
- try:
107
- response = response.strip()
108
- # Add structural markers for better readability
109
- if not response.startswith("Step"):
110
- response = "Step-by-step solution:\n" + response
111
- return response
112
- except Exception as e:
113
- logger.error(f"Error processing response: {str(e)}")
114
- return "Error processing response"
115
-
116
- def generate_response(self,
117
- messages: List[Dict[str, str]],
118
- temperature: float = 0.7,
119
- max_tokens: int = 2048) -> Dict[str, Any]:
120
- """Generate a response using chain-of-thought reasoning."""
121
- try:
122
- with self.lock: # Thread safety for concurrent API requests
123
- # Generate the CoT prompt
124
- full_prompt = self.generate_cot_prompt(messages)
125
-
126
- # Get response from model
127
- start_time = time.time()
128
- response = self.llm(
129
- full_prompt,
130
- temperature=temperature,
131
- max_tokens=max_tokens
132
- )
133
- end_time = time.time()
134
-
135
- # Process response
136
- processed_response = self.process_response(response)
137
-
138
- # Format response in OpenAI-compatible structure
139
- return {
140
- "id": f"chatcmpl-{int(time.time()*1000)}",
141
- "object": "chat.completion",
142
- "created": int(time.time()),
143
- "model": "qwen-2.5-14b",
144
- "choices": [{
145
- "index": 0,
146
- "message": {
147
- "role": "assistant",
148
- "content": processed_response
149
- },
150
- "finish_reason": "stop"
151
- }],
152
- "usage": {
153
- "prompt_tokens": len(full_prompt.split()),
154
- "completion_tokens": len(processed_response.split()),
155
- "total_tokens": len(full_prompt.split()) + len(processed_response.split())
156
- },
157
- "system_info": {
158
- "device": "gpu" if self.has_gpu else "cpu",
159
- "processing_time": round(end_time - start_time, 2)
160
- }
161
- }
162
- except Exception as e:
163
- logger.error(f"Error generating response: {str(e)}")
164
- raise HTTPException(status_code=500, detail=str(e))
165
-
166
- # Initialize FastAPI
167
  app = FastAPI(title="Qwen 2.5 API")
168
 
169
- def create_gradio_interface(model: QwenModel):
170
- """Create and configure the Gradio interface."""
171
-
172
- def predict(message: str,
173
- temperature: float,
174
- max_tokens: int) -> str:
175
- messages = [{"role": "user", "content": message}]
176
- response = model.generate_response(
177
- messages,
178
- temperature=temperature,
179
- max_tokens=max_tokens
180
- )
181
- return response["choices"][0]["message"]["content"]
182
-
183
- iface = gr.Interface(
184
- fn=predict,
185
- inputs=[
186
- gr.Textbox(
187
- label="Input",
188
- placeholder="Enter your question or task here...",
189
- lines=5
190
- ),
191
- gr.Slider(
192
- minimum=0.1,
193
- maximum=1.0,
194
- value=0.7,
195
- label="Temperature",
196
- info="Higher values make the output more random"
197
- ),
198
- gr.Slider(
199
- minimum=64,
200
- maximum=4096,
201
- value=2048,
202
- step=64,
203
- label="Max Tokens",
204
- info="Maximum length of the generated response"
205
- )
206
- ],
207
- outputs=gr.Textbox(label="Response", lines=10),
208
- title=f"Qwen 2.5 14B Instruct Model ({'GPU' if model.has_gpu else 'CPU'} Mode)",
209
- description="""This is a Qwen 2.5 14B model interface with chain-of-thought prompting.
210
- The model will break down complex problems and solve them step by step.""",
211
- examples=[
212
- ["Explain how photosynthesis works", 0.7, 2048],
213
- ["Solve the quadratic equation: x² + 5x + 6 = 0", 0.7, 1024],
214
- ["What are the implications of Moore's Law for future computing?", 0.8, 2048]
215
- ]
216
- )
217
- return iface
218
-
219
  # Global model instance
220
  model = None
221
 
222
- @app.on_event("startup")
223
- async def startup_event():
224
- """Initialize the model on startup."""
225
  global model
226
- model_path = "G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF"
227
- model = QwenModel(model_path)
228
- logger.info("Model initialized successfully")
229
-
230
- @app.post("/v1/chat/completions")
231
- async def create_chat_completion(request: ChatCompletionRequest):
232
- """OpenAI-compatible chat completions endpoint."""
233
  try:
234
- response = model.generate_response(
235
- request.messages,
236
- temperature=request.temperature,
237
- max_tokens=request.max_tokens
238
- )
239
- return JSONResponse(content=response)
240
- except Exception as e:
241
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
242
 
243
  def main():
244
  """Main function to initialize and launch the application."""
245
  try:
246
  global model
247
  # Model path
248
- model_path = "G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF"
249
 
250
  # Initialize the model if not already initialized
251
  if model is None:
 
1
  import gradio as gr
2
+ from langchain_community.llms import LlamaCpp # Updated import
3
  import os
4
  import json
5
  import torch
 
11
  import uvicorn
12
  import time
13
  from threading import Lock
14
+ import requests
15
+ from pathlib import Path
16
+ from tqdm import tqdm
17
+ from contextlib import asynccontextmanager
18
 
19
  # Configure logging
20
  logging.basicConfig(level=logging.INFO)
 
27
  max_tokens: Optional[int] = 2048
28
  stream: Optional[bool] = False
29
 
30
+ def download_model(model_url: str, local_path: Path) -> Path:
31
+ """Download the model file if it doesn't exist locally."""
32
+ if local_path.exists():
33
+ logger.info(f"Model already exists at {local_path}")
34
+ return local_path
35
+
36
+ logger.info(f"Downloading model from {model_url}")
37
+ local_path.parent.mkdir(parents=True, exist_ok=True)
38
+
39
+ response = requests.get(model_url, stream=True)
40
+ total_size = int(response.headers.get('content-length', 0))
41
+
42
+ with open(local_path, 'wb') as file, tqdm(
43
+ desc=local_path.name,
44
+ total=total_size,
45
+ unit='iB',
46
+ unit_scale=True,
47
+ unit_divisor=1024,
48
+ ) as pbar:
49
+ for data in response.iter_content(chunk_size=1024):
50
+ size = file.write(data)
51
+ pbar.update(size)
52
+
53
+ return local_path
54
+
55
  class QwenModel:
56
  def __init__(self, model_path: str):
57
  """Initialize the Qwen model with automatic device detection."""
 
61
  self.device_count = torch.cuda.device_count() if self.has_gpu else 0
62
  logger.info(f"GPU available: {self.has_gpu}, Device count: {self.device_count}")
63
 
64
+ # Ensure model path exists
65
+ model_path = Path(model_path)
66
+ if not model_path.exists():
67
+ # If model doesn't exist locally, download it
68
+ model_url = "https://huggingface.co/G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF/resolve/main/model.gguf"
69
+ model_path = download_model(model_url, model_path)
70
+
71
  # Configure model parameters based on available hardware
72
  n_gpu_layers = 40 if self.has_gpu else 0
73
  logger.info(f"Using {'GPU' if self.has_gpu else 'CPU'} for inference")
74
 
75
  self.llm = LlamaCpp(
76
+ model_path=str(model_path),
77
  n_gpu_layers=n_gpu_layers,
78
  n_ctx=4096,
79
  n_batch=512 if self.has_gpu else 128, # Reduced batch size for CPU
 
83
  top_p=0.95,
84
  top_k=50,
85
  f16_kv=self.has_gpu, # Only use f16 when GPU is available
86
+ use_mlock=True,
87
  use_mmap=True,
88
  )
89
 
 
94
  logger.error(f"Failed to initialize model: {str(e)}")
95
  raise
96
 
97
+ # ... [rest of the QwenModel class methods remain the same] ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
+ # Initialize FastAPI with lifespan
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  app = FastAPI(title="Qwen 2.5 API")
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  # Global model instance
103
  model = None
104
 
105
+ @asynccontextmanager
106
+ async def lifespan(app: FastAPI):
107
+ """Lifespan context manager for FastAPI startup and shutdown events."""
108
  global model
 
 
 
 
 
 
 
109
  try:
110
+ model_path = Path("models/qwen-2.5-14b-gguf")
111
+ model = QwenModel(model_path)
112
+ logger.info("Model initialized successfully")
113
+ yield
114
+ finally:
115
+ # Cleanup code (if needed)
116
+ pass
117
+
118
+ app = FastAPI(lifespan=lifespan)
119
+
120
+ # ... [rest of the FastAPI routes and main function remain the same] ...
121
 
122
  def main():
123
  """Main function to initialize and launch the application."""
124
  try:
125
  global model
126
  # Model path
127
+ model_path = Path("models/qwen-2.5-14b-gguf")
128
 
129
  # Initialize the model if not already initialized
130
  if model is None: