nananie143 committed
Commit f871a33 · verified · 1 Parent(s): c139197

Create app.py

Files changed (1): app.py +272 -0
app.py ADDED
@@ -0,0 +1,272 @@
import gradio as gr
from langchain.llms import LlamaCpp
import os
import json
import torch
import logging
from typing import Optional, List, Dict, Any
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import uvicorn
import time
from threading import Lock

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ChatCompletionRequest(BaseModel):
    """Request schema for the OpenAI-compatible /v1/chat/completions endpoint.

    The stream flag is accepted for compatibility but is not currently honored.
    """
    model: str
    messages: List[Dict[str, str]]
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 2048
    stream: Optional[bool] = False

class QwenModel:
    def __init__(self, model_path: str):
        """Initialize the Qwen model with automatic device detection."""
        try:
            # Check for GPU availability
            self.has_gpu = torch.cuda.is_available()
            self.device_count = torch.cuda.device_count() if self.has_gpu else 0
            logger.info(f"GPU available: {self.has_gpu}, Device count: {self.device_count}")

            # Configure model parameters based on available hardware
            n_gpu_layers = 40 if self.has_gpu else 0
            logger.info(f"Using {'GPU' if self.has_gpu else 'CPU'} for inference")

            # Note: LlamaCpp loads a local .gguf file, so model_path must point to one on disk
            self.llm = LlamaCpp(
                model_path=model_path,
                n_gpu_layers=n_gpu_layers,
                n_ctx=4096,
                n_batch=512 if self.has_gpu else 128,  # Reduced batch size for CPU
                verbose=True,
                temperature=0.7,
                max_tokens=2048,
                top_p=0.95,
                top_k=50,
                f16_kv=self.has_gpu,  # Only use f16 key/value cache when GPU is available
                use_mlock=True,  # Pin memory for better performance
                use_mmap=True,
            )

            # Thread lock for concurrent API requests
            self.lock = Lock()

        except Exception as e:
            logger.error(f"Failed to initialize model: {str(e)}")
            raise

    def generate_cot_prompt(self, messages: List[Dict[str, str]]) -> str:
        """Generate a chain-of-thought prompt from the message history."""
        conversation = []
        for msg in messages:
            role = msg.get("role", "")
            content = msg.get("content", "")

            if role == "system":
                conversation.append(f"System: {content}")
            elif role == "user":
                conversation.append(f"Human: {content}")
            elif role == "assistant":
                conversation.append(f"Assistant: {content}")

        last_user_msg = next((msg["content"] for msg in reversed(messages)
                              if msg["role"] == "user"), None)

        if not last_user_msg:
            raise ValueError("No user message found in the conversation")

        # chr(10) is a newline; f-string expressions cannot contain backslashes here
        cot_template = f"""Previous conversation:
{chr(10).join(conversation)}

Let's approach the latest question step-by-step:

1. Understanding the question:
{last_user_msg}

2. Breaking down components:
- Key elements to consider
- Specific information requested
- Relevant constraints

3. Reasoning process:
- Systematic approach
- Applicable knowledge
- Potential challenges

4. Step-by-step solution:

"""
        return cot_template

    def process_response(self, response: str) -> str:
        """Process and format the model's response."""
        try:
            response = response.strip()
            # Add structural markers for better readability
            if not response.startswith("Step"):
                response = "Step-by-step solution:\n" + response
            return response
        except Exception as e:
            logger.error(f"Error processing response: {str(e)}")
            return "Error processing response"

    def generate_response(self,
                          messages: List[Dict[str, str]],
                          temperature: float = 0.7,
                          max_tokens: int = 2048) -> Dict[str, Any]:
        """Generate a response using chain-of-thought reasoning."""
        try:
            with self.lock:  # Thread safety for concurrent API requests
                # Generate the CoT prompt
                full_prompt = self.generate_cot_prompt(messages)

                # Get response from model
                start_time = time.time()
                response = self.llm(
                    full_prompt,
                    temperature=temperature,
                    max_tokens=max_tokens
                )
                end_time = time.time()

                # Process response
                processed_response = self.process_response(response)

                # Format response in OpenAI-compatible structure
                # (token counts below are a rough whitespace-split approximation,
                # not a true tokenizer count)
                return {
                    "id": f"chatcmpl-{int(time.time()*1000)}",
                    "object": "chat.completion",
                    "created": int(time.time()),
                    "model": "qwen-2.5-14b",
                    "choices": [{
                        "index": 0,
                        "message": {
                            "role": "assistant",
                            "content": processed_response
                        },
                        "finish_reason": "stop"
                    }],
                    "usage": {
                        "prompt_tokens": len(full_prompt.split()),
                        "completion_tokens": len(processed_response.split()),
                        "total_tokens": len(full_prompt.split()) + len(processed_response.split())
                    },
                    "system_info": {
                        "device": "gpu" if self.has_gpu else "cpu",
                        "processing_time": round(end_time - start_time, 2)
                    }
                }
        except Exception as e:
            logger.error(f"Error generating response: {str(e)}")
            raise HTTPException(status_code=500, detail=str(e))

# Initialize FastAPI
app = FastAPI(title="Qwen 2.5 API")

def create_gradio_interface(model: QwenModel):
    """Create and configure the Gradio interface."""

    def predict(message: str,
                temperature: float,
                max_tokens: int) -> str:
        messages = [{"role": "user", "content": message}]
        response = model.generate_response(
            messages,
            temperature=temperature,
            max_tokens=max_tokens
        )
        return response["choices"][0]["message"]["content"]

    iface = gr.Interface(
        fn=predict,
        inputs=[
            gr.Textbox(
                label="Input",
                placeholder="Enter your question or task here...",
                lines=5
            ),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                label="Temperature",
                info="Higher values make the output more random"
            ),
            gr.Slider(
                minimum=64,
                maximum=4096,
                value=2048,
                step=64,
                label="Max Tokens",
                info="Maximum length of the generated response"
            )
        ],
        outputs=gr.Textbox(label="Response", lines=10),
        title=f"Qwen 2.5 14B Instruct Model ({'GPU' if model.has_gpu else 'CPU'} Mode)",
        description="""This is a Qwen 2.5 14B model interface with chain-of-thought prompting.
        The model will break down complex problems and solve them step by step.""",
        examples=[
            ["Explain how photosynthesis works", 0.7, 2048],
            ["Solve the quadratic equation: x² + 5x + 6 = 0", 0.7, 1024],
            ["What are the implications of Moore's Law for future computing?", 0.8, 2048]
        ]
    )
    return iface

# Global model instance
model = None

@app.on_event("startup")
async def startup_event():
    """Initialize the model on startup."""
    global model
    # Note: this is a Hugging Face repo id, not a local .gguf path; LlamaCpp needs
    # the model file to be available on disk before it can be loaded.
    model_path = "G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF"
    if model is None:  # Avoid loading the model a second time when main() already created it
        model = QwenModel(model_path)
    logger.info("Model initialized successfully")

@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest):
    """OpenAI-compatible chat completions endpoint."""
    try:
        response = model.generate_response(
            request.messages,
            temperature=request.temperature,
            max_tokens=request.max_tokens
        )
        return JSONResponse(content=response)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

def main():
    """Main function to initialize and launch the application."""
    try:
        global model
        # Model path (a Hugging Face repo id; see the note in startup_event)
        model_path = "G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF"

        # Initialize the model if not already initialized
        if model is None:
            model = QwenModel(model_path)

        # Create the Gradio interface
        interface = create_gradio_interface(model)

        # Mount the Gradio interface onto the FastAPI app
        gr.mount_gradio_app(app, interface, path="/")

        # Launch with uvicorn
        uvicorn.run(
            app,
            host="0.0.0.0",
            port=7860,
            log_level="info"
        )
    except Exception as e:
        logger.error(f"Application failed to start: {str(e)}")
        raise

if __name__ == "__main__":
    main()
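
Note on the model path: LlamaCpp loads a local GGUF file, while the model_path above is a Hugging Face repo id. A minimal sketch of one way to bridge this, assuming the huggingface_hub package is available; the filename argument is a placeholder, not something specified in this commit, and must match the actual .gguf file in the repo:

from huggingface_hub import hf_hub_download

# Download the GGUF weights locally; the filename below is an assumed placeholder.
local_gguf_path = hf_hub_download(
    repo_id="G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF",
    filename="qwen2.5-14b-instruct-uncensored-q8_0.gguf",  # verify against the repo contents
)
model = QwenModel(local_gguf_path)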
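
For reference, a minimal client-side sketch of calling the OpenAI-compatible endpoint defined above, assuming the app is running locally on port 7860 and the requests package is installed (neither is part of this commit):

import requests

payload = {
    "model": "qwen-2.5-14b",
    "messages": [{"role": "user", "content": "Explain how photosynthesis works"}],
    "temperature": 0.7,
    "max_tokens": 1024,
}

# POST to the endpoint served by create_chat_completion above.
resp = requests.post("http://localhost:7860/v1/chat/completions", json=payload, timeout=600)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])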