mike23415 committed
Commit 480e847 · verified · Parent: 5d565fc

Update app.py

Files changed (1):
  1. app.py  +63 -181
app.py CHANGED
@@ -6,21 +6,12 @@ import time
 import logging
 import threading
 import queue
-import json
-import gc
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
 # Set up logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    datefmt='%Y-%m-%d %H:%M:%S'
-)
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Print startup banner for visibility in logs
-print("\n===== Application Startup at", time.strftime("%Y-%m-%d %H:%M:%S"), "=====\n")
-
 # Fix caching issue on Hugging Face Spaces
 os.environ["TRANSFORMERS_CACHE"] = "/tmp"
 os.environ["HF_HOME"] = "/tmp"
@@ -36,110 +27,41 @@ logger.info(f"Using device: {device}")
 tokenizer = None
 model = None
 
-# Check available system resources
-def log_system_info():
-    # Basic system info
-    logger.info(f"Python version: {os.sys.version}")
-
-    # CPU info
-    import multiprocessing
-    logger.info(f"CPU cores: {multiprocessing.cpu_count()}")
-
-    # Memory info
-    try:
-        import psutil
-        mem = psutil.virtual_memory()
-        logger.info(f"Memory: Total={mem.total/1e9:.1f}GB, Available={mem.available/1e9:.1f}GB ({mem.percent}% used)")
-    except ImportError:
-        logger.info("psutil not installed, skipping detailed memory info")
-
-    # PyTorch info
-    logger.info(f"PyTorch version: {torch.__version__}")
-    logger.info(f"CUDA available: {torch.cuda.is_available()}")
-    if torch.cuda.is_available():
-        logger.info(f"CUDA version: {torch.version.cuda}")
-        logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
-
 # Initialize models once on startup
 def initialize_models():
     global tokenizer, model
     try:
         logger.info("Loading language model...")
-        log_system_info()
-
-        # You can change the model here if needed
-        model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # Good balance of quality and speed for CPU
-
-        # Load tokenizer with caching
-        logger.info(f"Loading tokenizer: {model_name}")
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_name,
-            use_fast=True,  # Use the fast tokenizers when available
-            local_files_only=False  # Allow downloading if not cached
-        )
-
-        # Free up memory before loading model
-        gc.collect()
-        torch.cuda.empty_cache() if torch.cuda.is_available() else None
-
-        # Load model with optimizations for CPU
-        logger.info(f"Loading model: {model_name}")
-
-        # Set lower precision for CPU to reduce memory usage
-        torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-
+        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
-            torch_dtype=torch_dtype,
-            low_cpu_mem_usage=True,  # Optimize memory loading
-            device_map="auto"  # Let the system decide optimal device mapping
+            torch_dtype=torch.float16,  # Use float16 for lower memory on CPU
+            device_map="cpu",  # Explicitly set to CPU
+            low_cpu_mem_usage=True  # Optimize memory loading
        )
 
-        # Handle padding tokens
         if tokenizer.pad_token is None:
-            logger.info("Setting pad token to EOS token")
             tokenizer.pad_token = tokenizer.eos_token
             model.config.pad_token_id = model.config.eos_token_id
 
-        # Set up model configuration for better generation
-        model.config.do_sample = True  # Enable sampling
-        model.config.temperature = 0.7  # Default temperature
-        model.config.top_p = 0.9  # Default top_p
-
         logger.info("Models initialized successfully")
     except Exception as e:
         logger.error(f"Error initializing models: {str(e)}")
         raise
 
-# TextStreamer class for token-by-token generation
-class TextStreamer:
-    def __init__(self, tokenizer, queue):
-        self.tokenizer = tokenizer
-        self.queue = queue
-        self.current_tokens = []
-
-    def put(self, token_ids):
-        self.current_tokens.extend(token_ids.tolist())
-        text = self.tokenizer.decode(self.current_tokens, skip_special_tokens=True)
-        self.queue.put(text)
-
-    def end(self):
-        pass
-
 # Function to simulate "thinking" process
 def thinking_process(message, result_queue):
     """
-    This function simulates a thinking process and puts the result in the queue.
-    It includes both an explicit thinking stage and then a generation stage.
+    This function simulates a thinking process and puts the result in the queue
     """
     try:
-        # Simulate explicit thinking stage
+        # Simulate thinking process
         logger.info(f"Thinking about: '{message}'")
 
-        # Create thoughtful prompt with system message and thinking instructions
+        # Create prompt with system message
         prompt = f"""<|im_start|>system
-You are a helpful, friendly, and thoughtful AI assistant.
-Let's approach the user's request step by step.
+You are a helpful, friendly, and thoughtful AI assistant. Think carefully and provide informative, detailed responses.
 <|im_end|>
 <|im_start|>user
 {message}<|im_end|>
@@ -147,31 +69,23 @@ Let's approach the user's request step by step.
 """
 
         # Handle inputs
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
-        inputs = {k: v.to(device) for k, v in inputs.items()}
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
+        inputs = {k: v.to('cpu') for k, v in inputs.items()}
 
         # Generate answer with streaming
         streamer = TextStreamer(tokenizer, result_queue)
 
-        # Simulate thinking first by sending some initial dots
-        result_queue.put("Let me think about this...")
-
-        # Generate response with simpler parameters to avoid memory issues
-        try:
-            with torch.no_grad():  # Disable gradient calculation to save memory
-                model.generate(
-                    **inputs,
-                    max_new_tokens=256,  # Reduced from 512
-                    temperature=0.7,
-                    top_p=0.9,
-                    do_sample=True,
-                    streamer=streamer,
-                    num_beams=1,  # Reduced from 2
-                    repetition_penalty=1.2
-                )
-        except Exception as e:
-            logger.error(f"Model generation error: {str(e)}")
-            result_queue.put(f"\n\nI apologize, but I encountered an error while processing your request.")
+        # Generate response
+        model.generate(
+            **inputs,
+            max_new_tokens=512,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True,
+            streamer=streamer,
+            num_beams=1,
+            no_repeat_ngram_size=3
+        )
 
         # Signal generation is complete
         result_queue.put(None)
@@ -182,54 +96,42 @@ Let's approach the user's request step by step.
         # Signal generation is complete
         result_queue.put(None)
 
+# TextStreamer class for token-by-token generation
+class TextStreamer:
+    def __init__(self, tokenizer, queue):
+        self.tokenizer = tokenizer
+        self.queue = queue
+        self.current_tokens = []
+
+    def put(self, token_ids):
+        self.current_tokens.extend(token_ids.tolist())
+        text = self.tokenizer.decode(self.current_tokens, skip_special_tokens=True)
+        self.queue.put(text)
+
+    def end(self):
+        pass
+
 # API route for home page
 @app.route('/')
 def home():
-    return jsonify({"message": "AI Chat API is running!", "status": "online"})
-
-# Health check endpoint
-@app.route('/health')
-def health():
-    if model is None or tokenizer is None:
-        return jsonify({"status": "initializing"}), 503
-    return jsonify({"status": "healthy"})
+    return jsonify({"message": "AI Chat API is running!"})
 
 # API route for streaming chat responses
-@app.route('/chat', methods=['POST', 'GET'])
+@app.route('/chat', methods=['POST'])
 def chat():
-    # Check if models are loaded
-    if model is None or tokenizer is None:
-        return jsonify({"error": "Models are still initializing. Please try again shortly."}), 503
-
-    # Handle both POST JSON and GET query parameters for flexibility
-    if request.method == 'POST':
-        try:
-            data = request.get_json()
-            message = data.get("message", "")
-        except:
-            # If JSON parsing fails, try form data
-            message = request.form.get("message", "")
-    else:  # GET
-        message = request.args.get("message", "")
+    data = request.get_json()
+    message = data.get("message", "")
 
     if not message:
         return jsonify({"error": "Message is required"}), 400
 
     try:
         def generate():
-            # Signal the start of streaming with headers
-            yield "retry: 1000\n"
-            yield "event: message\n"
-
-            # Show thinking indicator
-            yield f"data: [Thinking...]\n\n"
-
             # Create a queue for communication between threads
             result_queue = queue.Queue()
 
             # Start thinking in a separate thread
             thread = threading.Thread(target=thinking_process, args=(message, result_queue))
-            thread.daemon = True  # Make thread die when main thread exits
             thread.start()
 
             # Stream results as they become available
@@ -245,8 +147,7 @@ def chat():
                     new_part = result[len(previous_text):]
                     previous_text = result
                     if new_part:
-                        yield f"data: {json.dumps(new_part)}\n\n"
-                        time.sleep(0.01)  # Small delay for more natural typing effect
+                        yield f"data: {new_part}\n\n"
 
                 except queue.Empty:
                     # Timeout occurred
@@ -255,15 +156,7 @@ def chat():
 
             yield "data: [DONE]\n\n"
 
-        return Response(
-            stream_with_context(generate()),
-            mimetype='text/event-stream',
-            headers={
-                'Cache-Control': 'no-cache',
-                'Connection': 'keep-alive',
-                'X-Accel-Buffering': 'no'  # Disable buffering for Nginx
-            }
-        )
+        return Response(stream_with_context(generate()), mimetype='text/event-stream')
 
     except Exception as e:
         logger.error(f"Error processing chat request: {str(e)}")
@@ -272,10 +165,6 @@ def chat():
 # Simple API for non-streaming chat (fallback)
 @app.route('/chat-simple', methods=['POST'])
 def chat_simple():
-    # Check if models are loaded
-    if model is None or tokenizer is None:
-        return jsonify({"error": "Models are still initializing. Please try again shortly."}), 503
-
     data = request.get_json()
     message = data.get("message", "")
 
@@ -283,29 +172,29 @@ def chat_simple():
         return jsonify({"error": "Message is required"}), 400
 
     try:
-        # Create prompt with system message (shorter version)
+        # Create prompt with system message
         prompt = f"""<|im_start|>system
-You are a helpful assistant.
+You are a helpful, friendly, and thoughtful AI assistant. Think carefully and provide informative, detailed responses.
 <|im_end|>
 <|im_start|>user
 {message}<|im_end|>
 <|im_start|>assistant
 """
 
-        # Handle inputs with reduced context
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-
-        # Generate answer with reduced parameters
-        with torch.no_grad():  # Disable gradient calculation
-            output = model.generate(
-                **inputs,
-                max_new_tokens=256,  # Reduced from 512
-                temperature=0.7,
-                top_p=0.9,
-                do_sample=True,
-                num_beams=1
-            )
+        # Handle inputs
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
+        inputs = {k: v.to('cpu') for k, v in inputs.items()}
+
+        # Generate answer
+        output = model.generate(
+            **inputs,
+            max_new_tokens=512,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True,
+            num_beams=1,
+            no_repeat_ngram_size=3
+        )
 
         # Decode and format answer
         answer = tokenizer.decode(output[0], skip_special_tokens=True)
@@ -322,16 +211,9 @@ You are a helpful assistant.
 
 if __name__ == "__main__":
     try:
-        # Start the Flask app in a separate thread
-        flask_thread = threading.Thread(target=lambda: app.run(host="0.0.0.0", port=7860))
-        flask_thread.daemon = True
-        flask_thread.start()
-
-        # Initialize models in the main thread
-        logger.info("Starting Flask application")
+        # Initialize models at startup
        initialize_models()
-
-        # Keep the main thread alive
-        flask_thread.join()
+        logger.info("Starting Flask application")
+        app.run(host="0.0.0.0", port=7860)
     except Exception as e:
         logger.critical(f"Failed to start application: {str(e)}")