mike23415 committed · verified
Commit 5d565fc · 1 Parent(s): ec804b3

Update app.py

Files changed (1):
  1. app.py (+112 -64)
app.py CHANGED
@@ -7,6 +7,7 @@ import logging
 import threading
 import queue
 import json
+import gc
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
 # Set up logging
@@ -17,6 +18,9 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
+# Print startup banner for visibility in logs
+print("\n===== Application Startup at", time.strftime("%Y-%m-%d %H:%M:%S"), "=====\n")
+
 # Fix caching issue on Hugging Face Spaces
 os.environ["TRANSFORMERS_CACHE"] = "/tmp"
 os.environ["HF_HOME"] = "/tmp"
@@ -32,11 +36,36 @@ logger.info(f"Using device: {device}")
 tokenizer = None
 model = None
 
+# Check available system resources
+def log_system_info():
+    # Basic system info
+    logger.info(f"Python version: {os.sys.version}")
+
+    # CPU info
+    import multiprocessing
+    logger.info(f"CPU cores: {multiprocessing.cpu_count()}")
+
+    # Memory info
+    try:
+        import psutil
+        mem = psutil.virtual_memory()
+        logger.info(f"Memory: Total={mem.total/1e9:.1f}GB, Available={mem.available/1e9:.1f}GB ({mem.percent}% used)")
+    except ImportError:
+        logger.info("psutil not installed, skipping detailed memory info")
+
+    # PyTorch info
+    logger.info(f"PyTorch version: {torch.__version__}")
+    logger.info(f"CUDA available: {torch.cuda.is_available()}")
+    if torch.cuda.is_available():
+        logger.info(f"CUDA version: {torch.version.cuda}")
+        logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
+
 # Initialize models once on startup
 def initialize_models():
     global tokenizer, model
     try:
         logger.info("Loading language model...")
+        log_system_info()
 
         # You can change the model here if needed
         model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # Good balance of quality and speed for CPU
@@ -45,17 +74,25 @@ def initialize_models():
         logger.info(f"Loading tokenizer: {model_name}")
         tokenizer = AutoTokenizer.from_pretrained(
             model_name,
-            use_fast=True  # Use the fast tokenizers when available
+            use_fast=True,  # Use the fast tokenizers when available
+            local_files_only=False  # Allow downloading if not cached
         )
 
+        # Free up memory before loading model
+        gc.collect()
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
         # Load model with optimizations for CPU
         logger.info(f"Loading model: {model_name}")
+
+        # Set lower precision for CPU to reduce memory usage
+        torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
-            torch_dtype=torch.float16,  # Use float16 for lower memory
-            device_map="cpu",  # Explicitly set to CPU
+            torch_dtype=torch_dtype,
             low_cpu_mem_usage=True,  # Optimize memory loading
-            offload_folder="offload"  # Use disk offloading if needed
+            device_map="auto"  # Let the system decide optimal device mapping
         )
 
         # Handle padding tokens
@@ -74,6 +111,21 @@ def initialize_models():
         logger.error(f"Error initializing models: {str(e)}")
         raise
 
+# TextStreamer class for token-by-token generation
+class TextStreamer:
+    def __init__(self, tokenizer, queue):
+        self.tokenizer = tokenizer
+        self.queue = queue
+        self.current_tokens = []
+
+    def put(self, token_ids):
+        self.current_tokens.extend(token_ids.tolist())
+        text = self.tokenizer.decode(self.current_tokens, skip_special_tokens=True)
+        self.queue.put(text)
+
+    def end(self):
+        pass
+
 # Function to simulate "thinking" process
 def thinking_process(message, result_queue):
     """
@@ -84,19 +136,10 @@ def thinking_process(message, result_queue):
     # Simulate explicit thinking stage
     logger.info(f"Thinking about: '{message}'")
 
-    # Pause to simulate deeper thinking (helps with more complex queries)
-    time.sleep(1)
-
     # Create thoughtful prompt with system message and thinking instructions
     prompt = f"""<|im_start|>system
 You are a helpful, friendly, and thoughtful AI assistant.
-Let's approach the user's request step by step:
-1. First, understand what the user is really asking
-2. Consider the key aspects we need to address
-3. Think about the best way to structure the response
-4. Provide clear, accurate information in a conversational tone
-
-Always think carefully before responding, consider different angles, and provide thoughtful, detailed answers.
+Let's approach the user's request step by step.
 <|im_end|>
 <|im_start|>user
 {message}<|im_end|>
@@ -104,30 +147,28 @@ Always think carefully before responding, consider different angles, and provide
 """
 
     # Handle inputs
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
-    inputs = {k: v.to('cpu') for k, v in inputs.items()}
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
+    inputs = {k: v.to(device) for k, v in inputs.items()}
 
     # Generate answer with streaming
     streamer = TextStreamer(tokenizer, result_queue)
 
     # Simulate thinking first by sending some initial dots
    result_queue.put("Let me think about this...")
-    time.sleep(0.5)
 
-    # Generate response - we use a temperature of 0.7 for more thoughtful outputs
-    # and top_p for nucleus sampling to avoid repetitive or generic responses
+    # Generate response with simpler parameters to avoid memory issues
    try:
-        model.generate(
-            **inputs,
-            max_new_tokens=512,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True,
-            streamer=streamer,
-            num_beams=2,  # Using 2 beams helps with coherence
-            no_repeat_ngram_size=3,
-            repetition_penalty=1.2  # Discourages token repetition
-        )
+        with torch.no_grad():  # Disable gradient calculation to save memory
+            model.generate(
+                **inputs,
+                max_new_tokens=256,  # Reduced from 512
+                temperature=0.7,
+                top_p=0.9,
+                do_sample=True,
+                streamer=streamer,
+                num_beams=1,  # Reduced from 2
+                repetition_penalty=1.2
+            )
    except Exception as e:
        logger.error(f"Model generation error: {str(e)}")
        result_queue.put(f"\n\nI apologize, but I encountered an error while processing your request.")
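Note on the streaming path above: thinking_process() enqueues text through the custom TextStreamer, which decodes the accumulated tokens on every put() and pushes the full text so far onto result_queue, then signals completion with None; the /chat route, whose body sits outside these hunks, can then drain the queue. Below is a minimal, self-contained sketch of that queue hand-off, with a stub producer standing in for model.generate(); all names in the sketch are illustrative and not part of app.py.

# Minimal sketch of the producer/consumer hand-off (illustrative names only;
# fake_thinking_process stands in for thinking_process + model.generate).
import queue
import threading
import time


def fake_thinking_process(message, result_queue):
    # Producer: emit partial text, then the None sentinel, mirroring how
    # TextStreamer enqueues the full decoded text so far on each put().
    result_queue.put("Let me think about this...")
    text = ""
    for word in ["The", " answer", " to", f" '{message}'", " goes", " here."]:
        time.sleep(0.1)          # stand-in for token-by-token generation
        text += word
        result_queue.put(text)   # full text so far, like the real streamer
    result_queue.put(None)       # completion sentinel


if __name__ == "__main__":
    q = queue.Queue()
    threading.Thread(target=fake_thinking_process, args=("What is 2+2?", q)).start()

    # Consumer: drain the queue until the sentinel, as a streaming route would.
    last = ""
    while True:
        item = q.get()
        if item is None:
            break
        last = item
    print(last)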
 
@@ -141,29 +182,25 @@ Always think carefully before responding, consider different angles, and provide
     # Signal generation is complete
     result_queue.put(None)
 
-# TextStreamer class for token-by-token generation
-class TextStreamer:
-    def __init__(self, tokenizer, queue):
-        self.tokenizer = tokenizer
-        self.queue = queue
-        self.current_tokens = []
-
-    def put(self, token_ids):
-        self.current_tokens.extend(token_ids.tolist())
-        text = self.tokenizer.decode(self.current_tokens, skip_special_tokens=True)
-        self.queue.put(text)
-
-    def end(self):
-        pass
-
 # API route for home page
 @app.route('/')
 def home():
-    return jsonify({"message": "AI Chat API is running!"})
+    return jsonify({"message": "AI Chat API is running!", "status": "online"})
+
+# Health check endpoint
+@app.route('/health')
+def health():
+    if model is None or tokenizer is None:
+        return jsonify({"status": "initializing"}), 503
+    return jsonify({"status": "healthy"})
 
 # API route for streaming chat responses
 @app.route('/chat', methods=['POST', 'GET'])
 def chat():
+    # Check if models are loaded
+    if model is None or tokenizer is None:
+        return jsonify({"error": "Models are still initializing. Please try again shortly."}), 503
+
     # Handle both POST JSON and GET query parameters for flexibility
     if request.method == 'POST':
         try:
@@ -235,6 +272,10 @@ def chat():
 # Simple API for non-streaming chat (fallback)
 @app.route('/chat-simple', methods=['POST'])
 def chat_simple():
+    # Check if models are loaded
+    if model is None or tokenizer is None:
+        return jsonify({"error": "Models are still initializing. Please try again shortly."}), 503
+
     data = request.get_json()
     message = data.get("message", "")
 
@@ -242,29 +283,29 @@ def chat_simple():
         return jsonify({"error": "Message is required"}), 400
 
     try:
-        # Create prompt with system message
+        # Create prompt with system message (shorter version)
         prompt = f"""<|im_start|>system
-You are a helpful, friendly, and thoughtful AI assistant. Think carefully and provide informative, detailed responses.
+You are a helpful assistant.
 <|im_end|>
 <|im_start|>user
 {message}<|im_end|>
 <|im_start|>assistant
 """
 
-        # Handle inputs
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
-        inputs = {k: v.to('cpu') for k, v in inputs.items()}
+        # Handle inputs with reduced context
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
+        inputs = {k: v.to(device) for k, v in inputs.items()}
 
-        # Generate answer
-        output = model.generate(
-            **inputs,
-            max_new_tokens=512,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True,
-            num_beams=1,
-            no_repeat_ngram_size=3
-        )
+        # Generate answer with reduced parameters
+        with torch.no_grad():  # Disable gradient calculation
+            output = model.generate(
+                **inputs,
+                max_new_tokens=256,  # Reduced from 512
+                temperature=0.7,
+                top_p=0.9,
+                do_sample=True,
+                num_beams=1
+            )
 
         # Decode and format answer
         answer = tokenizer.decode(output[0], skip_special_tokens=True)
@@ -281,9 +322,16 @@ You are a helpful, friendly, and thoughtful AI assistant. Think carefully and pr
 
 if __name__ == "__main__":
     try:
-        # Initialize models at startup
-        initialize_models()
+        # Start the Flask app in a separate thread
+        flask_thread = threading.Thread(target=lambda: app.run(host="0.0.0.0", port=7860))
+        flask_thread.daemon = True
+        flask_thread.start()
+
+        # Initialize models in the main thread
         logger.info("Starting Flask application")
-        app.run(host="0.0.0.0", port=7860)
+        initialize_models()
+
+        # Keep the main thread alive
+        flask_thread.join()
     except Exception as e:
         logger.critical(f"Failed to start application: {str(e)}")
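The commit also adds readiness guards around the HTTP surface (the /health endpoint and the 503 checks in /chat and /chat-simple). The sketch below is a hedged client-side example of exercising them: it assumes the app is reachable at http://localhost:7860 (the port used by app.run above) and that the third-party requests package is installed; the exact JSON schema returned by /chat-simple is not visible in this diff, so the body is printed as-is.

# Hedged usage sketch, not part of app.py.
import requests

BASE = "http://localhost:7860"

# /health returns 503 {"status": "initializing"} until the model is loaded,
# then 200 {"status": "healthy"}.
health = requests.get(f"{BASE}/health")
print("health:", health.status_code, health.json())

# Non-streaming fallback: requires a JSON body with a "message" field;
# returns 400 if the field is missing and 503 while the model is initializing.
reply = requests.post(f"{BASE}/chat-simple", json={"message": "Hello!"})
print("chat-simple:", reply.status_code, reply.json())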