mike23415 committed on
Commit 45ef073 · verified · 1 Parent(s): fdb5001

Update app.py

Files changed (1):
  1. app.py (+77, -137)
app.py CHANGED
@@ -1,135 +1,105 @@
  import os
  import time
  import json
- import gc  # For garbage collection
  from pathlib import Path
  from flask import Flask, request, jsonify, Response
  from flask_cors import CORS
  import torch

- # Create cache directory if not exists
  cache_dir = Path(os.getenv('TRANSFORMERS_CACHE', '/app/cache'))
  cache_dir.mkdir(parents=True, exist_ok=True)

  app = Flask(__name__)
- CORS(app)  # Allow cross-origin requests

- # Model configuration
- # Use DeepSeek R1 Distill Qwen 1.5B model (much lighter than 7B)
  MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
  MAX_NEW_TOKENS = 256
- DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

- # Initialize model variables
  tokenizer = None
  model = None

  def load_model():
-     """Load model on first request to save memory at startup"""
      global tokenizer, model
-
      if tokenizer is not None and model is not None:
          return True
-
      try:
          from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-         print(f"Loading model {MODEL_NAME}...")
-         print(f"Using device: {DEVICE}")
-         print(f"Cache directory: {cache_dir}")
-
-         # Use 4-bit quantization for memory efficiency if on CUDA
          if DEVICE == "cuda":
-             quantization_config = BitsAndBytesConfig(
                  load_in_4bit=True,
                  bnb_4bit_compute_dtype=torch.float16,
                  bnb_4bit_quant_type="nf4",
                  bnb_4bit_use_double_quant=True
              )
-         else:
-             # For CPU, we'll use a different optimization approach
-             quantization_config = None
-
-         # Load tokenizer
-         tokenizer = AutoTokenizer.from_pretrained(
-             MODEL_NAME,
-             cache_dir=str(cache_dir),
-             trust_remote_code=True
-         )
-
-         # Configure token if HF_TOKEN is set
-         hf_token = os.environ.get("HF_TOKEN")
-         token_kwargs = {"token": hf_token} if hf_token else {}
-
-         # Additional memory optimization settings for low resource environments
-         if DEVICE == "cpu":
-             # Load model with 8-bit quantization for CPU
-             try:
-                 # Try int8 quantization for CPU
-                 model = AutoModelForCausalLM.from_pretrained(
-                     MODEL_NAME,
-                     cache_dir=str(cache_dir),
-                     load_in_8bit=True,
-                     low_cpu_mem_usage=True,
-                     trust_remote_code=True,
-                     **token_kwargs
-                 )
-             except Exception as e:
-                 print(f"8-bit quantization failed, falling back to standard loading: {str(e)}")
-                 model = AutoModelForCausalLM.from_pretrained(
-                     MODEL_NAME,
-                     cache_dir=str(cache_dir),
-                     low_cpu_mem_usage=True,
-                     trust_remote_code=True,
-                     **token_kwargs
-                 )
-         else:
-             # Load model with 4-bit quantization for CUDA
              model = AutoModelForCausalLM.from_pretrained(
                  MODEL_NAME,
                  cache_dir=str(cache_dir),
                  device_map="auto",
                  torch_dtype=torch.float16,
-                 quantization_config=quantization_config,
                  low_cpu_mem_usage=True,
                  trust_remote_code=True,
                  **token_kwargs
              )
-
-         print("✅ Model loaded successfully!")
          return True
      except Exception as e:
-         print(f"❌ Model loading failed: {str(e)}")
          return False

  def stream_generator(prompt):
-     """Generator function for streaming response with thinking steps"""
-     # Ensure model is loaded
      if not load_model():
          yield json.dumps({"type": "error", "content": "Model not loaded"}) + '\n'
          return
-
-     # Thinking phases
      thinking_steps = [
          "🔍 Analyzing your question...",
          "🧠 Processing...",
          "💡 Formulating response..."
      ]
-
-     # Stream thinking steps (fewer steps, faster timing for lighter model)
      for step in thinking_steps:
          yield json.dumps({"type": "thinking", "content": step}) + '\n'
-         time.sleep(0.5)  # Reduced timing for faster response
-
-     # Prepare streaming generation
      try:
-         # Format prompt for the model (DeepSeek specific)
          formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-
          inputs = tokenizer(formatted_prompt, return_tensors="pt")
          if DEVICE == "cuda":
              inputs = inputs.to("cuda")
-
-         # Use memory efficient approach
          with torch.no_grad():
              generated_ids = model.generate(
                  **inputs,
@@ -139,40 +109,28 @@ def stream_generator(prompt):
                  do_sample=True,
                  pad_token_id=tokenizer.eos_token_id,
                  return_dict_in_generate=True,
-                 output_scores=False)
-
-         # Get output sequence
          output_ids = generated_ids.sequences[0][len(inputs.input_ids[0]):]
-
-         # Stream in slightly larger chunks for better performance
          full_output = ""
-         chunk_size = 5  # Increased number of tokens per chunk
          for i in range(0, len(output_ids), chunk_size):
-             chunk_ids = output_ids[i:i+chunk_size]
              chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
              full_output += chunk_text
-
-             yield json.dumps({
-                 "type": "answer",
-                 "content": chunk_text
-             }) + '\n'
-
-             # Smaller delay for faster streaming
              time.sleep(0.03)
-
      except Exception as e:
          import traceback
          error_details = f"Error: {str(e)}\n{traceback.format_exc()}"
          print(error_details)
-         yield json.dumps({
-             "type": "error",
-             "content": f"Generation error: {str(e)}"
-         }) + '\n'
-
-     # Signal completion
      yield json.dumps({"type": "complete"}) + '\n'
-
-     # Clean up memory aggressively
      if DEVICE == "cuda":
          torch.cuda.empty_cache()
      gc.collect()
@@ -181,40 +139,37 @@ def stream_generator(prompt):
  def stream_chat():
      data = request.get_json()
      prompt = data.get('prompt', '').strip()
-
      if not prompt:
          return jsonify({"error": "Empty prompt"}), 400
-
      return Response(
          stream_generator(prompt),
          mimetype='text/event-stream',
          headers={
              'Cache-Control': 'no-cache',
-             'X-Accel-Buffering': 'no',  # Prevent Nginx buffering
              'Connection': 'keep-alive'
          }
      )

  @app.route('/chat', methods=['POST'])
  def chat():
-     # Ensure model is loaded
      if not load_model():
          return jsonify({"error": "Model failed to load"}), 500
-
      data = request.get_json()
      prompt = data.get('prompt', '').strip()
-
      if not prompt:
          return jsonify({"error": "Empty prompt"}), 400
-
      try:
-         # Format prompt for DeepSeek model
          formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-
          inputs = tokenizer(formatted_prompt, return_tensors="pt")
          if DEVICE == "cuda":
              inputs = inputs.to("cuda")
-
          with torch.no_grad():
              outputs = model.generate(
                  **inputs,
@@ -222,43 +177,37 @@ def chat():
                  temperature=0.7,
                  top_p=0.9,
                  do_sample=True,
-                 pad_token_id=tokenizer.eos_token_id)
-
          response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
-
-         # Clean up memory
          if DEVICE == "cuda":
              torch.cuda.empty_cache()
          gc.collect()
-
          return jsonify({"response": response})
-
      except Exception as e:
          import traceback
-         error_details = f"Error: {str(e)}\n{traceback.format_exc()}"
-         print(error_details)
          return jsonify({"error": str(e)}), 500

  @app.route('/health', methods=['GET'])
  def health_check():
      model_loaded = tokenizer is not None and model is not None
      memory_info = "N/A"
-
-     # Get memory usage stats
      if torch.cuda.is_available():
          memory_info = f"{torch.cuda.memory_allocated()/1024**2:.2f}MB / {torch.cuda.get_device_properties(0).total_memory/1024**2:.2f}MB"
      else:
          import psutil
          memory_info = f"{psutil.virtual_memory().used/1024**3:.2f}GB / {psutil.virtual_memory().total/1024**3:.2f}GB"
-
-     try:
-         # Check if we need to load the model
-         if not model_loaded and request.args.get('load') == 'true':
-             model_loaded = load_model()
-     except Exception as e:
-         print(f"Health check error: {str(e)}")
-
-     status = {
          "status": "ok" if model_loaded else "waiting",
          "model": MODEL_NAME,
          "model_loaded": model_loaded,
@@ -266,27 +215,20 @@ def health_check():
          "cache_dir": str(cache_dir),
          "max_tokens": MAX_NEW_TOKENS,
          "memory_usage": memory_info
-     }
-     return jsonify(status)

  @app.route('/unload', methods=['POST'])
  def unload_model():
-     """Endpoint to manually unload model and free memory"""
      global model, tokenizer
-
      if model is not None:
          del model
          model = None
-
      if tokenizer is not None:
          del tokenizer
          tokenizer = None
-
-     # Force garbage collection
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
      gc.collect()
-
      return jsonify({"status": "Model unloaded", "memory_freed": True})

  @app.route('/')
@@ -296,9 +238,9 @@ def home():
          "status": "online",
          "endpoints": {
              "POST /chat": "Single-response chat",
-             "POST /stream_chat": "Streaming chat with thinking steps",
              "GET /health": "Service health check",
-             "POST /unload": "Unload model to free memory"
          },
          "config": {
              "model": MODEL_NAME,
@@ -309,9 +251,7 @@ def home():
      })

  if __name__ == '__main__':
-     # Load model at startup only if explicitly requested
      if os.getenv('PRELOAD_MODEL', 'false').lower() == 'true':
          load_model()
-
      port = int(os.environ.get("PORT", 5000))
-     app.run(host='0.0.0.0', port=port)
 
  import os
  import time
  import json
+ import gc
  from pathlib import Path
  from flask import Flask, request, jsonify, Response
  from flask_cors import CORS
  import torch

+ # Cache and model settings
  cache_dir = Path(os.getenv('TRANSFORMERS_CACHE', '/app/cache'))
  cache_dir.mkdir(parents=True, exist_ok=True)

  app = Flask(__name__)
+ CORS(app)

  MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
  MAX_NEW_TOKENS = 256
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  tokenizer = None
  model = None

  def load_model():
      global tokenizer, model
+
      if tokenizer is not None and model is not None:
          return True
+
      try:
          from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+         print(f"Loading model: {MODEL_NAME}")
+         print(f"Device: {DEVICE}")
+
+         # HF auth token if needed
+         hf_token = os.environ.get("HF_TOKEN")
+         token_kwargs = {"token": hf_token} if hf_token else {}
+
+         tokenizer = AutoTokenizer.from_pretrained(
+             MODEL_NAME,
+             cache_dir=str(cache_dir),
+             trust_remote_code=True,
+             **token_kwargs
+         )
+
          if DEVICE == "cuda":
+             quant_config = BitsAndBytesConfig(
                  load_in_4bit=True,
                  bnb_4bit_compute_dtype=torch.float16,
                  bnb_4bit_quant_type="nf4",
                  bnb_4bit_use_double_quant=True
              )
              model = AutoModelForCausalLM.from_pretrained(
                  MODEL_NAME,
                  cache_dir=str(cache_dir),
+                 trust_remote_code=True,
                  device_map="auto",
+                 quantization_config=quant_config,
                  torch_dtype=torch.float16,
                  low_cpu_mem_usage=True,
+                 **token_kwargs
+             )
+         else:
+             # CPU: no quantization_config; use float16 if possible
+             model = AutoModelForCausalLM.from_pretrained(
+                 MODEL_NAME,
+                 cache_dir=str(cache_dir),
                  trust_remote_code=True,
+                 torch_dtype=torch.float16,
+                 low_cpu_mem_usage=True,
                  **token_kwargs
              )
+
+         print("✅ Model loaded successfully")
          return True
+
      except Exception as e:
+         print(f"❌ Failed to load model: {e}")
          return False

  def stream_generator(prompt):
      if not load_model():
          yield json.dumps({"type": "error", "content": "Model not loaded"}) + '\n'
          return
+
      thinking_steps = [
          "🔍 Analyzing your question...",
          "🧠 Processing...",
          "💡 Formulating response..."
      ]
+
      for step in thinking_steps:
          yield json.dumps({"type": "thinking", "content": step}) + '\n'
+         time.sleep(0.5)
+
      try:
          formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
          inputs = tokenizer(formatted_prompt, return_tensors="pt")
          if DEVICE == "cuda":
              inputs = inputs.to("cuda")
+
          with torch.no_grad():
              generated_ids = model.generate(
                  **inputs,

                  do_sample=True,
                  pad_token_id=tokenizer.eos_token_id,
                  return_dict_in_generate=True,
+                 output_scores=False
+             )
+
          output_ids = generated_ids.sequences[0][len(inputs.input_ids[0]):]
          full_output = ""
+         chunk_size = 5
+
          for i in range(0, len(output_ids), chunk_size):
+             chunk_ids = output_ids[i:i + chunk_size]
              chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
              full_output += chunk_text
+             yield json.dumps({"type": "answer", "content": chunk_text}) + '\n'
              time.sleep(0.03)
+
      except Exception as e:
          import traceback
          error_details = f"Error: {str(e)}\n{traceback.format_exc()}"
          print(error_details)
+         yield json.dumps({"type": "error", "content": str(e)}) + '\n'
+
      yield json.dumps({"type": "complete"}) + '\n'
+
      if DEVICE == "cuda":
          torch.cuda.empty_cache()
      gc.collect()

  def stream_chat():
      data = request.get_json()
      prompt = data.get('prompt', '').strip()
+
      if not prompt:
          return jsonify({"error": "Empty prompt"}), 400
+
      return Response(
          stream_generator(prompt),
          mimetype='text/event-stream',
          headers={
              'Cache-Control': 'no-cache',
+             'X-Accel-Buffering': 'no',
              'Connection': 'keep-alive'
          }
      )

  @app.route('/chat', methods=['POST'])
  def chat():
      if not load_model():
          return jsonify({"error": "Model failed to load"}), 500
+
      data = request.get_json()
      prompt = data.get('prompt', '').strip()
+
      if not prompt:
          return jsonify({"error": "Empty prompt"}), 400
+
      try:
          formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
          inputs = tokenizer(formatted_prompt, return_tensors="pt")
          if DEVICE == "cuda":
              inputs = inputs.to("cuda")
+
          with torch.no_grad():
              outputs = model.generate(
                  **inputs,

                  temperature=0.7,
                  top_p=0.9,
                  do_sample=True,
+                 pad_token_id=tokenizer.eos_token_id
+             )
+
          response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
+
          if DEVICE == "cuda":
              torch.cuda.empty_cache()
          gc.collect()
+
          return jsonify({"response": response})
+
      except Exception as e:
          import traceback
+         print(f"Error: {e}\n{traceback.format_exc()}")
          return jsonify({"error": str(e)}), 500

  @app.route('/health', methods=['GET'])
  def health_check():
      model_loaded = tokenizer is not None and model is not None
      memory_info = "N/A"
+
      if torch.cuda.is_available():
          memory_info = f"{torch.cuda.memory_allocated()/1024**2:.2f}MB / {torch.cuda.get_device_properties(0).total_memory/1024**2:.2f}MB"
      else:
          import psutil
          memory_info = f"{psutil.virtual_memory().used/1024**3:.2f}GB / {psutil.virtual_memory().total/1024**3:.2f}GB"
+
+     if not model_loaded and request.args.get('load') == 'true':
+         model_loaded = load_model()
+
+     return jsonify({
          "status": "ok" if model_loaded else "waiting",
          "model": MODEL_NAME,
          "model_loaded": model_loaded,

          "cache_dir": str(cache_dir),
          "max_tokens": MAX_NEW_TOKENS,
          "memory_usage": memory_info
+     })

  @app.route('/unload', methods=['POST'])
  def unload_model():
      global model, tokenizer
      if model is not None:
          del model
          model = None
      if tokenizer is not None:
          del tokenizer
          tokenizer = None
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
      gc.collect()
      return jsonify({"status": "Model unloaded", "memory_freed": True})

  @app.route('/')

          "status": "online",
          "endpoints": {
              "POST /chat": "Single-response chat",
+             "POST /stream_chat": "Streaming chat",
              "GET /health": "Service health check",
+             "POST /unload": "Unload model"
          },
          "config": {
              "model": MODEL_NAME,

      })

  if __name__ == '__main__':
      if os.getenv('PRELOAD_MODEL', 'false').lower() == 'true':
          load_model()
      port = int(os.environ.get("PORT", 5000))
+     app.run(host='0.0.0.0', port=port)
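
The updated app.py exposes a blocking POST /chat, a newline-delimited JSON stream at POST /stream_chat (each line is an object with "type" and "content" keys, ending with a "complete" event), and a GET /health probe that can lazily load the model via ?load=true. Below is a minimal client sketch, not part of this commit, assuming a local deployment on the default PORT (5000) and the third-party requests package; BASE_URL and the stream_chat helper are illustrative names, not part of the repository.

# Hypothetical client sketch for the endpoints defined in app.py above.
# Assumptions: server reachable at http://localhost:5000, `requests` installed.
import json
import requests

BASE_URL = "http://localhost:5000"  # assumption: local deployment, default PORT

def stream_chat(prompt: str) -> str:
    """Consume the newline-delimited JSON stream from POST /stream_chat."""
    answer = []
    with requests.post(f"{BASE_URL}/stream_chat", json={"prompt": prompt}, stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            event = json.loads(line)
            if event["type"] == "thinking":
                print(f"[thinking] {event['content']}")
            elif event["type"] == "answer":
                answer.append(event["content"])
            elif event["type"] == "error":
                raise RuntimeError(event["content"])
            elif event["type"] == "complete":
                break
    return "".join(answer)

if __name__ == "__main__":
    # Optional warm-up: /health?load=true triggers load_model() on the server.
    print(requests.get(f"{BASE_URL}/health", params={"load": "true"}).json())
    print(stream_chat("Explain what a distilled language model is."))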