mike23415 committed on
Commit
98ee9d3
·
verified ·
1 Parent(s): 4a9bfbe

Update app.py

Files changed (1)
  1. app.py +87 -46
app.py CHANGED
@@ -1,12 +1,11 @@
 import os
 import time
 import json
-import numpy as np
+import gc  # For garbage collection
 from pathlib import Path
 from flask import Flask, request, jsonify, Response
 from flask_cors import CORS
 import torch
-import gc  # For garbage collection

 # Create cache directory if not exists
 cache_dir = Path(os.getenv('TRANSFORMERS_CACHE', '/app/cache'))
@@ -16,8 +15,8 @@ app = Flask(__name__)
 CORS(app)  # Allow cross-origin requests

 # Model configuration
-# Use DeepSeek R1 Distill Qwen 7B model
-MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+# Use DeepSeek R1 Distill Qwen 1.5B model (much lighter than 7B)
+MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 MAX_NEW_TOKENS = 256
 DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

@@ -47,6 +46,7 @@ def load_model():
             bnb_4bit_use_double_quant=True
         )
     else:
+        # For CPU, we'll use a different optimization approach
         quantization_config = None

     # Load tokenizer
@@ -60,17 +60,40 @@ def load_model():
     hf_token = os.environ.get("HF_TOKEN")
     token_kwargs = {"token": hf_token} if hf_token else {}

-    # Load model with appropriate settings for the device
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_NAME,
-        cache_dir=str(cache_dir),
-        device_map="auto" if DEVICE == "cuda" else None,
-        torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
-        quantization_config=quantization_config,
-        low_cpu_mem_usage=True,
-        trust_remote_code=True,
-        **token_kwargs
-    )
+    # Additional memory optimization settings for low resource environments
+    if DEVICE == "cpu":
+        # Load model with 8-bit quantization for CPU
+        try:
+            # Try int8 quantization for CPU
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_NAME,
+                cache_dir=str(cache_dir),
+                load_in_8bit=True,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+                **token_kwargs
+            )
+        except Exception as e:
+            print(f"8-bit quantization failed, falling back to standard loading: {str(e)}")
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_NAME,
+                cache_dir=str(cache_dir),
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+                **token_kwargs
+            )
+    else:
+        # Load model with 4-bit quantization for CUDA
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME,
+            cache_dir=str(cache_dir),
+            device_map="auto",
+            torch_dtype=torch.float16,
+            quantization_config=quantization_config,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            **token_kwargs
+        )

     print("✅ Model loaded successfully!")
     return True
@@ -88,32 +111,25 @@ def stream_generator(prompt):
     # Thinking phases
     thinking_steps = [
         "🔍 Analyzing your question...",
-        "🧠 Accessing knowledge base...",
-        "💡 Formulating response...",
-        "📚 Verifying information..."
+        "🧠 Processing...",
+        "💡 Formulating response..."
     ]

-    # Stream thinking steps
+    # Stream thinking steps (fewer steps, faster timing for lighter model)
     for step in thinking_steps:
         yield json.dumps({"type": "thinking", "content": step}) + '\n'
-        time.sleep(0.8)  # Reduced timing for faster response
+        time.sleep(0.5)  # Reduced timing for faster response

     # Prepare streaming generation
     try:
-        # Format prompt for the model
-        if "mistral" in MODEL_NAME.lower():
-            formatted_prompt = f"<s>[INST] {prompt} [/INST]"
-        elif "deepseek" in MODEL_NAME.lower():
-            formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-        else:
-            formatted_prompt = prompt
+        # Format prompt for the model (DeepSeek specific)
+        formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

         inputs = tokenizer(formatted_prompt, return_tensors="pt")
         if DEVICE == "cuda":
             inputs = inputs.to("cuda")

-        # Use custom streaming implementation
-        # Start generation
+        # Use memory efficient approach
         with torch.no_grad():
             generated_ids = model.generate(
                 **inputs,
@@ -128,9 +144,9 @@ def stream_generator(prompt):
         # Get output sequence
         output_ids = generated_ids.sequences[0][len(inputs.input_ids[0]):]

-        # Stream in chunks for smoother experience
+        # Stream in slightly larger chunks for better performance
         full_output = ""
-        chunk_size = 3  # Number of tokens per chunk
+        chunk_size = 5  # Increased number of tokens per chunk
         for i in range(0, len(output_ids), chunk_size):
             chunk_ids = output_ids[i:i+chunk_size]
             chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
@@ -141,8 +157,8 @@ def stream_generator(prompt):
                 "content": chunk_text
             }) + '\n'

-            # Small delay for smoother streaming
-            time.sleep(0.05)
+            # Smaller delay for faster streaming
+            time.sleep(0.03)

     except Exception as e:
         import traceback
@@ -156,7 +172,7 @@ def stream_generator(prompt):
     # Signal completion
     yield json.dumps({"type": "complete"}) + '\n'

-    # Clean up memory
+    # Clean up memory aggressively
    if DEVICE == "cuda":
        torch.cuda.empty_cache()
    gc.collect()
@@ -192,13 +208,8 @@ def chat():
         return jsonify({"error": "Empty prompt"}), 400

     try:
-        # Format prompt for the model
-        if "mistral" in MODEL_NAME.lower():
-            formatted_prompt = f"<s>[INST] {prompt} [/INST]"
-        elif "deepseek" in MODEL_NAME.lower():
-            formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-        else:
-            formatted_prompt = prompt
+        # Format prompt for DeepSeek model
+        formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

         inputs = tokenizer(formatted_prompt, return_tensors="pt")
         if DEVICE == "cuda":
@@ -231,6 +242,14 @@ def chat():
 @app.route('/health', methods=['GET'])
 def health_check():
     model_loaded = tokenizer is not None and model is not None
+    memory_info = "N/A"
+
+    # Get memory usage stats
+    if torch.cuda.is_available():
+        memory_info = f"{torch.cuda.memory_allocated()/1024**2:.2f}MB / {torch.cuda.get_device_properties(0).total_memory/1024**2:.2f}MB"
+    else:
+        import psutil
+        memory_info = f"{psutil.virtual_memory().used/1024**3:.2f}GB / {psutil.virtual_memory().total/1024**3:.2f}GB"

     try:
         # Check if we need to load the model
@@ -241,34 +260,56 @@ def health_check():

     status = {
         "status": "ok" if model_loaded else "waiting",
+        "model": MODEL_NAME,
         "model_loaded": model_loaded,
         "device": DEVICE,
         "cache_dir": str(cache_dir),
         "max_tokens": MAX_NEW_TOKENS,
-        "memory_usage": f"{torch.cuda.memory_allocated()/1024**2:.2f}MB"
-        if torch.cuda.is_available() else "CPU"
+        "memory_usage": memory_info
     }
     return jsonify(status)

+@app.route('/unload', methods=['POST'])
+def unload_model():
+    """Endpoint to manually unload model and free memory"""
+    global model, tokenizer
+
+    if model is not None:
+        del model
+        model = None
+
+    if tokenizer is not None:
+        del tokenizer
+        tokenizer = None
+
+    # Force garbage collection
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    gc.collect()
+
+    return jsonify({"status": "Model unloaded", "memory_freed": True})
+
 @app.route('/')
 def home():
     return jsonify({
-        "service": "DeepSeek Chat API",
+        "service": "DeepSeek-1.5B Chat API",
         "status": "online",
         "endpoints": {
             "POST /chat": "Single-response chat",
             "POST /stream_chat": "Streaming chat with thinking steps",
-            "GET /health": "Service health check"
+            "GET /health": "Service health check",
+            "POST /unload": "Unload model to free memory"
         },
         "config": {
             "model": MODEL_NAME,
             "max_tokens": MAX_NEW_TOKENS,
+            "device": DEVICE,
             "cache_location": str(cache_dir)
         }
     })

 if __name__ == '__main__':
-    # Load model at startup - only if explicitly requested
+    # Load model at startup only if explicitly requested
     if os.getenv('PRELOAD_MODEL', 'false').lower() == 'true':
         load_model()
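
For reference, a minimal client sketch for the endpoints touched in this commit. The NDJSON event types ("thinking", "content", "complete") and the /stream_chat, /health, and /unload routes come from the diff above; the base URL and the "prompt" request field are assumptions, since app.run() and the request parsing are outside this change.

# Hypothetical client sketch, not part of the commit. Assumes the service runs at
# BASE_URL and that /stream_chat and /chat accept JSON like {"prompt": "..."}.
import json
import requests

BASE_URL = "http://localhost:7860"  # assumed; adjust to your deployment

def stream_chat(prompt: str) -> str:
    """POST to /stream_chat and consume the newline-delimited JSON stream."""
    reply = []
    with requests.post(f"{BASE_URL}/stream_chat",
                       json={"prompt": prompt}, stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            if not line:
                continue
            event = json.loads(line)  # one JSON object per line
            if event["type"] == "thinking":
                print(f"[thinking] {event['content']}")
            elif event["type"] == "content":
                reply.append(event["content"])
            elif event["type"] == "complete":
                break
    return "".join(reply)

if __name__ == "__main__":
    print(stream_chat("What is 8-bit quantization?"))
    print(requests.get(f"{BASE_URL}/health").json())   # model / memory status
    print(requests.post(f"{BASE_URL}/unload").json())  # free model memory (new in this commit)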