mike23415 committed
Commit 2f665a8 Β· verified Β· 1 parent: bedd56d

Update app.py

Files changed (1)
  1. app.py +132 -64
app.py CHANGED
@@ -5,44 +5,65 @@ import numpy as np
 from pathlib import Path
 from flask import Flask, request, jsonify, Response
 from flask_cors import CORS
-from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
 import torch
-
-# Verify numpy version
-assert np.__version__.startswith('1.'), f"Invalid numpy version {np.__version__} - must be 1.x series"
+import gc  # For garbage collection
 
 # Create cache directory if not exists
 cache_dir = Path(os.getenv('TRANSFORMERS_CACHE', '/app/cache'))
 cache_dir.mkdir(parents=True, exist_ok=True)
 
 app = Flask(__name__)
-CORS(app)
+CORS(app)  # Allow cross-origin requests
 
 # Model configuration
 MODEL_NAME = "deepseek-ai/deepseek-r1-6b-chat"
 MAX_NEW_TOKENS = 256
-DEVICE = "cpu"
+DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
 
-# Initialize model
-try:
-    tokenizer = AutoTokenizer.from_pretrained(
-        MODEL_NAME,
-        cache_dir=str(cache_dir)
-    )
-
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_NAME,
-        cache_dir=str(cache_dir),
-        device_map="auto",
-        torch_dtype=torch.float32,
-        low_cpu_mem_usage=True)
-    print("Model loaded successfully!")
-except Exception as e:
-    print(f"Model loading failed: {str(e)}")
-    model = None
+# Initialize model variables
+tokenizer = None
+model = None
+
+def load_model():
+    """Load model on first request to save memory at startup"""
+    global tokenizer, model
+
+    if tokenizer is not None and model is not None:
+        return True
+
+    try:
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+        print(f"Loading model {MODEL_NAME}...")
+        print(f"Using device: {DEVICE}")
+        print(f"Cache directory: {cache_dir}")
+
+        # Load tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(
+            MODEL_NAME,
+            cache_dir=str(cache_dir)
+        )
+
+        # Load model with low memory settings
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME,
+            cache_dir=str(cache_dir),
+            device_map="auto" if DEVICE == "cuda" else None,
+            torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
+            low_cpu_mem_usage=True)
+
+        print("βœ… Model loaded successfully!")
+        return True
+    except Exception as e:
+        print(f"❌ Model loading failed: {str(e)}")
+        return False
 
 def stream_generator(prompt):
     """Generator function for streaming response with thinking steps"""
+    # Ensure model is loaded
+    if not load_model():
+        yield json.dumps({"type": "error", "content": "Model not loaded"}) + '\n'
+        return
+
     # Thinking phases
     thinking_steps = [
         "πŸ” Analyzing your question...",
@@ -54,48 +75,65 @@ def stream_generator(prompt):
     # Stream thinking steps
     for step in thinking_steps:
        yield json.dumps({"type": "thinking", "content": step}) + '\n'
-        time.sleep(1.5)  # Simulate processing time
+        time.sleep(0.8)  # Reduced timing for faster response
 
     # Prepare streaming generation
-    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
-    streamer = TextStreamer(tokenizer, skip_prompt=True)
-
-    # Generate response chunks
     try:
-        generated_ids = model.generate(
-            **inputs,
-            max_new_tokens=MAX_NEW_TOKENS,
-            streamer=streamer,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True,
-            pad_token_id=tokenizer.eos_token_id)
+        inputs = tokenizer(prompt, return_tensors="pt")
+        if DEVICE == "cuda":
+            inputs = inputs.to("cuda")
 
-        # Stream generated text
-        full_response = ""
-        for token_ids in generated_ids:
-            chunk = tokenizer.decode(token_ids, skip_special_tokens=True)
-            new_content = chunk[len(full_response):]
-            if new_content.strip():
-                full_response = chunk
-                yield json.dumps({
-                    "type": "answer",
-                    "content": new_content
-                }) + '\n'
-
+        # Use custom streaming implementation
+        # Start generation
+        with torch.no_grad():
+            generated_ids = model.generate(
+                **inputs,
+                max_new_tokens=MAX_NEW_TOKENS,
+                temperature=0.7,
+                top_p=0.9,
+                do_sample=True,
+                pad_token_id=tokenizer.eos_token_id,
+                return_dict_in_generate=True,
+                output_scores=False)
+
+        # Get output sequence
+        output_ids = generated_ids.sequences[0][len(inputs.input_ids[0]):]
+
+        # Stream in chunks for smoother experience
+        full_output = ""
+        chunk_size = 3  # Number of tokens per chunk
+        for i in range(0, len(output_ids), chunk_size):
+            chunk_ids = output_ids[i:i+chunk_size]
+            chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
+            full_output += chunk_text
+
+            yield json.dumps({
+                "type": "answer",
+                "content": chunk_text
+            }) + '\n'
+
+            # Small delay for smoother streaming
+            time.sleep(0.05)
+
     except Exception as e:
+        import traceback
+        error_details = f"Error: {str(e)}\n{traceback.format_exc()}"
+        print(error_details)
         yield json.dumps({
             "type": "error",
             "content": f"Generation error: {str(e)}"
         }) + '\n'
 
+    # Signal completion
     yield json.dumps({"type": "complete"}) + '\n'
+
+    # Clean up memory
+    if DEVICE == "cuda":
+        torch.cuda.empty_cache()
+    gc.collect()
 
 @app.route('/stream_chat', methods=['POST'])
 def stream_chat():
-    if not model:
-        return jsonify({"error": "Model not loaded"}), 500
-
     data = request.get_json()
     prompt = data.get('prompt', '').strip()
 
@@ -107,14 +145,16 @@ def stream_chat():
         mimetype='text/event-stream',
         headers={
             'Cache-Control': 'no-cache',
+            'X-Accel-Buffering': 'no',  # Prevent Nginx buffering
             'Connection': 'keep-alive'
         }
     )
 
 @app.route('/chat', methods=['POST'])
 def chat():
-    if not model:
-        return jsonify({"error": "Model not loaded"}), 500
+    # Ensure model is loaded
+    if not load_model():
+        return jsonify({"error": "Model failed to load"}), 500
 
     data = request.get_json()
     prompt = data.get('prompt', '').strip()
@@ -123,26 +163,48 @@ def chat():
         return jsonify({"error": "Empty prompt"}), 400
 
     try:
-        inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=MAX_NEW_TOKENS,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True,
-            pad_token_id=tokenizer.eos_token_id)
+        inputs = tokenizer(prompt, return_tensors="pt")
+        if DEVICE == "cuda":
+            inputs = inputs.to("cuda")
+
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=MAX_NEW_TOKENS,
+                temperature=0.7,
+                top_p=0.9,
+                do_sample=True,
+                pad_token_id=tokenizer.eos_token_id)
+
+        response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
+
+        # Clean up memory
+        if DEVICE == "cuda":
+            torch.cuda.empty_cache()
+        gc.collect()
 
-        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        response = response.split("</s>")[0].strip()
         return jsonify({"response": response})
 
     except Exception as e:
+        import traceback
+        error_details = f"Error: {str(e)}\n{traceback.format_exc()}"
+        print(error_details)
         return jsonify({"error": str(e)}), 500
 
 @app.route('/health', methods=['GET'])
 def health_check():
+    model_loaded = tokenizer is not None and model is not None
+
+    try:
+        # Check if we need to load the model
+        if not model_loaded and request.args.get('load') == 'true':
+            model_loaded = load_model()
+    except Exception as e:
+        print(f"Health check error: {str(e)}")
+
     status = {
-        "model_loaded": bool(model),
+        "status": "ok" if model_loaded else "waiting",
+        "model_loaded": model_loaded,
         "device": DEVICE,
         "cache_dir": str(cache_dir),
        "max_tokens": MAX_NEW_TOKENS,
@@ -155,6 +217,7 @@ def health_check():
 def home():
     return jsonify({
         "service": "DeepSeek Chat API",
+        "status": "online",
         "endpoints": {
             "POST /chat": "Single-response chat",
             "POST /stream_chat": "Streaming chat with thinking steps",
@@ -168,4 +231,9 @@ def home():
     })
 
 if __name__ == '__main__':
-    app.run(host='0.0.0.0', port=5000)
+    # Load model at startup - only if explicitly requested
+    if os.getenv('PRELOAD_MODEL', 'false').lower() == 'true':
+        load_model()
+
+    port = int(os.environ.get("PORT", 5000))
+    app.run(host='0.0.0.0', port=port)
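Usage note: a minimal client sketch for the /stream_chat endpoint added in this commit. It is illustrative only and assumes the API is reachable at http://localhost:5000 (the default when PORT is unset) and that the third-party requests package is installed; neither assumption comes from the diff itself. It consumes the newline-delimited JSON events ("thinking", "answer", "error", "complete") that stream_generator yields.

# Hypothetical client sketch; assumes `requests` is installed and the server
# from this commit is running at http://localhost:5000.
import json
import requests

def stream_chat(prompt, base_url="http://localhost:5000"):
    """Read one JSON event per line from the /stream_chat endpoint."""
    with requests.post(f"{base_url}/stream_chat",
                       json={"prompt": prompt}, stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            if not line:
                continue  # skip empty keep-alive lines
            event = json.loads(line)
            if event["type"] == "thinking":
                print(f"[thinking] {event['content']}")
            elif event["type"] == "answer":
                print(event["content"], end="", flush=True)
            elif event["type"] == "error":
                print(f"\n[error] {event['content']}")
            elif event["type"] == "complete":
                print("\n[done]")

if __name__ == "__main__":
    stream_chat("Hello, what can you do?")

Reading line by line matches the server side, which emits one JSON object per newline rather than strict "data:"-framed SSE messages, even though the response is served with the text/event-stream mimetype.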