mike23415 committed
Commit 7625bb8 · verified · 1 Parent(s): 45ef073

Update app.py

Files changed (1)
  app.py: +42 -133
app.py CHANGED
@@ -7,130 +7,87 @@ from flask import Flask, request, jsonify, Response
 from flask_cors import CORS
 import torch
 
-# Cache and model settings
+# Caching
 cache_dir = Path(os.getenv('TRANSFORMERS_CACHE', '/app/cache'))
 cache_dir.mkdir(parents=True, exist_ok=True)
 
 app = Flask(__name__)
 CORS(app)
 
-MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
 MAX_NEW_TOKENS = 256
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
 
 tokenizer = None
 model = None
 
 def load_model():
     global tokenizer, model
-
-    if tokenizer is not None and model is not None:
+    if tokenizer and model:
         return True
 
     try:
-        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-
-        print(f"Loading model: {MODEL_NAME}")
-        print(f"Device: {DEVICE}")
+        from transformers import AutoTokenizer, AutoModelForCausalLM
 
-        # HF auth token if needed
+        print(f"Loading {MODEL_NAME} on {DEVICE}...")
         hf_token = os.environ.get("HF_TOKEN")
         token_kwargs = {"token": hf_token} if hf_token else {}
 
-        tokenizer = AutoTokenizer.from_pretrained(
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=str(cache_dir), trust_remote_code=False, **token_kwargs)
+
+        model = AutoModelForCausalLM.from_pretrained(
             MODEL_NAME,
             cache_dir=str(cache_dir),
-            trust_remote_code=True,
+            torch_dtype=torch.bfloat16 if DEVICE == "cpu" else torch.float16,
+            low_cpu_mem_usage=True,
+            trust_remote_code=False,
             **token_kwargs
         )
 
         if DEVICE == "cuda":
-            quant_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_compute_dtype=torch.float16,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_use_double_quant=True
-            )
-            model = AutoModelForCausalLM.from_pretrained(
-                MODEL_NAME,
-                cache_dir=str(cache_dir),
-                trust_remote_code=True,
-                device_map="auto",
-                quantization_config=quant_config,
-                torch_dtype=torch.float16,
-                low_cpu_mem_usage=True,
-                **token_kwargs
-            )
-        else:
-            # CPU: no quantization_config; use float16 if possible
-            model = AutoModelForCausalLM.from_pretrained(
-                MODEL_NAME,
-                cache_dir=str(cache_dir),
-                trust_remote_code=True,
-                torch_dtype=torch.float16,
-                low_cpu_mem_usage=True,
-                **token_kwargs
-            )
+            model = model.to("cuda")
 
-        print("✅ Model loaded successfully")
+        print("✅ Phi-3 Mini loaded successfully!")
         return True
-
     except Exception as e:
-        print(f"❌ Failed to load model: {e}")
+        print(f"❌ Model load failed: {e}")
        return False
 
 def stream_generator(prompt):
     if not load_model():
-        yield json.dumps({"type": "error", "content": "Model not loaded"}) + '\n'
+        yield json.dumps({"type": "error", "content": "Model failed to load"}) + '\n'
         return
 
-    thinking_steps = [
-        "🔍 Analyzing your question...",
-        "🧠 Processing...",
-        "💡 Formulating response..."
-    ]
-
-    for step in thinking_steps:
+    thinking = ["🧠 Thinking...", "🤖 Preparing answer..."]
+    for step in thinking:
         yield json.dumps({"type": "thinking", "content": step}) + '\n'
-        time.sleep(0.5)
+        time.sleep(0.4)
 
     try:
-        formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-        inputs = tokenizer(formatted_prompt, return_tensors="pt")
-        if DEVICE == "cuda":
-            inputs = inputs.to("cuda")
+        formatted_prompt = f"<|user|>\n{prompt}\n<|assistant|>\n"
+        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE if DEVICE == "cuda" else "cpu")
 
         with torch.no_grad():
-            generated_ids = model.generate(
+            output = model.generate(
                 **inputs,
                 max_new_tokens=MAX_NEW_TOKENS,
                 temperature=0.7,
                 top_p=0.9,
                 do_sample=True,
-                pad_token_id=tokenizer.eos_token_id,
-                return_dict_in_generate=True,
-                output_scores=False
+                pad_token_id=tokenizer.eos_token_id
             )
 
-        output_ids = generated_ids.sequences[0][len(inputs.input_ids[0]):]
-        full_output = ""
-        chunk_size = 5
+        new_tokens = output[0][inputs.input_ids.shape[-1]:]
+        generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
 
-        for i in range(0, len(output_ids), chunk_size):
-            chunk_ids = output_ids[i:i + chunk_size]
-            chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
-            full_output += chunk_text
-            yield json.dumps({"type": "answer", "content": chunk_text}) + '\n'
+        for i in range(0, len(generated_text), 12):
+            yield json.dumps({"type": "answer", "content": generated_text[i:i+12]}) + '\n'
             time.sleep(0.03)
 
     except Exception as e:
-        import traceback
-        error_details = f"Error: {str(e)}\n{traceback.format_exc()}"
-        print(error_details)
         yield json.dumps({"type": "error", "content": str(e)}) + '\n'
 
     yield json.dumps({"type": "complete"}) + '\n'
-
     if DEVICE == "cuda":
         torch.cuda.empty_cache()
         gc.collect()
@@ -139,7 +96,6 @@ def stream_generator(prompt):
 def stream_chat():
     data = request.get_json()
     prompt = data.get('prompt', '').strip()
-
     if not prompt:
         return jsonify({"error": "Empty prompt"}), 400
 
@@ -160,18 +116,15 @@ def chat():
 
     data = request.get_json()
     prompt = data.get('prompt', '').strip()
-
     if not prompt:
         return jsonify({"error": "Empty prompt"}), 400
 
     try:
-        formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-        inputs = tokenizer(formatted_prompt, return_tensors="pt")
-        if DEVICE == "cuda":
-            inputs = inputs.to("cuda")
+        formatted_prompt = f"<|user|>\n{prompt}\n<|assistant|>\n"
+        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE if DEVICE == "cuda" else "cpu")
 
         with torch.no_grad():
-            outputs = model.generate(
+            output = model.generate(
                 **inputs,
                 max_new_tokens=MAX_NEW_TOKENS,
                 temperature=0.7,
@@ -180,78 +133,34 @@ def chat():
                 pad_token_id=tokenizer.eos_token_id
             )
 
-        response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
-
-        if DEVICE == "cuda":
-            torch.cuda.empty_cache()
-            gc.collect()
-
-        return jsonify({"response": response})
-
+        response_text = tokenizer.decode(output[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
+        return jsonify({"response": response_text})
     except Exception as e:
-        import traceback
-        print(f"Error: {e}\n{traceback.format_exc()}")
         return jsonify({"error": str(e)}), 500
 
-@app.route('/health', methods=['GET'])
-def health_check():
-    model_loaded = tokenizer is not None and model is not None
-    memory_info = "N/A"
-
-    if torch.cuda.is_available():
-        memory_info = f"{torch.cuda.memory_allocated()/1024**2:.2f}MB / {torch.cuda.get_device_properties(0).total_memory/1024**2:.2f}MB"
-    else:
-        import psutil
-        memory_info = f"{psutil.virtual_memory().used/1024**3:.2f}GB / {psutil.virtual_memory().total/1024**3:.2f}GB"
-
-    if not model_loaded and request.args.get('load') == 'true':
-        model_loaded = load_model()
-
+@app.route('/health')
+def health():
+    import psutil
+    model_loaded = model is not None
     return jsonify({
         "status": "ok" if model_loaded else "waiting",
-        "model": MODEL_NAME,
         "model_loaded": model_loaded,
+        "memory": f"{psutil.virtual_memory().used/1024**3:.2f}GB used",
         "device": DEVICE,
-        "cache_dir": str(cache_dir),
-        "max_tokens": MAX_NEW_TOKENS,
-        "memory_usage": memory_info
     })
 
-@app.route('/unload', methods=['POST'])
-def unload_model():
-    global model, tokenizer
-    if model is not None:
-        del model
-        model = None
-    if tokenizer is not None:
-        del tokenizer
-        tokenizer = None
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        gc.collect()
-    return jsonify({"status": "Model unloaded", "memory_freed": True})
-
 @app.route('/')
 def home():
     return jsonify({
-        "service": "DeepSeek-1.5B Chat API",
+        "service": "Phi-3 Mini Chat API",
         "status": "online",
         "endpoints": {
-            "POST /chat": "Single-response chat",
-            "POST /stream_chat": "Streaming chat",
-            "GET /health": "Service health check",
-            "POST /unload": "Unload model"
-        },
-        "config": {
-            "model": MODEL_NAME,
-            "max_tokens": MAX_NEW_TOKENS,
-            "device": DEVICE,
-            "cache_location": str(cache_dir)
+            "POST /chat": "Single-response",
+            "POST /stream_chat": "Streaming chat"
         }
     })
 
 if __name__ == '__main__':
-    if os.getenv('PRELOAD_MODEL', 'false').lower() == 'true':
+    if os.getenv('PRELOAD_MODEL', 'false') == 'true':
        load_model()
-    port = int(os.environ.get("PORT", 5000))
-    app.run(host='0.0.0.0', port=port)
+    app.run(host='0.0.0.0', port=int(os.environ.get("PORT", 5000)))
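
For reference, and not part of the commit itself: a minimal client sketch for the newline-delimited JSON protocol that stream_generator emits over POST /stream_chat. It assumes the service is reachable at http://localhost:5000 (the PORT default in app.py), that the route returns the generator as a streaming Flask response, and that the requests library is installed; the stream_chat_client helper name is illustrative only.

# Illustrative client (not from the commit): consumes the NDJSON stream from /stream_chat.
# Assumes the Flask service above is running locally on the default port 5000.
import json
import requests

def stream_chat_client(prompt, base_url="http://localhost:5000"):
    payload = {"prompt": prompt}
    with requests.post(f"{base_url}/stream_chat", json=payload, stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue  # skip empty keep-alive lines
            event = json.loads(line)
            if event["type"] == "thinking":
                print(f"[thinking] {event['content']}")
            elif event["type"] == "answer":
                print(event["content"], end="", flush=True)  # stream partial answer chunks
            elif event["type"] == "error":
                print(f"\n[error] {event['content']}")
                break
            elif event["type"] == "complete":
                print()  # generation finished
                break

if __name__ == "__main__":
    stream_chat_client("What does this API do?")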