mike23415 committed (verified)
Commit 7ae54ea · 1 Parent(s): 580eaed

Update app.py

Files changed (1)
  1. app.py +87 -21
app.py CHANGED
@@ -1,9 +1,11 @@
 import os
+import time
+import json
 import numpy as np
 from pathlib import Path
-from flask import Flask, request, jsonify
+from flask import Flask, request, jsonify, Response
 from flask_cors import CORS
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
 import torch
 
 # Verify numpy version
@@ -23,37 +25,90 @@ DEVICE = "cpu"
 
 # Initialize model
 try:
-    tokenizer = AutoTokenizer.from_pretrained(  # Fixed this line
+    tokenizer = AutoTokenizer.from_pretrained(
         MODEL_NAME,
         cache_dir=str(cache_dir)
-    )  # Added closing parenthesis
 
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_NAME,
         cache_dir=str(cache_dir),
         device_map="auto",
         torch_dtype=torch.float32,
-        low_cpu_mem_usage=True
-    )
+        low_cpu_mem_usage=True)
     print("Model loaded successfully!")
 except Exception as e:
     print(f"Model loading failed: {str(e)}")
     model = None
 
-def generate_response(prompt):
+def stream_generator(prompt):
+    """Generator function for streaming response with thinking steps"""
+    # Thinking phases
+    thinking_steps = [
+        "🔍 Analyzing your question...",
+        "🧠 Accessing knowledge base...",
+        "💡 Formulating response...",
+        "📚 Verifying information..."
+    ]
+
+    # Stream thinking steps
+    for step in thinking_steps:
+        yield json.dumps({"type": "thinking", "content": step}) + '\n'
+        time.sleep(1.5)  # Simulate processing time
+
+    # Prepare streaming generation
+    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
+    streamer = TextStreamer(tokenizer, skip_prompt=True)
+
+    # Generate response chunks
     try:
-        inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
-        outputs = model.generate(
+        generated_ids = model.generate(
             **inputs,
             max_new_tokens=MAX_NEW_TOKENS,
+            streamer=streamer,
             temperature=0.7,
             top_p=0.9,
             do_sample=True,
-            pad_token_id=tokenizer.eos_token_id
-        )
-        return tokenizer.decode(outputs[0], skip_special_tokens=True)
+            pad_token_id=tokenizer.eos_token_id)
+
+        # Stream generated text
+        full_response = ""
+        for token_ids in generated_ids:
+            chunk = tokenizer.decode(token_ids, skip_special_tokens=True)
+            new_content = chunk[len(full_response):]
+            if new_content.strip():
+                full_response = chunk
+                yield json.dumps({
+                    "type": "answer",
+                    "content": new_content
+                }) + '\n'
+
     except Exception as e:
-        return f"Error generating response: {str(e)}"
+        yield json.dumps({
+            "type": "error",
+            "content": f"Generation error: {str(e)}"
+        }) + '\n'
+
+    yield json.dumps({"type": "complete"}) + '\n'
+
+@app.route('/stream_chat', methods=['POST'])
+def stream_chat():
+    if not model:
+        return jsonify({"error": "Model not loaded"}), 500
+
+    data = request.get_json()
+    prompt = data.get('prompt', '').strip()
+
+    if not prompt:
+        return jsonify({"error": "Empty prompt"}), 400
+
+    return Response(
+        stream_generator(prompt),
+        mimetype='text/event-stream',
+        headers={
+            'Cache-Control': 'no-cache',
+            'Connection': 'keep-alive'
+        }
+    )
 
 @app.route('/chat', methods=['POST'])
 def chat():
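(Not part of the commit: a minimal client sketch for the new /stream_chat route added in the hunk above. It assumes the app is reachable at http://localhost:7860 (adjust to wherever app.py is served) and that the requests package is installed; each line of the streamed body is one JSON object with a "type" of "thinking", "answer", "error" or "complete", matching stream_generator.)

# Hypothetical /stream_chat client; base URL is an assumption, not part of the commit.
import json
import requests

API_URL = "http://localhost:7860"  # assumption: adjust to your deployment

def ask(prompt):
    answer_parts = []
    with requests.post(f"{API_URL}/stream_chat",
                       json={"prompt": prompt}, stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            if not line:
                continue  # skip keep-alive blank lines
            event = json.loads(line)
            if event["type"] == "thinking":
                print(event["content"])            # progress messages
            elif event["type"] == "answer":
                answer_parts.append(event["content"])
            elif event["type"] == "error":
                raise RuntimeError(event["content"])
            elif event["type"] == "complete":
                break
    return "".join(answer_parts)

print(ask("Explain what a REST API is in one sentence."))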
@@ -61,18 +116,25 @@ def chat():
         return jsonify({"error": "Model not loaded"}), 500
 
     data = request.get_json()
-    if not data or 'prompt' not in data:
-        return jsonify({"error": "No prompt provided"}), 400
+    prompt = data.get('prompt', '').strip()
 
-    prompt = data['prompt'].strip()
     if not prompt:
         return jsonify({"error": "Empty prompt"}), 400
 
     try:
-        response = generate_response(prompt)
-        # Clean up extra text after the final answer
+        inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=MAX_NEW_TOKENS,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id)
+
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         response = response.split("</s>")[0].strip()
         return jsonify({"response": response})
+
     except Exception as e:
         return jsonify({"error": str(e)}), 500
 
@@ -82,7 +144,9 @@ def health_check():
         "model_loaded": bool(model),
         "device": DEVICE,
         "cache_dir": str(cache_dir),
-        "memory_usage": f"{torch.cuda.memory_allocated()/1024**2:.2f}MB" if torch.cuda.is_available() else "CPU"
+        "max_tokens": MAX_NEW_TOKENS,
+        "memory_usage": f"{torch.cuda.memory_allocated()/1024**2:.2f}MB"
+            if torch.cuda.is_available() else "CPU"
     }
     return jsonify(status)
 
@@ -91,12 +155,14 @@ def home():
     return jsonify({
         "service": "DeepSeek Chat API",
         "endpoints": {
-            "POST /chat": "Process chat prompts",
+            "POST /chat": "Single-response chat",
+            "POST /stream_chat": "Streaming chat with thinking steps",
            "GET /health": "Service health check"
         },
         "config": {
+            "model": MODEL_NAME,
             "max_tokens": MAX_NEW_TOKENS,
-            "model": MODEL_NAME
+            "cache_location": str(cache_dir)
         }
     })
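(Also not part of the commit: a short smoke-test sketch for the non-streaming routes, under the same base-URL assumption. Per the diff, POST /chat expects a JSON body with a "prompt" field and returns {"response": ...}; GET /health now reports model_loaded, device, cache_dir, max_tokens and memory_usage; GET / lists the endpoints and config.)

# Hypothetical smoke test for the non-streaming routes (base URL is an
# assumption; request and response shapes follow the diff above).
import requests

API_URL = "http://localhost:7860"  # adjust to your deployment

# POST /chat: single JSON response
r = requests.post(f"{API_URL}/chat", json={"prompt": "Say hello in one sentence."})
print(r.status_code, r.json().get("response"))

# GET /health: model_loaded, device, cache_dir, max_tokens, memory_usage
print(requests.get(f"{API_URL}/health").json())

# GET /: service description with endpoint list and config
print(requests.get(f"{API_URL}/").json())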