import os
import time
import json
import gc  # For garbage collection
from pathlib import Path
from flask import Flask, request, jsonify, Response
from flask_cors import CORS
import torch

# Create the cache directory if it doesn't exist
cache_dir = Path(os.getenv('TRANSFORMERS_CACHE', '/app/cache'))
cache_dir.mkdir(parents=True, exist_ok=True)

app = Flask(__name__)
CORS(app)  # Allow cross-origin requests

# Model configuration
# Use DeepSeek R1 Distill Qwen 1.5B model (much lighter than 7B)
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
MAX_NEW_TOKENS = 256
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize model variables
tokenizer = None
model = None

def load_model():
    """Load model on first request to save memory at startup"""
    global tokenizer, model
    
    if tokenizer is not None and model is not None:
        return True
    
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
        print(f"Loading model {MODEL_NAME}...")
        print(f"Using device: {DEVICE}")
        print(f"Cache directory: {cache_dir}")
        
        # Use 4-bit quantization for memory efficiency if on CUDA
        if DEVICE == "cuda":
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True
            )
        else:
            # For CPU, we'll use a different optimization approach
            quantization_config = None
        
        # Pass the Hugging Face token (if HF_TOKEN is set) to both the
        # tokenizer and model downloads
        hf_token = os.environ.get("HF_TOKEN")
        token_kwargs = {"token": hf_token} if hf_token else {}
        
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            cache_dir=str(cache_dir),
            trust_remote_code=True,
            **token_kwargs
        )
        
        # Additional memory optimizations for low-resource environments
        if DEVICE == "cpu":
            # Attempt 8-bit (int8) quantization first; bitsandbytes 8-bit
            # loading generally requires a CUDA GPU, so on CPU-only hosts this
            # usually falls back to the standard load in the except branch
            try:
                model = AutoModelForCausalLM.from_pretrained(
                    MODEL_NAME,
                    cache_dir=str(cache_dir),
                    load_in_8bit=True,
                    low_cpu_mem_usage=True,
                    trust_remote_code=True,
                    **token_kwargs
                )
            except Exception as e:
                print(f"8-bit quantization failed, falling back to standard loading: {str(e)}")
                model = AutoModelForCausalLM.from_pretrained(
                    MODEL_NAME,
                    cache_dir=str(cache_dir),
                    low_cpu_mem_usage=True,
                    trust_remote_code=True,
                    **token_kwargs
                )
        else:
            # Load model with 4-bit quantization for CUDA
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                cache_dir=str(cache_dir),
                device_map="auto",
                torch_dtype=torch.float16,
                quantization_config=quantization_config,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
                **token_kwargs
            )
        
        print("βœ… Model loaded successfully!")
        return True
    except Exception as e:
        print(f"❌ Model loading failed: {str(e)}")
        return False

def stream_generator(prompt):
    """Generator function for streaming response with thinking steps"""
    # Ensure model is loaded
    if not load_model():
        yield json.dumps({"type": "error", "content": "Model not loaded"}) + '\n'
        return
    
    # Thinking phases
    thinking_steps = [
        "πŸ” Analyzing your question...",
        "🧠 Processing...",
        "πŸ’‘ Formulating response..."
    ]
    
    # Stream thinking steps (fewer steps, faster timing for lighter model)
    for step in thinking_steps:
        yield json.dumps({"type": "thinking", "content": step}) + '\n'
        time.sleep(0.5)  # Reduced timing for faster response
    
    # Prepare streaming generation
    try:
        # Build the prompt with the tokenizer's own chat template so the
        # model's expected special tokens are used
        messages = [{"role": "user", "content": prompt}]
        formatted_prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
            
        inputs = tokenizer(formatted_prompt, return_tensors="pt")
        if DEVICE == "cuda":
            inputs = inputs.to("cuda")
        
        # Use memory efficient approach
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                return_dict_in_generate=True,
                output_scores=False)
        
        # Get output sequence
        output_ids = generated_ids.sequences[0][len(inputs.input_ids[0]):]
        
        # Stream the output in small multi-token chunks to balance latency
        # and per-chunk overhead
        full_output = ""
        chunk_size = 5  # number of tokens decoded per streamed chunk
        for i in range(0, len(output_ids), chunk_size):
            chunk_ids = output_ids[i:i+chunk_size]
            chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
            full_output += chunk_text
            
            yield json.dumps({
                "type": "answer",
                "content": chunk_text
            }) + '\n'
            
            # Smaller delay for faster streaming
            time.sleep(0.03)
            
    except Exception as e:
        import traceback
        error_details = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_details)
        yield json.dumps({
            "type": "error",
            "content": f"Generation error: {str(e)}"
        }) + '\n'
    
    # Signal completion
    yield json.dumps({"type": "complete"}) + '\n'
    
    # Clean up memory aggressively
    if DEVICE == "cuda":
        torch.cuda.empty_cache()
    gc.collect()

@app.route('/stream_chat', methods=['POST'])
def stream_chat():
    data = request.get_json(silent=True) or {}
    prompt = data.get('prompt', '').strip()
    
    if not prompt:
        return jsonify({"error": "Empty prompt"}), 400
    
    return Response(
        stream_generator(prompt),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'X-Accel-Buffering': 'no',  # Prevent Nginx buffering
            'Connection': 'keep-alive'
        }
    )

@app.route('/chat', methods=['POST'])
def chat():
    # Ensure model is loaded
    if not load_model():
        return jsonify({"error": "Model failed to load"}), 500
    
    data = request.get_json(silent=True) or {}
    prompt = data.get('prompt', '').strip()
    
    if not prompt:
        return jsonify({"error": "Empty prompt"}), 400
    
    try:
        # Build the prompt with the tokenizer's own chat template so the
        # model's expected special tokens are used
        messages = [{"role": "user", "content": prompt}]
        formatted_prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
            
        inputs = tokenizer(formatted_prompt, return_tensors="pt")
        if DEVICE == "cuda":
            inputs = inputs.to("cuda")
            
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id)
        
        response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
        
        # Clean up memory
        if DEVICE == "cuda":
            torch.cuda.empty_cache()
        gc.collect()
        
        return jsonify({"response": response})
    
    except Exception as e:
        import traceback
        error_details = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_details)
        return jsonify({"error": str(e)}), 500

@app.route('/health', methods=['GET'])
def health_check():
    model_loaded = tokenizer is not None and model is not None
    memory_info = "N/A"
    
    # Get memory usage stats
    if torch.cuda.is_available():
        memory_info = f"{torch.cuda.memory_allocated()/1024**2:.2f}MB / {torch.cuda.get_device_properties(0).total_memory/1024**2:.2f}MB"
    else:
        try:
            import psutil
            memory_info = f"{psutil.virtual_memory().used/1024**3:.2f}GB / {psutil.virtual_memory().total/1024**3:.2f}GB"
        except ImportError:
            memory_info = "N/A (psutil not installed)"
    
    try:
        # Check if we need to load the model
        if not model_loaded and request.args.get('load') == 'true':
            model_loaded = load_model()
    except Exception as e:
        print(f"Health check error: {str(e)}")
    
    status = {
        "status": "ok" if model_loaded else "waiting",
        "model": MODEL_NAME,
        "model_loaded": model_loaded,
        "device": DEVICE,
        "cache_dir": str(cache_dir),
        "max_tokens": MAX_NEW_TOKENS,
        "memory_usage": memory_info
    }
    return jsonify(status)

@app.route('/unload', methods=['POST'])
def unload_model():
    """Endpoint to manually unload model and free memory"""
    global model, tokenizer
    
    if model is not None:
        del model
        model = None
        
    if tokenizer is not None:
        del tokenizer
        tokenizer = None
        
    # Force garbage collection
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
    
    return jsonify({"status": "Model unloaded", "memory_freed": True})

@app.route('/')
def home():
    return jsonify({
        "service": "DeepSeek-1.5B Chat API",
        "status": "online",
        "endpoints": {
            "POST /chat": "Single-response chat",
            "POST /stream_chat": "Streaming chat with thinking steps",
            "GET /health": "Service health check",
            "POST /unload": "Unload model to free memory"
        },
        "config": {
            "model": MODEL_NAME,
            "max_tokens": MAX_NEW_TOKENS,
            "device": DEVICE,
            "cache_location": str(cache_dir)
        }
    })
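
# Example requests, assuming the default port 5000 and a JSON body with a
# "prompt" field (adjust the host/port to your deployment):
#   curl -X POST http://localhost:5000/chat \
#        -H 'Content-Type: application/json' -d '{"prompt": "Hello"}'
#   curl -N -X POST http://localhost:5000/stream_chat \
#        -H 'Content-Type: application/json' -d '{"prompt": "Hello"}'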

if __name__ == '__main__':
    # Load model at startup only if explicitly requested
    if os.getenv('PRELOAD_MODEL', 'false').lower() == 'true':
        load_model()
    
    port = int(os.environ.get("PORT", 5000))
    app.run(host='0.0.0.0', port=port)