import json
import os
import time
from pathlib import Path
from threading import Thread

import numpy as np
import torch
from flask import Flask, Response, jsonify, request
from flask_cors import CORS
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Verify numpy version
assert np.__version__.startswith('1.'), (
    f"Invalid numpy version {np.__version__} - must be 1.x series"
)

# Create cache directory if it does not exist
cache_dir = Path(os.getenv('TRANSFORMERS_CACHE', '/app/cache'))
cache_dir.mkdir(parents=True, exist_ok=True)

app = Flask(__name__)
CORS(app)

# Model configuration
MODEL_NAME = "deepseek-ai/deepseek-r1-6b-chat"
MAX_NEW_TOKENS = 256
DEVICE = "cpu"

# Initialize tokenizer and model
try:
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
        cache_dir=str(cache_dir)
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        cache_dir=str(cache_dir),
        device_map="auto",
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True
    )
    print("Model loaded successfully!")
except Exception as e:
    print(f"Model loading failed: {str(e)}")
    model = None


def stream_generator(prompt):
    """Generator that streams thinking steps followed by the model's answer."""
    # Cosmetic "thinking" phases shown to the client before generation starts
    thinking_steps = [
        "🔍 Analyzing your question...",
        "🧠 Accessing knowledge base...",
        "💡 Formulating response...",
        "📚 Verifying information..."
    ]

    # Stream thinking steps as newline-delimited JSON events
    for step in thinking_steps:
        yield json.dumps({"type": "thinking", "content": step}) + '\n'
        time.sleep(1.5)  # Simulate processing time

    # Prepare streaming generation: TextIteratorStreamer yields decoded text
    # chunks as they are produced, so the answer can be forwarded incrementally
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
        timeout=120.0
    )
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        streamer=streamer,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Run generation in a background thread and stream chunks as they arrive
    try:
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()
        for new_text in streamer:
            if new_text.strip():
                yield json.dumps({
                    "type": "answer",
                    "content": new_text
                }) + '\n'
        thread.join()
    except Exception as e:
        yield json.dumps({
            "type": "error",
            "content": f"Generation error: {str(e)}"
        }) + '\n'

    yield json.dumps({"type": "complete"}) + '\n'


@app.route('/stream_chat', methods=['POST'])
def stream_chat():
    if not model:
        return jsonify({"error": "Model not loaded"}), 500

    data = request.get_json()
    prompt = data.get('prompt', '').strip()
    if not prompt:
        return jsonify({"error": "Empty prompt"}), 400

    return Response(
        stream_generator(prompt),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive'
        }
    )


@app.route('/chat', methods=['POST'])
def chat():
    if not model:
        return jsonify({"error": "Model not loaded"}), 500

    data = request.get_json()
    prompt = data.get('prompt', '').strip()
    if not prompt:
        return jsonify({"error": "Empty prompt"}), 400

    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
        # Decode only the newly generated tokens (skip the echoed prompt)
        new_tokens = outputs[0][inputs['input_ids'].shape[-1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True)
        # R1-style models may emit a <think>...</think> reasoning block;
        # keep only the final answer that follows it
        if "</think>" in response:
            response = response.split("</think>")[-1]
        return jsonify({"response": response.strip()})
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route('/health', methods=['GET'])
def health_check():
    status = {
        "model_loaded": bool(model),
        "device": DEVICE,
        "cache_dir": str(cache_dir),
        "max_tokens": MAX_NEW_TOKENS,
        "memory_usage": (
            f"{torch.cuda.memory_allocated() / 1024**2:.2f}MB"
            if torch.cuda.is_available() else "CPU"
        )
    }
    return jsonify(status)


@app.route('/')
def home():
    return jsonify({
        "service": "DeepSeek Chat API",
        "endpoints": {
            "POST /chat": "Single-response chat",
            "POST /stream_chat": "Streaming chat with thinking steps",
            "GET /health": "Service health check"
        },
        "config": {
            "model": MODEL_NAME,
            "max_tokens": MAX_NEW_TOKENS,
            "cache_location": str(cache_dir)
        }
    })


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
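
# Example requests (a minimal sketch, assuming the server is running locally on
# port 5000 as configured above; the prompts below are placeholders):
#
#   curl -X POST http://localhost:5000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "What is the capital of France?"}'
#
#   curl -N -X POST http://localhost:5000/stream_chat \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain quicksort"}'
#
# /stream_chat emits newline-delimited JSON events of the form
# {"type": "thinking" | "answer" | "error" | "complete", "content": "..."};
# curl's -N flag disables output buffering so events appear as they arrive.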