import os
import time
import json
import gc  # For garbage collection
from pathlib import Path
from flask import Flask, request, jsonify, Response
from flask_cors import CORS
import torch

# Create the cache directory if it doesn't exist
cache_dir = Path(os.getenv('TRANSFORMERS_CACHE', '/app/cache'))
cache_dir.mkdir(parents=True, exist_ok=True)

app = Flask(__name__)
CORS(app)  # Allow cross-origin requests

# Model configuration
# Use DeepSeek R1 Distill Qwen 1.5B model (much lighter than 7B)
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
MAX_NEW_TOKENS = 256
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize model variables
tokenizer = None
model = None

def load_model():
    """Load model on first request to save memory at startup"""
    global tokenizer, model
    
    if tokenizer is not None and model is not None:
        return True
    
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
        print(f"Loading model {MODEL_NAME}...")
        print(f"Using device: {DEVICE}")
        print(f"Cache directory: {cache_dir}")
        
        # Use 4-bit quantization for memory efficiency if on CUDA
        if DEVICE == "cuda":
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True
            )
        else:
            # For CPU, we'll use a different optimization approach
            quantization_config = None
        
        # Pass the Hugging Face token (if HF_TOKEN is set) to both the
        # tokenizer and model downloads
        hf_token = os.environ.get("HF_TOKEN")
        token_kwargs = {"token": hf_token} if hf_token else {}
        
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            cache_dir=str(cache_dir),
            trust_remote_code=True,
            **token_kwargs
        )
        
        # Additional memory optimizations for low-resource environments
        if DEVICE == "cpu":
            # Attempt 8-bit (int8) quantization first; bitsandbytes 8-bit
            # loading generally requires a CUDA GPU, so on CPU-only hosts this
            # usually falls back to the standard load in the except branch
            try:
                model = AutoModelForCausalLM.from_pretrained(
                    MODEL_NAME,
                    cache_dir=str(cache_dir),
                    load_in_8bit=True,
                    low_cpu_mem_usage=True,
                    trust_remote_code=True,
                    **token_kwargs
                )
            except Exception as e:
                print(f"8-bit quantization failed, falling back to standard loading: {str(e)}")
                model = AutoModelForCausalLM.from_pretrained(
                    MODEL_NAME,
                    cache_dir=str(cache_dir),
                    low_cpu_mem_usage=True,
                    trust_remote_code=True,
                    **token_kwargs
                )
        else:
            # Load model with 4-bit quantization for CUDA
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                cache_dir=str(cache_dir),
                device_map="auto",
                torch_dtype=torch.float16,
                quantization_config=quantization_config,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
                **token_kwargs
            )
        
        print("βœ… Model loaded successfully!")
        return True
    except Exception as e:
        print(f"❌ Model loading failed: {str(e)}")
        return False

def stream_generator(prompt):
    """Generator function for streaming response with thinking steps"""
    # Ensure model is loaded
    if not load_model():
        yield json.dumps({"type": "error", "content": "Model not loaded"}) + '\n'
        return
    
    # Thinking phases
    thinking_steps = [
        "πŸ” Analyzing your question...",
        "🧠 Processing...",
        "πŸ’‘ Formulating response..."
    ]
    
    # Stream thinking steps (fewer steps, faster timing for lighter model)
    for step in thinking_steps:
        yield json.dumps({"type": "thinking", "content": step}) + '\n'
        time.sleep(0.5)  # Reduced timing for faster response
    
    # Prepare streaming generation
    try:
        # Build the prompt with the tokenizer's own chat template so the
        # model's expected special tokens are used
        messages = [{"role": "user", "content": prompt}]
        formatted_prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
            
        inputs = tokenizer(formatted_prompt, return_tensors="pt")
        if DEVICE == "cuda":
            inputs = inputs.to("cuda")
        
        # Use memory efficient approach
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                return_dict_in_generate=True,
                output_scores=False)
        
        # Get output sequence
        output_ids = generated_ids.sequences[0][len(inputs.input_ids[0]):]
        
        # Stream the output in small multi-token chunks to balance latency
        # and per-chunk overhead
        full_output = ""
        chunk_size = 5  # number of tokens decoded per streamed chunk
        for i in range(0, len(output_ids), chunk_size):
            chunk_ids = output_ids[i:i+chunk_size]
            chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
            full_output += chunk_text
            
            yield json.dumps({
                "type": "answer",
                "content": chunk_text
            }) + '\n'
            
            # Smaller delay for faster streaming
            time.sleep(0.03)
            
    except Exception as e:
        import traceback
        error_details = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_details)
        yield json.dumps({
            "type": "error",
            "content": f"Generation error: {str(e)}"
        }) + '\n'
    
    # Signal completion
    yield json.dumps({"type": "complete"}) + '\n'
    
    # Clean up memory aggressively
    if DEVICE == "cuda":
        torch.cuda.empty_cache()
    gc.collect()

@app.route('/stream_chat', methods=['POST'])
def stream_chat():
    data = request.get_json(silent=True) or {}
    prompt = data.get('prompt', '').strip()
    
    if not prompt:
        return jsonify({"error": "Empty prompt"}), 400
    
    return Response(
        stream_generator(prompt),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'X-Accel-Buffering': 'no',  # Prevent Nginx buffering
            'Connection': 'keep-alive'
        }
    )

@app.route('/chat', methods=['POST'])
def chat():
    # Ensure model is loaded
    if not load_model():
        return jsonify({"error": "Model failed to load"}), 500
    
    data = request.get_json(silent=True) or {}
    prompt = data.get('prompt', '').strip()
    
    if not prompt:
        return jsonify({"error": "Empty prompt"}), 400
    
    try:
        # Build the prompt with the tokenizer's own chat template so the
        # model's expected special tokens are used
        messages = [{"role": "user", "content": prompt}]
        formatted_prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
            
        inputs = tokenizer(formatted_prompt, return_tensors="pt")
        if DEVICE == "cuda":
            inputs = inputs.to("cuda")
            
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id)
        
        response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
        
        # Clean up memory
        if DEVICE == "cuda":
            torch.cuda.empty_cache()
        gc.collect()
        
        return jsonify({"response": response})
    
    except Exception as e:
        import traceback
        error_details = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_details)
        return jsonify({"error": str(e)}), 500

@app.route('/health', methods=['GET'])
def health_check():
    model_loaded = tokenizer is not None and model is not None
    memory_info = "N/A"
    
    # Get memory usage stats
    if torch.cuda.is_available():
        memory_info = f"{torch.cuda.memory_allocated()/1024**2:.2f}MB / {torch.cuda.get_device_properties(0).total_memory/1024**2:.2f}MB"
    else:
        try:
            import psutil
            memory_info = f"{psutil.virtual_memory().used/1024**3:.2f}GB / {psutil.virtual_memory().total/1024**3:.2f}GB"
        except ImportError:
            memory_info = "N/A (psutil not installed)"
    
    try:
        # Check if we need to load the model
        if not model_loaded and request.args.get('load') == 'true':
            model_loaded = load_model()
    except Exception as e:
        print(f"Health check error: {str(e)}")
    
    status = {
        "status": "ok" if model_loaded else "waiting",
        "model": MODEL_NAME,
        "model_loaded": model_loaded,
        "device": DEVICE,
        "cache_dir": str(cache_dir),
        "max_tokens": MAX_NEW_TOKENS,
        "memory_usage": memory_info
    }
    return jsonify(status)

@app.route('/unload', methods=['POST'])
def unload_model():
    """Endpoint to manually unload model and free memory"""
    global model, tokenizer
    
    if model is not None:
        del model
        model = None
        
    if tokenizer is not None:
        del tokenizer
        tokenizer = None
        
    # Force garbage collection
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
    
    return jsonify({"status": "Model unloaded", "memory_freed": True})

@app.route('/')
def home():
    return jsonify({
        "service": "DeepSeek-1.5B Chat API",
        "status": "online",
        "endpoints": {
            "POST /chat": "Single-response chat",
            "POST /stream_chat": "Streaming chat with thinking steps",
            "GET /health": "Service health check",
            "POST /unload": "Unload model to free memory"
        },
        "config": {
            "model": MODEL_NAME,
            "max_tokens": MAX_NEW_TOKENS,
            "device": DEVICE,
            "cache_location": str(cache_dir)
        }
    })
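
# Example requests, assuming the default port 5000 and a JSON body with a
# "prompt" field (adjust the host/port to your deployment):
#   curl -X POST http://localhost:5000/chat \
#        -H 'Content-Type: application/json' -d '{"prompt": "Hello"}'
#   curl -N -X POST http://localhost:5000/stream_chat \
#        -H 'Content-Type: application/json' -d '{"prompt": "Hello"}'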

if __name__ == '__main__':
    # Load model at startup only if explicitly requested
    if os.getenv('PRELOAD_MODEL', 'false').lower() == 'true':
        load_model()
    
    port = int(os.environ.get("PORT", 5000))
    app.run(host='0.0.0.0', port=port)