Spaces:

mike23415
/

Thinking

Sleeping

App Files Community

mike23415 committed about 1 month ago

Commit

98ee9d3

verified ·

1 Parent(s): 4a9bfbe

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -46

app.py CHANGED Viewed

@@ -1,12 +1,11 @@
 import os
 import time
 import json
-import numpy as np
 from pathlib import Path
 from flask import Flask, request, jsonify, Response
 from flask_cors import CORS
 import torch
-import gc  # For garbage collection
 # Create cache directory if not exists
 cache_dir = Path(os.getenv('TRANSFORMERS_CACHE', '/app/cache'))
@@ -16,8 +15,8 @@ app = Flask(__name__)
 CORS(app)  # Allow cross-origin requests
 # Model configuration
-# Use DeepSeek R1 Distill Qwen 7B model
-MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
 MAX_NEW_TOKENS = 256
 DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
@@ -47,6 +46,7 @@ def load_model():
                 bnb_4bit_use_double_quant=True
             )
         else:
             quantization_config = None
         # Load tokenizer
@@ -60,17 +60,40 @@ def load_model():
         hf_token = os.environ.get("HF_TOKEN")
         token_kwargs = {"token": hf_token} if hf_token else {}
-        # Load model with appropriate settings for the device
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_NAME,
-            cache_dir=str(cache_dir),
-            device_map="auto" if DEVICE == "cuda" else None,
-            torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
-            quantization_config=quantization_config,
-            low_cpu_mem_usage=True,
-            trust_remote_code=True,
-            **token_kwargs
-        )
         print("✅ Model loaded successfully!")
         return True
@@ -88,32 +111,25 @@ def stream_generator(prompt):
     # Thinking phases
     thinking_steps = [
         "🔍 Analyzing your question...",
-        "🧠 Accessing knowledge base...",
-        "💡 Formulating response...",
-        "📚 Verifying information..."
     ]
-    # Stream thinking steps
     for step in thinking_steps:
         yield json.dumps({"type": "thinking", "content": step}) + '\n'
-        time.sleep(0.8)  # Reduced timing for faster response
     # Prepare streaming generation
     try:
-        # Format prompt for the model
-        if "mistral" in MODEL_NAME.lower():
-            formatted_prompt = f"<s>[INST] {prompt} [/INST]"
-        elif "deepseek" in MODEL_NAME.lower():
-            formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-        else:
-            formatted_prompt = prompt
         inputs = tokenizer(formatted_prompt, return_tensors="pt")
         if DEVICE == "cuda":
             inputs = inputs.to("cuda")
-        # Use custom streaming implementation
-        # Start generation
         with torch.no_grad():
             generated_ids = model.generate(
                 **inputs,
@@ -128,9 +144,9 @@ def stream_generator(prompt):
         # Get output sequence
         output_ids = generated_ids.sequences[0][len(inputs.input_ids[0]):]
-        # Stream in chunks for smoother experience
         full_output = ""
-        chunk_size = 3  # Number of tokens per chunk
         for i in range(0, len(output_ids), chunk_size):
             chunk_ids = output_ids[i:i+chunk_size]
             chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
@@ -141,8 +157,8 @@ def stream_generator(prompt):
                 "content": chunk_text
             }) + '\n'
-            # Small delay for smoother streaming
-            time.sleep(0.05)
     except Exception as e:
         import traceback
@@ -156,7 +172,7 @@ def stream_generator(prompt):
     # Signal completion
     yield json.dumps({"type": "complete"}) + '\n'
-    # Clean up memory
     if DEVICE == "cuda":
         torch.cuda.empty_cache()
     gc.collect()
@@ -192,13 +208,8 @@ def chat():
         return jsonify({"error": "Empty prompt"}), 400
     try:
-        # Format prompt for the model
-        if "mistral" in MODEL_NAME.lower():
-            formatted_prompt = f"<s>[INST] {prompt} [/INST]"
-        elif "deepseek" in MODEL_NAME.lower():
-            formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-        else:
-            formatted_prompt = prompt
         inputs = tokenizer(formatted_prompt, return_tensors="pt")
         if DEVICE == "cuda":
@@ -231,6 +242,14 @@ def chat():
 @app.route('/health', methods=['GET'])
 def health_check():
     model_loaded = tokenizer is not None and model is not None
     try:
         # Check if we need to load the model
@@ -241,34 +260,56 @@ def health_check():
     status = {
         "status": "ok" if model_loaded else "waiting",
         "model_loaded": model_loaded,
         "device": DEVICE,
         "cache_dir": str(cache_dir),
         "max_tokens": MAX_NEW_TOKENS,
-        "memory_usage": f"{torch.cuda.memory_allocated()/1024**2:.2f}MB"
-            if torch.cuda.is_available() else "CPU"
     }
     return jsonify(status)
 @app.route('/')
 def home():
     return jsonify({
-        "service": "DeepSeek Chat API",
         "status": "online",
         "endpoints": {
             "POST /chat": "Single-response chat",
             "POST /stream_chat": "Streaming chat with thinking steps",
-            "GET /health": "Service health check"
         },
         "config": {
             "model": MODEL_NAME,
             "max_tokens": MAX_NEW_TOKENS,
             "cache_location": str(cache_dir)
         }
     })
 if __name__ == '__main__':
-    # Load model at startup - only if explicitly requested
     if os.getenv('PRELOAD_MODEL', 'false').lower() == 'true':
         load_model()

 import os
 import time
 import json
+import gc  # For garbage collection
 from pathlib import Path
 from flask import Flask, request, jsonify, Response
 from flask_cors import CORS
 import torch
 # Create cache directory if not exists
 cache_dir = Path(os.getenv('TRANSFORMERS_CACHE', '/app/cache'))
 CORS(app)  # Allow cross-origin requests
 # Model configuration
+# Use DeepSeek R1 Distill Qwen 1.5B model (much lighter than 7B)
+MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 MAX_NEW_TOKENS = 256
 DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
                 bnb_4bit_use_double_quant=True
             )
         else:
+            # For CPU, we'll use a different optimization approach
             quantization_config = None
         # Load tokenizer
         hf_token = os.environ.get("HF_TOKEN")
         token_kwargs = {"token": hf_token} if hf_token else {}
+        # Additional memory optimization settings for low resource environments
+        if DEVICE == "cpu":
+            # Load model with 8-bit quantization for CPU
+            try:
+                # Try int8 quantization for CPU
+                model = AutoModelForCausalLM.from_pretrained(
+                    MODEL_NAME,
+                    cache_dir=str(cache_dir),
+                    load_in_8bit=True,
+                    low_cpu_mem_usage=True,
+                    trust_remote_code=True,
+                    **token_kwargs
+                )
+            except Exception as e:
+                print(f"8-bit quantization failed, falling back to standard loading: {str(e)}")
+                model = AutoModelForCausalLM.from_pretrained(
+                    MODEL_NAME,
+                    cache_dir=str(cache_dir),
+                    low_cpu_mem_usage=True,
+                    trust_remote_code=True,
+                    **token_kwargs
+                )
+        else:
+            # Load model with 4-bit quantization for CUDA
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_NAME,
+                cache_dir=str(cache_dir),
+                device_map="auto",
+                torch_dtype=torch.float16,
+                quantization_config=quantization_config,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+                **token_kwargs
+            )
         print("✅ Model loaded successfully!")
         return True
     # Thinking phases
     thinking_steps = [
         "🔍 Analyzing your question...",
+        "🧠 Processing...",
+        "💡 Formulating response..."
     ]
+    # Stream thinking steps (fewer steps, faster timing for lighter model)
     for step in thinking_steps:
         yield json.dumps({"type": "thinking", "content": step}) + '\n'
+        time.sleep(0.5)  # Reduced timing for faster response
     # Prepare streaming generation
     try:
+        # Format prompt for the model (DeepSeek specific)
+        formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
         inputs = tokenizer(formatted_prompt, return_tensors="pt")
         if DEVICE == "cuda":
             inputs = inputs.to("cuda")
+        # Use memory efficient approach
         with torch.no_grad():
             generated_ids = model.generate(
                 **inputs,
         # Get output sequence
         output_ids = generated_ids.sequences[0][len(inputs.input_ids[0]):]
+        # Stream in slightly larger chunks for better performance
         full_output = ""
+        chunk_size = 5  # Increased number of tokens per chunk
         for i in range(0, len(output_ids), chunk_size):
             chunk_ids = output_ids[i:i+chunk_size]
             chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
                 "content": chunk_text
             }) + '\n'
+            # Smaller delay for faster streaming
+            time.sleep(0.03)
     except Exception as e:
         import traceback
     # Signal completion
     yield json.dumps({"type": "complete"}) + '\n'
+    # Clean up memory aggressively
     if DEVICE == "cuda":
         torch.cuda.empty_cache()
     gc.collect()
         return jsonify({"error": "Empty prompt"}), 400
     try:
+        # Format prompt for DeepSeek model
+        formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
         inputs = tokenizer(formatted_prompt, return_tensors="pt")
         if DEVICE == "cuda":
 @app.route('/health', methods=['GET'])
 def health_check():
     model_loaded = tokenizer is not None and model is not None
+    memory_info = "N/A"
+    # Get memory usage stats
+    if torch.cuda.is_available():
+        memory_info = f"{torch.cuda.memory_allocated()/1024**2:.2f}MB / {torch.cuda.get_device_properties(0).total_memory/1024**2:.2f}MB"
+    else:
+        import psutil
+        memory_info = f"{psutil.virtual_memory().used/1024**3:.2f}GB / {psutil.virtual_memory().total/1024**3:.2f}GB"
     try:
         # Check if we need to load the model
     status = {
         "status": "ok" if model_loaded else "waiting",
+        "model": MODEL_NAME,
         "model_loaded": model_loaded,
         "device": DEVICE,
         "cache_dir": str(cache_dir),
         "max_tokens": MAX_NEW_TOKENS,
+        "memory_usage": memory_info
     }
     return jsonify(status)
+@app.route('/unload', methods=['POST'])
+def unload_model():
+    """Endpoint to manually unload model and free memory"""
+    global model, tokenizer
+    if model is not None:
+        del model
+        model = None
+    if tokenizer is not None:
+        del tokenizer
+        tokenizer = None
+    # Force garbage collection
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    gc.collect()
+    return jsonify({"status": "Model unloaded", "memory_freed": True})
 @app.route('/')
 def home():
     return jsonify({
+        "service": "DeepSeek-1.5B Chat API",
         "status": "online",
         "endpoints": {
             "POST /chat": "Single-response chat",
             "POST /stream_chat": "Streaming chat with thinking steps",
+            "GET /health": "Service health check",
+            "POST /unload": "Unload model to free memory"
         },
         "config": {
             "model": MODEL_NAME,
             "max_tokens": MAX_NEW_TOKENS,
+            "device": DEVICE,
             "cache_location": str(cache_dir)
         }
     })
 if __name__ == '__main__':
+    # Load model at startup only if explicitly requested
     if os.getenv('PRELOAD_MODEL', 'false').lower() == 'true':
         load_model()