mike23415 committed on
Commit 45ef073 · verified · 1 Parent(s): fdb5001

Update app.py

Files changed (1):
  1. app.py (+77, -137)
app.py CHANGED
@@ -1,135 +1,105 @@
  import os
  import time
  import json
- import gc  # For garbage collection
  from pathlib import Path
  from flask import Flask, request, jsonify, Response
  from flask_cors import CORS
  import torch

- # Create cache directory if not exists
  cache_dir = Path(os.getenv('TRANSFORMERS_CACHE', '/app/cache'))
  cache_dir.mkdir(parents=True, exist_ok=True)

  app = Flask(__name__)
- CORS(app)  # Allow cross-origin requests

- # Model configuration
- # Use DeepSeek R1 Distill Qwen 1.5B model (much lighter than 7B)
  MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
  MAX_NEW_TOKENS = 256
- DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

- # Initialize model variables
  tokenizer = None
  model = None

  def load_model():
-     """Load model on first request to save memory at startup"""
      global tokenizer, model
-
      if tokenizer is not None and model is not None:
          return True
-
      try:
          from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-         print(f"Loading model {MODEL_NAME}...")
-         print(f"Using device: {DEVICE}")
-         print(f"Cache directory: {cache_dir}")
-
-         # Use 4-bit quantization for memory efficiency if on CUDA
          if DEVICE == "cuda":
-             quantization_config = BitsAndBytesConfig(
                  load_in_4bit=True,
                  bnb_4bit_compute_dtype=torch.float16,
                  bnb_4bit_quant_type="nf4",
                  bnb_4bit_use_double_quant=True
              )
-         else:
-             # For CPU, we'll use a different optimization approach
-             quantization_config = None
-
-         # Load tokenizer
-         tokenizer = AutoTokenizer.from_pretrained(
-             MODEL_NAME,
-             cache_dir=str(cache_dir),
-             trust_remote_code=True
-         )
-
-         # Configure token if HF_TOKEN is set
-         hf_token = os.environ.get("HF_TOKEN")
-         token_kwargs = {"token": hf_token} if hf_token else {}
-
-         # Additional memory optimization settings for low resource environments
-         if DEVICE == "cpu":
-             # Load model with 8-bit quantization for CPU
-             try:
-                 # Try int8 quantization for CPU
-                 model = AutoModelForCausalLM.from_pretrained(
-                     MODEL_NAME,
-                     cache_dir=str(cache_dir),
-                     load_in_8bit=True,
-                     low_cpu_mem_usage=True,
-                     trust_remote_code=True,
-                     **token_kwargs
-                 )
-             except Exception as e:
-                 print(f"8-bit quantization failed, falling back to standard loading: {str(e)}")
-                 model = AutoModelForCausalLM.from_pretrained(
-                     MODEL_NAME,
-                     cache_dir=str(cache_dir),
-                     low_cpu_mem_usage=True,
-                     trust_remote_code=True,
-                     **token_kwargs
-                 )
-         else:
-             # Load model with 4-bit quantization for CUDA
              model = AutoModelForCausalLM.from_pretrained(
                  MODEL_NAME,
                  cache_dir=str(cache_dir),
                  device_map="auto",
                  torch_dtype=torch.float16,
-                 quantization_config=quantization_config,
                  low_cpu_mem_usage=True,
                  trust_remote_code=True,
                  **token_kwargs
              )
-
-         print("✅ Model loaded successfully!")
          return True
      except Exception as e:
-         print(f"❌ Model loading failed: {str(e)}")
          return False

  def stream_generator(prompt):
-     """Generator function for streaming response with thinking steps"""
-     # Ensure model is loaded
      if not load_model():
          yield json.dumps({"type": "error", "content": "Model not loaded"}) + '\n'
          return
-
-     # Thinking phases
      thinking_steps = [
          "🔍 Analyzing your question...",
          "🧠 Processing...",
          "💡 Formulating response..."
      ]
-
-     # Stream thinking steps (fewer steps, faster timing for lighter model)
      for step in thinking_steps:
          yield json.dumps({"type": "thinking", "content": step}) + '\n'
-         time.sleep(0.5)  # Reduced timing for faster response
-
-     # Prepare streaming generation
      try:
-         # Format prompt for the model (DeepSeek specific)
          formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-
          inputs = tokenizer(formatted_prompt, return_tensors="pt")
          if DEVICE == "cuda":
              inputs = inputs.to("cuda")
-
-         # Use memory efficient approach
          with torch.no_grad():
              generated_ids = model.generate(
                  **inputs,
@@ -139,40 +109,28 @@ def stream_generator(prompt):
                  do_sample=True,
                  pad_token_id=tokenizer.eos_token_id,
                  return_dict_in_generate=True,
-                 output_scores=False)
-
-         # Get output sequence
          output_ids = generated_ids.sequences[0][len(inputs.input_ids[0]):]
-
-         # Stream in slightly larger chunks for better performance
          full_output = ""
-         chunk_size = 5  # Increased number of tokens per chunk
          for i in range(0, len(output_ids), chunk_size):
-             chunk_ids = output_ids[i:i+chunk_size]
              chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
              full_output += chunk_text
-
-             yield json.dumps({
-                 "type": "answer",
-                 "content": chunk_text
-             }) + '\n'
-
-             # Smaller delay for faster streaming
              time.sleep(0.03)
-
      except Exception as e:
          import traceback
          error_details = f"Error: {str(e)}\n{traceback.format_exc()}"
          print(error_details)
-         yield json.dumps({
-             "type": "error",
-             "content": f"Generation error: {str(e)}"
-         }) + '\n'
-
-     # Signal completion
      yield json.dumps({"type": "complete"}) + '\n'
-
-     # Clean up memory aggressively
      if DEVICE == "cuda":
          torch.cuda.empty_cache()
      gc.collect()
@@ -181,40 +139,37 @@ def stream_generator(prompt):
  def stream_chat():
      data = request.get_json()
      prompt = data.get('prompt', '').strip()
-
      if not prompt:
          return jsonify({"error": "Empty prompt"}), 400
-
      return Response(
          stream_generator(prompt),
          mimetype='text/event-stream',
          headers={
              'Cache-Control': 'no-cache',
-             'X-Accel-Buffering': 'no',  # Prevent Nginx buffering
              'Connection': 'keep-alive'
          }
      )

  @app.route('/chat', methods=['POST'])
  def chat():
-     # Ensure model is loaded
      if not load_model():
          return jsonify({"error": "Model failed to load"}), 500
-
      data = request.get_json()
      prompt = data.get('prompt', '').strip()
-
      if not prompt:
          return jsonify({"error": "Empty prompt"}), 400
-
      try:
-         # Format prompt for DeepSeek model
          formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-
          inputs = tokenizer(formatted_prompt, return_tensors="pt")
          if DEVICE == "cuda":
              inputs = inputs.to("cuda")
-
          with torch.no_grad():
              outputs = model.generate(
                  **inputs,
@@ -222,43 +177,37 @@ def chat():
                  temperature=0.7,
                  top_p=0.9,
                  do_sample=True,
-                 pad_token_id=tokenizer.eos_token_id)
-
          response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
-
-         # Clean up memory
          if DEVICE == "cuda":
              torch.cuda.empty_cache()
          gc.collect()
-
          return jsonify({"response": response})
-
      except Exception as e:
          import traceback
-         error_details = f"Error: {str(e)}\n{traceback.format_exc()}"
-         print(error_details)
          return jsonify({"error": str(e)}), 500

  @app.route('/health', methods=['GET'])
  def health_check():
      model_loaded = tokenizer is not None and model is not None
      memory_info = "N/A"
-
-     # Get memory usage stats
      if torch.cuda.is_available():
          memory_info = f"{torch.cuda.memory_allocated()/1024**2:.2f}MB / {torch.cuda.get_device_properties(0).total_memory/1024**2:.2f}MB"
      else:
          import psutil
          memory_info = f"{psutil.virtual_memory().used/1024**3:.2f}GB / {psutil.virtual_memory().total/1024**3:.2f}GB"
-
-     try:
-         # Check if we need to load the model
-         if not model_loaded and request.args.get('load') == 'true':
-             model_loaded = load_model()
-     except Exception as e:
-         print(f"Health check error: {str(e)}")
-
-     status = {
          "status": "ok" if model_loaded else "waiting",
          "model": MODEL_NAME,
          "model_loaded": model_loaded,
@@ -266,27 +215,20 @@ def health_check():
          "cache_dir": str(cache_dir),
          "max_tokens": MAX_NEW_TOKENS,
          "memory_usage": memory_info
-     }
-     return jsonify(status)

  @app.route('/unload', methods=['POST'])
  def unload_model():
-     """Endpoint to manually unload model and free memory"""
      global model, tokenizer
-
      if model is not None:
          del model
          model = None
-
      if tokenizer is not None:
          del tokenizer
          tokenizer = None
-
-     # Force garbage collection
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
      gc.collect()
-
      return jsonify({"status": "Model unloaded", "memory_freed": True})

  @app.route('/')
@@ -296,9 +238,9 @@ def home():
          "status": "online",
          "endpoints": {
              "POST /chat": "Single-response chat",
-             "POST /stream_chat": "Streaming chat with thinking steps",
              "GET /health": "Service health check",
-             "POST /unload": "Unload model to free memory"
          },
          "config": {
              "model": MODEL_NAME,
@@ -309,9 +251,7 @@ def home():
      })

  if __name__ == '__main__':
-     # Load model at startup only if explicitly requested
      if os.getenv('PRELOAD_MODEL', 'false').lower() == 'true':
          load_model()
-
      port = int(os.environ.get("PORT", 5000))
-     app.run(host='0.0.0.0', port=port)
 
  import os
  import time
  import json
+ import gc
  from pathlib import Path
  from flask import Flask, request, jsonify, Response
  from flask_cors import CORS
  import torch

+ # Cache and model settings
  cache_dir = Path(os.getenv('TRANSFORMERS_CACHE', '/app/cache'))
  cache_dir.mkdir(parents=True, exist_ok=True)

  app = Flask(__name__)
+ CORS(app)

  MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
  MAX_NEW_TOKENS = 256
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  tokenizer = None
  model = None

  def load_model():
      global tokenizer, model
+
      if tokenizer is not None and model is not None:
          return True
+
      try:
          from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+         print(f"Loading model: {MODEL_NAME}")
+         print(f"Device: {DEVICE}")
+
+         # HF auth token if needed
+         hf_token = os.environ.get("HF_TOKEN")
+         token_kwargs = {"token": hf_token} if hf_token else {}
+
+         tokenizer = AutoTokenizer.from_pretrained(
+             MODEL_NAME,
+             cache_dir=str(cache_dir),
+             trust_remote_code=True,
+             **token_kwargs
+         )
+
          if DEVICE == "cuda":
+             quant_config = BitsAndBytesConfig(
                  load_in_4bit=True,
                  bnb_4bit_compute_dtype=torch.float16,
                  bnb_4bit_quant_type="nf4",
                  bnb_4bit_use_double_quant=True
              )
              model = AutoModelForCausalLM.from_pretrained(
                  MODEL_NAME,
                  cache_dir=str(cache_dir),
+                 trust_remote_code=True,
                  device_map="auto",
+                 quantization_config=quant_config,
                  torch_dtype=torch.float16,
                  low_cpu_mem_usage=True,
+                 **token_kwargs
+             )
+         else:
+             # CPU: no quantization_config; use float16 if possible
+             model = AutoModelForCausalLM.from_pretrained(
+                 MODEL_NAME,
+                 cache_dir=str(cache_dir),
                  trust_remote_code=True,
+                 torch_dtype=torch.float16,
+                 low_cpu_mem_usage=True,
                  **token_kwargs
              )
+
+         print("✅ Model loaded successfully")
          return True
+
      except Exception as e:
+         print(f"❌ Failed to load model: {e}")
          return False

  def stream_generator(prompt):
      if not load_model():
          yield json.dumps({"type": "error", "content": "Model not loaded"}) + '\n'
          return
+
      thinking_steps = [
          "🔍 Analyzing your question...",
          "🧠 Processing...",
          "💡 Formulating response..."
      ]
+
      for step in thinking_steps:
          yield json.dumps({"type": "thinking", "content": step}) + '\n'
+         time.sleep(0.5)
+
      try:
          formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
          inputs = tokenizer(formatted_prompt, return_tensors="pt")
          if DEVICE == "cuda":
              inputs = inputs.to("cuda")
+
          with torch.no_grad():
              generated_ids = model.generate(
                  **inputs,

                  do_sample=True,
                  pad_token_id=tokenizer.eos_token_id,
                  return_dict_in_generate=True,
+                 output_scores=False
+             )
+
          output_ids = generated_ids.sequences[0][len(inputs.input_ids[0]):]
          full_output = ""
+         chunk_size = 5
+
          for i in range(0, len(output_ids), chunk_size):
+             chunk_ids = output_ids[i:i + chunk_size]
              chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
              full_output += chunk_text
+             yield json.dumps({"type": "answer", "content": chunk_text}) + '\n'
              time.sleep(0.03)
+
      except Exception as e:
          import traceback
          error_details = f"Error: {str(e)}\n{traceback.format_exc()}"
          print(error_details)
+         yield json.dumps({"type": "error", "content": str(e)}) + '\n'
+
      yield json.dumps({"type": "complete"}) + '\n'
+
      if DEVICE == "cuda":
          torch.cuda.empty_cache()
      gc.collect()

  def stream_chat():
      data = request.get_json()
      prompt = data.get('prompt', '').strip()
+
      if not prompt:
          return jsonify({"error": "Empty prompt"}), 400
+
      return Response(
          stream_generator(prompt),
          mimetype='text/event-stream',
          headers={
              'Cache-Control': 'no-cache',
+             'X-Accel-Buffering': 'no',
              'Connection': 'keep-alive'
          }
      )

  @app.route('/chat', methods=['POST'])
  def chat():
      if not load_model():
          return jsonify({"error": "Model failed to load"}), 500
+
      data = request.get_json()
      prompt = data.get('prompt', '').strip()
+
      if not prompt:
          return jsonify({"error": "Empty prompt"}), 400
+
      try:
          formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
          inputs = tokenizer(formatted_prompt, return_tensors="pt")
          if DEVICE == "cuda":
              inputs = inputs.to("cuda")
+
          with torch.no_grad():
              outputs = model.generate(
                  **inputs,

                  temperature=0.7,
                  top_p=0.9,
                  do_sample=True,
+                 pad_token_id=tokenizer.eos_token_id
+             )
+
          response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
+
          if DEVICE == "cuda":
              torch.cuda.empty_cache()
          gc.collect()
+
          return jsonify({"response": response})
+
      except Exception as e:
          import traceback
+         print(f"Error: {e}\n{traceback.format_exc()}")
          return jsonify({"error": str(e)}), 500

  @app.route('/health', methods=['GET'])
  def health_check():
      model_loaded = tokenizer is not None and model is not None
      memory_info = "N/A"
+
      if torch.cuda.is_available():
          memory_info = f"{torch.cuda.memory_allocated()/1024**2:.2f}MB / {torch.cuda.get_device_properties(0).total_memory/1024**2:.2f}MB"
      else:
          import psutil
          memory_info = f"{psutil.virtual_memory().used/1024**3:.2f}GB / {psutil.virtual_memory().total/1024**3:.2f}GB"
+
+     if not model_loaded and request.args.get('load') == 'true':
+         model_loaded = load_model()
+
+     return jsonify({
          "status": "ok" if model_loaded else "waiting",
          "model": MODEL_NAME,
          "model_loaded": model_loaded,

          "cache_dir": str(cache_dir),
          "max_tokens": MAX_NEW_TOKENS,
          "memory_usage": memory_info
+     })

  @app.route('/unload', methods=['POST'])
  def unload_model():
      global model, tokenizer
      if model is not None:
          del model
          model = None
      if tokenizer is not None:
          del tokenizer
          tokenizer = None
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
      gc.collect()
      return jsonify({"status": "Model unloaded", "memory_freed": True})

  @app.route('/')

          "status": "online",
          "endpoints": {
              "POST /chat": "Single-response chat",
+             "POST /stream_chat": "Streaming chat",
              "GET /health": "Service health check",
+             "POST /unload": "Unload model"
          },
          "config": {
              "model": MODEL_NAME,

      })

  if __name__ == '__main__':
      if os.getenv('PRELOAD_MODEL', 'false').lower() == 'true':
          load_model()
      port = int(os.environ.get("PORT", 5000))
+     app.run(host='0.0.0.0', port=port)
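
The updated app.py exposes a blocking POST /chat, a newline-delimited JSON stream at POST /stream_chat (each line is an object with "type" and "content" keys, ending with a "complete" event), and a GET /health probe that can lazily load the model via ?load=true. Below is a minimal client sketch, not part of this commit, assuming a local deployment on the default PORT (5000) and the third-party requests package; BASE_URL and the stream_chat helper are illustrative names, not part of the repository.

# Hypothetical client sketch for the endpoints defined in app.py above.
# Assumptions: server reachable at http://localhost:5000, `requests` installed.
import json
import requests

BASE_URL = "http://localhost:5000"  # assumption: local deployment, default PORT

def stream_chat(prompt: str) -> str:
    """Consume the newline-delimited JSON stream from POST /stream_chat."""
    answer = []
    with requests.post(f"{BASE_URL}/stream_chat", json={"prompt": prompt}, stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            event = json.loads(line)
            if event["type"] == "thinking":
                print(f"[thinking] {event['content']}")
            elif event["type"] == "answer":
                answer.append(event["content"])
            elif event["type"] == "error":
                raise RuntimeError(event["content"])
            elif event["type"] == "complete":
                break
    return "".join(answer)

if __name__ == "__main__":
    # Optional warm-up: /health?load=true triggers load_model() on the server.
    print(requests.get(f"{BASE_URL}/health", params={"load": "true"}).json())
    print(stream_chat("Explain what a distilled language model is."))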