Spaces:

mike23415
/

Thinking

Sleeping

App Files Community

mike23415 committed 29 days ago

Commit

7625bb8

verified ·

1 Parent(s): 45ef073

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -133

app.py CHANGED Viewed

@@ -7,130 +7,87 @@ from flask import Flask, request, jsonify, Response
 from flask_cors import CORS
 import torch
-# Cache and model settings
 cache_dir = Path(os.getenv('TRANSFORMERS_CACHE', '/app/cache'))
 cache_dir.mkdir(parents=True, exist_ok=True)
 app = Flask(__name__)
 CORS(app)
-MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 MAX_NEW_TOKENS = 256
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 tokenizer = None
 model = None
 def load_model():
     global tokenizer, model
-    if tokenizer is not None and model is not None:
         return True
     try:
-        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-        print(f"Loading model: {MODEL_NAME}")
-        print(f"Device: {DEVICE}")
-        # HF auth token if needed
         hf_token = os.environ.get("HF_TOKEN")
         token_kwargs = {"token": hf_token} if hf_token else {}
-        tokenizer = AutoTokenizer.from_pretrained(
             MODEL_NAME,
             cache_dir=str(cache_dir),
-            trust_remote_code=True,
             **token_kwargs
         )
         if DEVICE == "cuda":
-            quant_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_compute_dtype=torch.float16,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_use_double_quant=True
-            )
-            model = AutoModelForCausalLM.from_pretrained(
-                MODEL_NAME,
-                cache_dir=str(cache_dir),
-                trust_remote_code=True,
-                device_map="auto",
-                quantization_config=quant_config,
-                torch_dtype=torch.float16,
-                low_cpu_mem_usage=True,
-                **token_kwargs
-            )
-        else:
-            # CPU: no quantization_config; use float16 if possible
-            model = AutoModelForCausalLM.from_pretrained(
-                MODEL_NAME,
-                cache_dir=str(cache_dir),
-                trust_remote_code=True,
-                torch_dtype=torch.float16,
-                low_cpu_mem_usage=True,
-                **token_kwargs
-            )
-        print("✅ Model loaded successfully")
         return True
     except Exception as e:
-        print(f"❌ Failed to load model: {e}")
         return False
 def stream_generator(prompt):
     if not load_model():
-        yield json.dumps({"type": "error", "content": "Model not loaded"}) + '\n'
         return
-    thinking_steps = [
-        "🔍 Analyzing your question...",
-        "🧠 Processing...",
-        "💡 Formulating response..."
-    ]
-    for step in thinking_steps:
         yield json.dumps({"type": "thinking", "content": step}) + '\n'
-        time.sleep(0.5)
     try:
-        formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-        inputs = tokenizer(formatted_prompt, return_tensors="pt")
-        if DEVICE == "cuda":
-            inputs = inputs.to("cuda")
         with torch.no_grad():
-            generated_ids = model.generate(
                 **inputs,
                 max_new_tokens=MAX_NEW_TOKENS,
                 temperature=0.7,
                 top_p=0.9,
                 do_sample=True,
-                pad_token_id=tokenizer.eos_token_id,
-                return_dict_in_generate=True,
-                output_scores=False
             )
-        output_ids = generated_ids.sequences[0][len(inputs.input_ids[0]):]
-        full_output = ""
-        chunk_size = 5
-        for i in range(0, len(output_ids), chunk_size):
-            chunk_ids = output_ids[i:i + chunk_size]
-            chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
-            full_output += chunk_text
-            yield json.dumps({"type": "answer", "content": chunk_text}) + '\n'
             time.sleep(0.03)
     except Exception as e:
-        import traceback
-        error_details = f"Error: {str(e)}\n{traceback.format_exc()}"
-        print(error_details)
         yield json.dumps({"type": "error", "content": str(e)}) + '\n'
     yield json.dumps({"type": "complete"}) + '\n'
     if DEVICE == "cuda":
         torch.cuda.empty_cache()
     gc.collect()
@@ -139,7 +96,6 @@ def stream_generator(prompt):
 def stream_chat():
     data = request.get_json()
     prompt = data.get('prompt', '').strip()
     if not prompt:
         return jsonify({"error": "Empty prompt"}), 400
@@ -160,18 +116,15 @@ def chat():
     data = request.get_json()
     prompt = data.get('prompt', '').strip()
     if not prompt:
         return jsonify({"error": "Empty prompt"}), 400
     try:
-        formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-        inputs = tokenizer(formatted_prompt, return_tensors="pt")
-        if DEVICE == "cuda":
-            inputs = inputs.to("cuda")
         with torch.no_grad():
-            outputs = model.generate(
                 **inputs,
                 max_new_tokens=MAX_NEW_TOKENS,
                 temperature=0.7,
@@ -180,78 +133,34 @@ def chat():
                 pad_token_id=tokenizer.eos_token_id
             )
-        response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
-        if DEVICE == "cuda":
-            torch.cuda.empty_cache()
-        gc.collect()
-        return jsonify({"response": response})
     except Exception as e:
-        import traceback
-        print(f"Error: {e}\n{traceback.format_exc()}")
         return jsonify({"error": str(e)}), 500
-@app.route('/health', methods=['GET'])
-def health_check():
-    model_loaded = tokenizer is not None and model is not None
-    memory_info = "N/A"
-    if torch.cuda.is_available():
-        memory_info = f"{torch.cuda.memory_allocated()/1024**2:.2f}MB / {torch.cuda.get_device_properties(0).total_memory/1024**2:.2f}MB"
-    else:
-        import psutil
-        memory_info = f"{psutil.virtual_memory().used/1024**3:.2f}GB / {psutil.virtual_memory().total/1024**3:.2f}GB"
-    if not model_loaded and request.args.get('load') == 'true':
-        model_loaded = load_model()
     return jsonify({
         "status": "ok" if model_loaded else "waiting",
-        "model": MODEL_NAME,
         "model_loaded": model_loaded,
         "device": DEVICE,
-        "cache_dir": str(cache_dir),
-        "max_tokens": MAX_NEW_TOKENS,
-        "memory_usage": memory_info
     })
-@app.route('/unload', methods=['POST'])
-def unload_model():
-    global model, tokenizer
-    if model is not None:
-        del model
-        model = None
-    if tokenizer is not None:
-        del tokenizer
-        tokenizer = None
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    gc.collect()
-    return jsonify({"status": "Model unloaded", "memory_freed": True})
 @app.route('/')
 def home():
     return jsonify({
-        "service": "DeepSeek-1.5B Chat API",
         "status": "online",
         "endpoints": {
-            "POST /chat": "Single-response chat",
-            "POST /stream_chat": "Streaming chat",
-            "GET /health": "Service health check",
-            "POST /unload": "Unload model"
-        },
-        "config": {
-            "model": MODEL_NAME,
-            "max_tokens": MAX_NEW_TOKENS,
-            "device": DEVICE,
-            "cache_location": str(cache_dir)
         }
     })
 if __name__ == '__main__':
-    if os.getenv('PRELOAD_MODEL', 'false').lower() == 'true':
         load_model()
-    port = int(os.environ.get("PORT", 5000))
-    app.run(host='0.0.0.0', port=port)

 from flask_cors import CORS
 import torch
+# Caching
 cache_dir = Path(os.getenv('TRANSFORMERS_CACHE', '/app/cache'))
 cache_dir.mkdir(parents=True, exist_ok=True)
 app = Flask(__name__)
 CORS(app)
+MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
 MAX_NEW_TOKENS = 256
+DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
 tokenizer = None
 model = None
 def load_model():
     global tokenizer, model
+    if tokenizer and model:
         return True
     try:
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+        print(f"Loading {MODEL_NAME} on {DEVICE}...")
         hf_token = os.environ.get("HF_TOKEN")
         token_kwargs = {"token": hf_token} if hf_token else {}
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=str(cache_dir), trust_remote_code=False, **token_kwargs)
+        model = AutoModelForCausalLM.from_pretrained(
             MODEL_NAME,
             cache_dir=str(cache_dir),
+            torch_dtype=torch.bfloat16 if DEVICE == "cpu" else torch.float16,
+            low_cpu_mem_usage=True,
+            trust_remote_code=False,
             **token_kwargs
         )
         if DEVICE == "cuda":
+            model = model.to("cuda")
+        print("✅ Phi-3 Mini loaded successfully!")
         return True
     except Exception as e:
+        print(f"❌ Model load failed: {e}")
         return False
 def stream_generator(prompt):
     if not load_model():
+        yield json.dumps({"type": "error", "content": "Model failed to load"}) + '\n'
         return
+    thinking = ["🧠 Thinking...", "🤖 Preparing answer..."]
+    for step in thinking:
         yield json.dumps({"type": "thinking", "content": step}) + '\n'
+        time.sleep(0.4)
     try:
+        formatted_prompt = f"<|user|>\n{prompt}\n<|assistant|>\n"
+        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE if DEVICE == "cuda" else "cpu")
         with torch.no_grad():
+            output = model.generate(
                 **inputs,
                 max_new_tokens=MAX_NEW_TOKENS,
                 temperature=0.7,
                 top_p=0.9,
                 do_sample=True,
+                pad_token_id=tokenizer.eos_token_id
             )
+        new_tokens = output[0][inputs.input_ids.shape[-1]:]
+        generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
+        for i in range(0, len(generated_text), 12):
+            yield json.dumps({"type": "answer", "content": generated_text[i:i+12]}) + '\n'
             time.sleep(0.03)
     except Exception as e:
         yield json.dumps({"type": "error", "content": str(e)}) + '\n'
     yield json.dumps({"type": "complete"}) + '\n'
     if DEVICE == "cuda":
         torch.cuda.empty_cache()
     gc.collect()
 def stream_chat():
     data = request.get_json()
     prompt = data.get('prompt', '').strip()
     if not prompt:
         return jsonify({"error": "Empty prompt"}), 400
     data = request.get_json()
     prompt = data.get('prompt', '').strip()
     if not prompt:
         return jsonify({"error": "Empty prompt"}), 400
     try:
+        formatted_prompt = f"<|user|>\n{prompt}\n<|assistant|>\n"
+        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE if DEVICE == "cuda" else "cpu")
         with torch.no_grad():
+            output = model.generate(
                 **inputs,
                 max_new_tokens=MAX_NEW_TOKENS,
                 temperature=0.7,
                 pad_token_id=tokenizer.eos_token_id
             )
+        response_text = tokenizer.decode(output[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
+        return jsonify({"response": response_text})
     except Exception as e:
         return jsonify({"error": str(e)}), 500
+@app.route('/health')
+def health():
+    import psutil
+    model_loaded = model is not None
     return jsonify({
         "status": "ok" if model_loaded else "waiting",
         "model_loaded": model_loaded,
+        "memory": f"{psutil.virtual_memory().used/1024**3:.2f}GB used",
         "device": DEVICE,
     })
 @app.route('/')
 def home():
     return jsonify({
+        "service": "Phi-3 Mini Chat API",
         "status": "online",
         "endpoints": {
+            "POST /chat": "Single-response",
+            "POST /stream_chat": "Streaming chat"
         }
     })
 if __name__ == '__main__':
+    if os.getenv('PRELOAD_MODEL', 'false') == 'true':
         load_model()
+    app.run(host='0.0.0.0', port=int(os.environ.get("PORT", 5000)))