mike23415 committed
Commit c5dd812 · verified · 1 Parent(s): 7625bb8

Update app.py

Files changed (1)
  1. app.py +172 -150
app.py CHANGED
@@ -1,166 +1,188 @@

Old version of app.py (lines marked with - were removed):

 import os
 import time
-import json
-import gc
-from pathlib import Path
-from flask import Flask, request, jsonify, Response
-from flask_cors import CORS
 import torch

-# Caching
-cache_dir = Path(os.getenv('TRANSFORMERS_CACHE', '/app/cache'))
-cache_dir.mkdir(parents=True, exist_ok=True)

 app = Flask(__name__)
 CORS(app)

-MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
-MAX_NEW_TOKENS = 256
-DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

-tokenizer = None
-model = None

 def load_model():
-    global tokenizer, model
-    if tokenizer and model:
-        return True

-    try:
-        from transformers import AutoTokenizer, AutoModelForCausalLM

-        print(f"Loading {MODEL_NAME} on {DEVICE}...")
-        hf_token = os.environ.get("HF_TOKEN")
-        token_kwargs = {"token": hf_token} if hf_token else {}

-        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=str(cache_dir), trust_remote_code=False, **token_kwargs)

-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_NAME,
-            cache_dir=str(cache_dir),
-            torch_dtype=torch.bfloat16 if DEVICE == "cpu" else torch.float16,
-            low_cpu_mem_usage=True,
-            trust_remote_code=False,
-            **token_kwargs
-        )

-        if DEVICE == "cuda":
-            model = model.to("cuda")

-        print("✅ Phi-3 Mini loaded successfully!")
-        return True
-    except Exception as e:
-        print(f"❌ Model load failed: {e}")
-        return False

-def stream_generator(prompt):
-    if not load_model():
-        yield json.dumps({"type": "error", "content": "Model failed to load"}) + '\n'
-        return

-    thinking = ["🧠 Thinking...", "🤖 Preparing answer..."]
-    for step in thinking:
-        yield json.dumps({"type": "thinking", "content": step}) + '\n'
-        time.sleep(0.4)

-    try:
-        formatted_prompt = f"<|user|>\n{prompt}\n<|assistant|>\n"
-        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE if DEVICE == "cuda" else "cpu")

         with torch.no_grad():
-            output = model.generate(
-                **inputs,
-                max_new_tokens=MAX_NEW_TOKENS,
-                temperature=0.7,
-                top_p=0.9,
                 do_sample=True,
                 pad_token_id=tokenizer.eos_token_id
             )

-        new_tokens = output[0][inputs.input_ids.shape[-1]:]
-        generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

-        for i in range(0, len(generated_text), 12):
-            yield json.dumps({"type": "answer", "content": generated_text[i:i+12]}) + '\n'
-            time.sleep(0.03)

-    except Exception as e:
-        yield json.dumps({"type": "error", "content": str(e)}) + '\n'

-    yield json.dumps({"type": "complete"}) + '\n'
-    if DEVICE == "cuda":
-        torch.cuda.empty_cache()
-    gc.collect()

-@app.route('/stream_chat', methods=['POST'])
-def stream_chat():
-    data = request.get_json()
-    prompt = data.get('prompt', '').strip()
-    if not prompt:
-        return jsonify({"error": "Empty prompt"}), 400

-    return Response(
-        stream_generator(prompt),
-        mimetype='text/event-stream',
-        headers={
-            'Cache-Control': 'no-cache',
-            'X-Accel-Buffering': 'no',
-            'Connection': 'keep-alive'
-        }
-    )

-@app.route('/chat', methods=['POST'])
 def chat():
-    if not load_model():
-        return jsonify({"error": "Model failed to load"}), 500

-    data = request.get_json()
-    prompt = data.get('prompt', '').strip()
-    if not prompt:
-        return jsonify({"error": "Empty prompt"}), 400

     try:
-        formatted_prompt = f"<|user|>\n{prompt}\n<|assistant|>\n"
-        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE if DEVICE == "cuda" else "cpu")

-        with torch.no_grad():
-            output = model.generate(
-                **inputs,
-                max_new_tokens=MAX_NEW_TOKENS,
-                temperature=0.7,
-                top_p=0.9,
-                do_sample=True,
-                pad_token_id=tokenizer.eos_token_id
-            )

-        response_text = tokenizer.decode(output[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
-        return jsonify({"response": response_text})
-    except Exception as e:
-        return jsonify({"error": str(e)}), 500

-@app.route('/health')
-def health():
-    import psutil
-    model_loaded = model is not None
-    return jsonify({
-        "status": "ok" if model_loaded else "waiting",
-        "model_loaded": model_loaded,
-        "memory": f"{psutil.virtual_memory().used/1024**3:.2f}GB used",
-        "device": DEVICE,
-    })

-@app.route('/')
-def home():
-    return jsonify({
-        "service": "Phi-3 Mini Chat API",
-        "status": "online",
-        "endpoints": {
-            "POST /chat": "Single-response",
-            "POST /stream_chat": "Streaming chat"
         }
-    })

-if __name__ == '__main__':
-    if os.getenv('PRELOAD_MODEL', 'false') == 'true':
-        load_model()
-    app.run(host='0.0.0.0', port=int(os.environ.get("PORT", 5000)))

New version of app.py (lines marked with + were added):

 import os
 import time
 import torch
+from flask import Flask, request, jsonify
+from flask_cors import CORS
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import gradio as gr

+# Initialize Flask app
 app = Flask(__name__)
 CORS(app)

+# Global variables
+MODEL_ID = "microsoft/bitnet-b1.58-2B-4T"
+MAX_LENGTH = 2048
+MAX_NEW_TOKENS = 512
+TEMPERATURE = 0.7
+TOP_P = 0.9
+THINKING_STEPS = 3  # Number of thinking steps

+# Load model and tokenizer
+@app.before_first_request
 def load_model():
+    global model, tokenizer

+    print(f"Loading model: {MODEL_ID}")

+    # Load tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

+    # Load model with optimizations for limited resources
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        device_map="auto",
+        torch_dtype=torch.bfloat16,
+        load_in_4bit=True,
+    )

+    print("Model and tokenizer loaded successfully!")

+# Helper function for step-by-step thinking
+def generate_with_thinking(prompt, thinking_steps=THINKING_STEPS):
+    # Initialize conversation with prompt
+    full_prompt = prompt

+    # Add thinking prefix
+    thinking_prompt = full_prompt + "\n\nLet me think through this step by step:"

+    # Generate thinking steps
+    thinking_output = ""
+    for step in range(thinking_steps):
+        # Generate step i of thinking
+        inputs = tokenizer(thinking_prompt + thinking_output, return_tensors="pt").to(model.device)

         with torch.no_grad():
+            outputs = model.generate(
+                inputs["input_ids"],
+                max_length=MAX_LENGTH,
+                max_new_tokens=MAX_NEW_TOKENS // thinking_steps,
+                temperature=TEMPERATURE,
+                top_p=TOP_P,
                 do_sample=True,
                 pad_token_id=tokenizer.eos_token_id
             )

+        # Extract only new tokens
+        new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
+        thinking_step_output = tokenizer.decode(new_tokens, skip_special_tokens=True)

+        # Add this step to our thinking output
+        thinking_output += f"\n\nStep {step+1}: {thinking_step_output}"

+    # Now generate final answer based on the thinking
+    final_prompt = full_prompt + "\n\n" + thinking_output + "\n\nBased on this thinking, my final answer is:"

+    inputs = tokenizer(final_prompt, return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        outputs = model.generate(
+            inputs["input_ids"],
+            max_length=MAX_LENGTH,
+            max_new_tokens=MAX_NEW_TOKENS // 2,
+            temperature=TEMPERATURE,
+            top_p=TOP_P,
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id
+        )

+    # Extract only the new tokens (the answer)
+    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
+    answer = tokenizer.decode(new_tokens, skip_special_tokens=True)

+    # Return thinking process and final answer
+    return {
+        "thinking": thinking_output,
+        "answer": answer,
+        "full_response": thinking_output + "\n\nBased on this thinking, my final answer is: " + answer
+    }

+# API endpoint for chat
+@app.route('/api/chat', methods=['POST'])
 def chat():
     try:
+        data = request.json
+        prompt = data.get('prompt', '')
+        include_thinking = data.get('include_thinking', False)

+        if not prompt:
+            return jsonify({'error': 'Prompt is required'}), 400

+        start_time = time.time()
+        response = generate_with_thinking(prompt)
+        end_time = time.time()

+        result = {
+            'answer': response['answer'],
+            'time_taken': round(end_time - start_time, 2)
         }

+        # Include thinking steps if requested
+        if include_thinking:
+            result['thinking'] = response['thinking']

+        return jsonify(result)

+    except Exception as e:
+        return jsonify({'error': str(e)}), 500

+# Simple health check endpoint
+@app.route('/health', methods=['GET'])
+def health_check():
+    return jsonify({'status': 'ok'})

+# Gradio Web UI
+def create_ui():
+    with gr.Blocks() as demo:
+        gr.Markdown("# BitNet Specialist Chatbot with Step-by-Step Thinking")

+        with gr.Row():
+            with gr.Column():
+                input_text = gr.Textbox(
+                    label="Your question",
+                    placeholder="Ask me anything...",
+                    lines=3
+                )

+                with gr.Row():
+                    submit_btn = gr.Button("Submit")
+                    clear_btn = gr.Button("Clear")

+                show_thinking = gr.Checkbox(label="Show thinking steps", value=True)

+            with gr.Column():
+                thinking_output = gr.Markdown(label="Thinking Process", visible=True)
+                answer_output = gr.Markdown(label="Final Answer")

+        def respond(question, show_thinking):
+            if not question.strip():
+                return "", "Please enter a question"

+            response = generate_with_thinking(question)

+            if show_thinking:
+                return response["thinking"], response["answer"]
+            else:
+                return "", response["answer"]

+        submit_btn.click(
+            respond,
+            inputs=[input_text, show_thinking],
+            outputs=[thinking_output, answer_output]
+        )

+        clear_btn.click(
+            lambda: ("", "", ""),
+            inputs=None,
+            outputs=[input_text, thinking_output, answer_output]
+        )

+    return demo

+# Create Gradio UI and launch the app
+if __name__ == "__main__":
+    # Load model at startup for Gradio
+    load_model()

+    # Create and launch Gradio interface
+    demo = create_ui()
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
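
A small, model-free illustration of the prompt flow inside generate_with_thinking: each thinking step re-feeds the accumulated thinking text, and the final answer is then generated from the original prompt plus all thinking steps. This is only a sketch; fake_generate is a hypothetical stand-in for the model.generate/tokenizer.decode pair in the committed code.

def fake_generate(text):
    # Hypothetical stand-in for model.generate + tokenizer.decode
    return f"<continuation of a {len(text)}-character prompt>"

prompt = "Why is the sky blue?"
thinking_prompt = prompt + "\n\nLet me think through this step by step:"

thinking_output = ""
for step in range(3):  # THINKING_STEPS in the committed code
    # Each step sees the thinking prompt plus all previously generated steps
    step_text = fake_generate(thinking_prompt + thinking_output)
    thinking_output += f"\n\nStep {step+1}: {step_text}"

# The final answer is conditioned on the prompt and the accumulated thinking
final_prompt = prompt + "\n\n" + thinking_output + "\n\nBased on this thinking, my final answer is:"
answer = fake_generate(final_prompt)
print(answer)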
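
For reference, a minimal sketch of calling the new POST /api/chat route from a client. The request and response fields (prompt, include_thinking, answer, time_taken, thinking) are the ones defined in the updated chat() handler; the host and port are assumptions, since in this revision `python app.py` only launches the Gradio UI on port 7860, so the Flask routes would have to be served separately (for example with a WSGI server such as `gunicorn app:app`).

import requests

resp = requests.post(
    "http://localhost:5000/api/chat",        # assumed host/port for the Flask app
    json={
        "prompt": "Why is the sky blue?",    # required field
        "include_thinking": True,            # optional: also return the thinking steps
    },
    timeout=300,                             # generation can take a while on CPU
)
resp.raise_for_status()
data = resp.json()

print("Answer:", data["answer"])
print("Time taken (s):", data["time_taken"])
if "thinking" in data:                       # present only when include_thinking is true
    print("Thinking:", data["thinking"])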