mike23415 committed · verified
Commit 5d565fc · 1 Parent(s): ec804b3

Update app.py

Files changed (1):
  1. app.py (+112 -64)
app.py CHANGED
@@ -7,6 +7,7 @@ import logging
 import threading
 import queue
 import json
+import gc
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
 # Set up logging
@@ -17,6 +18,9 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
+# Print startup banner for visibility in logs
+print("\n===== Application Startup at", time.strftime("%Y-%m-%d %H:%M:%S"), "=====\n")
+
 # Fix caching issue on Hugging Face Spaces
 os.environ["TRANSFORMERS_CACHE"] = "/tmp"
 os.environ["HF_HOME"] = "/tmp"
@@ -32,11 +36,36 @@ logger.info(f"Using device: {device}")
 tokenizer = None
 model = None
 
+# Check available system resources
+def log_system_info():
+    # Basic system info
+    logger.info(f"Python version: {os.sys.version}")
+
+    # CPU info
+    import multiprocessing
+    logger.info(f"CPU cores: {multiprocessing.cpu_count()}")
+
+    # Memory info
+    try:
+        import psutil
+        mem = psutil.virtual_memory()
+        logger.info(f"Memory: Total={mem.total/1e9:.1f}GB, Available={mem.available/1e9:.1f}GB ({mem.percent}% used)")
+    except ImportError:
+        logger.info("psutil not installed, skipping detailed memory info")
+
+    # PyTorch info
+    logger.info(f"PyTorch version: {torch.__version__}")
+    logger.info(f"CUDA available: {torch.cuda.is_available()}")
+    if torch.cuda.is_available():
+        logger.info(f"CUDA version: {torch.version.cuda}")
+        logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
+
 # Initialize models once on startup
 def initialize_models():
     global tokenizer, model
     try:
         logger.info("Loading language model...")
+        log_system_info()
 
         # You can change the model here if needed
         model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # Good balance of quality and speed for CPU
@@ -45,17 +74,25 @@ def initialize_models():
         logger.info(f"Loading tokenizer: {model_name}")
         tokenizer = AutoTokenizer.from_pretrained(
             model_name,
-            use_fast=True  # Use the fast tokenizers when available
+            use_fast=True,  # Use the fast tokenizers when available
+            local_files_only=False  # Allow downloading if not cached
         )
 
+        # Free up memory before loading model
+        gc.collect()
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
         # Load model with optimizations for CPU
         logger.info(f"Loading model: {model_name}")
+
+        # Set lower precision for CPU to reduce memory usage
+        torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
-            torch_dtype=torch.float16,  # Use float16 for lower memory
-            device_map="cpu",  # Explicitly set to CPU
+            torch_dtype=torch_dtype,
             low_cpu_mem_usage=True,  # Optimize memory loading
-            offload_folder="offload"  # Use disk offloading if needed
+            device_map="auto"  # Let the system decide optimal device mapping
         )
 
         # Handle padding tokens
@@ -74,6 +111,21 @@ def initialize_models():
         logger.error(f"Error initializing models: {str(e)}")
         raise
 
+# TextStreamer class for token-by-token generation
+class TextStreamer:
+    def __init__(self, tokenizer, queue):
+        self.tokenizer = tokenizer
+        self.queue = queue
+        self.current_tokens = []
+
+    def put(self, token_ids):
+        self.current_tokens.extend(token_ids.tolist())
+        text = self.tokenizer.decode(self.current_tokens, skip_special_tokens=True)
+        self.queue.put(text)
+
+    def end(self):
+        pass
+
 # Function to simulate "thinking" process
 def thinking_process(message, result_queue):
     """
@@ -84,19 +136,10 @@ def thinking_process(message, result_queue):
     # Simulate explicit thinking stage
     logger.info(f"Thinking about: '{message}'")
 
-    # Pause to simulate deeper thinking (helps with more complex queries)
-    time.sleep(1)
-
     # Create thoughtful prompt with system message and thinking instructions
     prompt = f"""<|im_start|>system
 You are a helpful, friendly, and thoughtful AI assistant.
-Let's approach the user's request step by step:
-1. First, understand what the user is really asking
-2. Consider the key aspects we need to address
-3. Think about the best way to structure the response
-4. Provide clear, accurate information in a conversational tone
-
-Always think carefully before responding, consider different angles, and provide thoughtful, detailed answers.
+Let's approach the user's request step by step.
 <|im_end|>
 <|im_start|>user
 {message}<|im_end|>
@@ -104,30 +147,28 @@ Always think carefully before responding, consider different angles, and provide
 """
 
     # Handle inputs
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
-    inputs = {k: v.to('cpu') for k, v in inputs.items()}
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
+    inputs = {k: v.to(device) for k, v in inputs.items()}
 
     # Generate answer with streaming
     streamer = TextStreamer(tokenizer, result_queue)
 
     # Simulate thinking first by sending some initial dots
    result_queue.put("Let me think about this...")
-    time.sleep(0.5)
 
-    # Generate response - we use a temperature of 0.7 for more thoughtful outputs
-    # and top_p for nucleus sampling to avoid repetitive or generic responses
+    # Generate response with simpler parameters to avoid memory issues
    try:
-        model.generate(
-            **inputs,
-            max_new_tokens=512,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True,
-            streamer=streamer,
-            num_beams=2,  # Using 2 beams helps with coherence
-            no_repeat_ngram_size=3,
-            repetition_penalty=1.2  # Discourages token repetition
-        )
+        with torch.no_grad():  # Disable gradient calculation to save memory
+            model.generate(
+                **inputs,
+                max_new_tokens=256,  # Reduced from 512
+                temperature=0.7,
+                top_p=0.9,
+                do_sample=True,
+                streamer=streamer,
+                num_beams=1,  # Reduced from 2
+                repetition_penalty=1.2
+            )
    except Exception as e:
        logger.error(f"Model generation error: {str(e)}")
        result_queue.put(f"\n\nI apologize, but I encountered an error while processing your request.")
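Note on the streaming path above: thinking_process() enqueues text through the custom TextStreamer, which decodes the accumulated tokens on every put() and pushes the full text so far onto result_queue, then signals completion with None; the /chat route, whose body sits outside these hunks, can then drain the queue. Below is a minimal, self-contained sketch of that queue hand-off, with a stub producer standing in for model.generate(); all names in the sketch are illustrative and not part of app.py.

# Minimal sketch of the producer/consumer hand-off (illustrative names only;
# fake_thinking_process stands in for thinking_process + model.generate).
import queue
import threading
import time


def fake_thinking_process(message, result_queue):
    # Producer: emit partial text, then the None sentinel, mirroring how
    # TextStreamer enqueues the full decoded text so far on each put().
    result_queue.put("Let me think about this...")
    text = ""
    for word in ["The", " answer", " to", f" '{message}'", " goes", " here."]:
        time.sleep(0.1)          # stand-in for token-by-token generation
        text += word
        result_queue.put(text)   # full text so far, like the real streamer
    result_queue.put(None)       # completion sentinel


if __name__ == "__main__":
    q = queue.Queue()
    threading.Thread(target=fake_thinking_process, args=("What is 2+2?", q)).start()

    # Consumer: drain the queue until the sentinel, as a streaming route would.
    last = ""
    while True:
        item = q.get()
        if item is None:
            break
        last = item
    print(last)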
 
@@ -141,29 +182,25 @@ Always think carefully before responding, consider different angles, and provide
     # Signal generation is complete
     result_queue.put(None)
 
-# TextStreamer class for token-by-token generation
-class TextStreamer:
-    def __init__(self, tokenizer, queue):
-        self.tokenizer = tokenizer
-        self.queue = queue
-        self.current_tokens = []
-
-    def put(self, token_ids):
-        self.current_tokens.extend(token_ids.tolist())
-        text = self.tokenizer.decode(self.current_tokens, skip_special_tokens=True)
-        self.queue.put(text)
-
-    def end(self):
-        pass
-
 # API route for home page
 @app.route('/')
 def home():
-    return jsonify({"message": "AI Chat API is running!"})
+    return jsonify({"message": "AI Chat API is running!", "status": "online"})
+
+# Health check endpoint
+@app.route('/health')
+def health():
+    if model is None or tokenizer is None:
+        return jsonify({"status": "initializing"}), 503
+    return jsonify({"status": "healthy"})
 
 # API route for streaming chat responses
 @app.route('/chat', methods=['POST', 'GET'])
 def chat():
+    # Check if models are loaded
+    if model is None or tokenizer is None:
+        return jsonify({"error": "Models are still initializing. Please try again shortly."}), 503
+
     # Handle both POST JSON and GET query parameters for flexibility
     if request.method == 'POST':
         try:
@@ -235,6 +272,10 @@ def chat():
 # Simple API for non-streaming chat (fallback)
 @app.route('/chat-simple', methods=['POST'])
 def chat_simple():
+    # Check if models are loaded
+    if model is None or tokenizer is None:
+        return jsonify({"error": "Models are still initializing. Please try again shortly."}), 503
+
     data = request.get_json()
     message = data.get("message", "")
 
@@ -242,29 +283,29 @@ def chat_simple():
         return jsonify({"error": "Message is required"}), 400
 
     try:
-        # Create prompt with system message
+        # Create prompt with system message (shorter version)
         prompt = f"""<|im_start|>system
-You are a helpful, friendly, and thoughtful AI assistant. Think carefully and provide informative, detailed responses.
+You are a helpful assistant.
 <|im_end|>
 <|im_start|>user
 {message}<|im_end|>
 <|im_start|>assistant
 """
 
-        # Handle inputs
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
-        inputs = {k: v.to('cpu') for k, v in inputs.items()}
+        # Handle inputs with reduced context
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
+        inputs = {k: v.to(device) for k, v in inputs.items()}
 
-        # Generate answer
-        output = model.generate(
-            **inputs,
-            max_new_tokens=512,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True,
-            num_beams=1,
-            no_repeat_ngram_size=3
-        )
+        # Generate answer with reduced parameters
+        with torch.no_grad():  # Disable gradient calculation
+            output = model.generate(
+                **inputs,
+                max_new_tokens=256,  # Reduced from 512
+                temperature=0.7,
+                top_p=0.9,
+                do_sample=True,
+                num_beams=1
+            )
 
         # Decode and format answer
         answer = tokenizer.decode(output[0], skip_special_tokens=True)
@@ -281,9 +322,16 @@ You are a helpful, friendly, and thoughtful AI assistant. Think carefully and pr
 
 if __name__ == "__main__":
     try:
-        # Initialize models at startup
-        initialize_models()
+        # Start the Flask app in a separate thread
+        flask_thread = threading.Thread(target=lambda: app.run(host="0.0.0.0", port=7860))
+        flask_thread.daemon = True
+        flask_thread.start()
+
+        # Initialize models in the main thread
         logger.info("Starting Flask application")
-        app.run(host="0.0.0.0", port=7860)
+        initialize_models()
+
+        # Keep the main thread alive
+        flask_thread.join()
     except Exception as e:
         logger.critical(f"Failed to start application: {str(e)}")
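The commit also adds readiness guards around the HTTP surface (the /health endpoint and the 503 checks in /chat and /chat-simple). The sketch below is a hedged client-side example of exercising them: it assumes the app is reachable at http://localhost:7860 (the port used by app.run above) and that the third-party requests package is installed; the exact JSON schema returned by /chat-simple is not visible in this diff, so the body is printed as-is.

# Hedged usage sketch, not part of app.py.
import requests

BASE = "http://localhost:7860"

# /health returns 503 {"status": "initializing"} until the model is loaded,
# then 200 {"status": "healthy"}.
health = requests.get(f"{BASE}/health")
print("health:", health.status_code, health.json())

# Non-streaming fallback: requires a JSON body with a "message" field;
# returns 400 if the field is missing and 503 while the model is initializing.
reply = requests.post(f"{BASE}/chat-simple", json={"message": "Hello!"})
print("chat-simple:", reply.status_code, reply.json())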