mike23415 committed
Commit 480e847 · verified · Parent: 5d565fc

Update app.py

Files changed (1):
  1. app.py  +63 -181
app.py CHANGED
@@ -6,21 +6,12 @@ import time
 import logging
 import threading
 import queue
-import json
-import gc
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
 # Set up logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    datefmt='%Y-%m-%d %H:%M:%S'
-)
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Print startup banner for visibility in logs
-print("\n===== Application Startup at", time.strftime("%Y-%m-%d %H:%M:%S"), "=====\n")
-
 # Fix caching issue on Hugging Face Spaces
 os.environ["TRANSFORMERS_CACHE"] = "/tmp"
 os.environ["HF_HOME"] = "/tmp"
@@ -36,110 +27,41 @@ logger.info(f"Using device: {device}")
 tokenizer = None
 model = None
 
-# Check available system resources
-def log_system_info():
-    # Basic system info
-    logger.info(f"Python version: {os.sys.version}")
-
-    # CPU info
-    import multiprocessing
-    logger.info(f"CPU cores: {multiprocessing.cpu_count()}")
-
-    # Memory info
-    try:
-        import psutil
-        mem = psutil.virtual_memory()
-        logger.info(f"Memory: Total={mem.total/1e9:.1f}GB, Available={mem.available/1e9:.1f}GB ({mem.percent}% used)")
-    except ImportError:
-        logger.info("psutil not installed, skipping detailed memory info")
-
-    # PyTorch info
-    logger.info(f"PyTorch version: {torch.__version__}")
-    logger.info(f"CUDA available: {torch.cuda.is_available()}")
-    if torch.cuda.is_available():
-        logger.info(f"CUDA version: {torch.version.cuda}")
-        logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
-
 # Initialize models once on startup
 def initialize_models():
     global tokenizer, model
     try:
         logger.info("Loading language model...")
-        log_system_info()
-
-        # You can change the model here if needed
-        model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # Good balance of quality and speed for CPU
-
-        # Load tokenizer with caching
-        logger.info(f"Loading tokenizer: {model_name}")
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_name,
-            use_fast=True,  # Use the fast tokenizers when available
-            local_files_only=False  # Allow downloading if not cached
-        )
-
-        # Free up memory before loading model
-        gc.collect()
-        torch.cuda.empty_cache() if torch.cuda.is_available() else None
-
-        # Load model with optimizations for CPU
-        logger.info(f"Loading model: {model_name}")
-
-        # Set lower precision for CPU to reduce memory usage
-        torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-
+        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
-            torch_dtype=torch_dtype,
-            low_cpu_mem_usage=True,  # Optimize memory loading
-            device_map="auto"  # Let the system decide optimal device mapping
+            torch_dtype=torch.float16,  # Use float16 for lower memory on CPU
+            device_map="cpu",  # Explicitly set to CPU
+            low_cpu_mem_usage=True  # Optimize memory loading
        )
 
-        # Handle padding tokens
         if tokenizer.pad_token is None:
-            logger.info("Setting pad token to EOS token")
             tokenizer.pad_token = tokenizer.eos_token
             model.config.pad_token_id = model.config.eos_token_id
 
-        # Set up model configuration for better generation
-        model.config.do_sample = True  # Enable sampling
-        model.config.temperature = 0.7  # Default temperature
-        model.config.top_p = 0.9  # Default top_p
-
         logger.info("Models initialized successfully")
     except Exception as e:
         logger.error(f"Error initializing models: {str(e)}")
         raise
 
-# TextStreamer class for token-by-token generation
-class TextStreamer:
-    def __init__(self, tokenizer, queue):
-        self.tokenizer = tokenizer
-        self.queue = queue
-        self.current_tokens = []
-
-    def put(self, token_ids):
-        self.current_tokens.extend(token_ids.tolist())
-        text = self.tokenizer.decode(self.current_tokens, skip_special_tokens=True)
-        self.queue.put(text)
-
-    def end(self):
-        pass
-
 # Function to simulate "thinking" process
 def thinking_process(message, result_queue):
     """
-    This function simulates a thinking process and puts the result in the queue.
-    It includes both an explicit thinking stage and then a generation stage.
+    This function simulates a thinking process and puts the result in the queue
     """
     try:
-        # Simulate explicit thinking stage
+        # Simulate thinking process
         logger.info(f"Thinking about: '{message}'")
 
-        # Create thoughtful prompt with system message and thinking instructions
+        # Create prompt with system message
         prompt = f"""<|im_start|>system
-You are a helpful, friendly, and thoughtful AI assistant.
-Let's approach the user's request step by step.
+You are a helpful, friendly, and thoughtful AI assistant. Think carefully and provide informative, detailed responses.
 <|im_end|>
 <|im_start|>user
 {message}<|im_end|>
@@ -147,31 +69,23 @@ Let's approach the user's request step by step.
 """
 
         # Handle inputs
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
-        inputs = {k: v.to(device) for k, v in inputs.items()}
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
+        inputs = {k: v.to('cpu') for k, v in inputs.items()}
 
         # Generate answer with streaming
         streamer = TextStreamer(tokenizer, result_queue)
 
-        # Simulate thinking first by sending some initial dots
-        result_queue.put("Let me think about this...")
-
-        # Generate response with simpler parameters to avoid memory issues
-        try:
-            with torch.no_grad():  # Disable gradient calculation to save memory
-                model.generate(
-                    **inputs,
-                    max_new_tokens=256,  # Reduced from 512
-                    temperature=0.7,
-                    top_p=0.9,
-                    do_sample=True,
-                    streamer=streamer,
-                    num_beams=1,  # Reduced from 2
-                    repetition_penalty=1.2
-                )
-        except Exception as e:
-            logger.error(f"Model generation error: {str(e)}")
-            result_queue.put(f"\n\nI apologize, but I encountered an error while processing your request.")
+        # Generate response
+        model.generate(
+            **inputs,
+            max_new_tokens=512,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True,
+            streamer=streamer,
+            num_beams=1,
+            no_repeat_ngram_size=3
+        )
 
         # Signal generation is complete
         result_queue.put(None)
@@ -182,54 +96,42 @@ Let's approach the user's request step by step.
         # Signal generation is complete
         result_queue.put(None)
 
+# TextStreamer class for token-by-token generation
+class TextStreamer:
+    def __init__(self, tokenizer, queue):
+        self.tokenizer = tokenizer
+        self.queue = queue
+        self.current_tokens = []
+
+    def put(self, token_ids):
+        self.current_tokens.extend(token_ids.tolist())
+        text = self.tokenizer.decode(self.current_tokens, skip_special_tokens=True)
+        self.queue.put(text)
+
+    def end(self):
+        pass
+
 # API route for home page
 @app.route('/')
 def home():
-    return jsonify({"message": "AI Chat API is running!", "status": "online"})
-
-# Health check endpoint
-@app.route('/health')
-def health():
-    if model is None or tokenizer is None:
-        return jsonify({"status": "initializing"}), 503
-    return jsonify({"status": "healthy"})
+    return jsonify({"message": "AI Chat API is running!"})
 
 # API route for streaming chat responses
-@app.route('/chat', methods=['POST', 'GET'])
+@app.route('/chat', methods=['POST'])
 def chat():
-    # Check if models are loaded
-    if model is None or tokenizer is None:
-        return jsonify({"error": "Models are still initializing. Please try again shortly."}), 503
-
-    # Handle both POST JSON and GET query parameters for flexibility
-    if request.method == 'POST':
-        try:
-            data = request.get_json()
-            message = data.get("message", "")
-        except:
-            # If JSON parsing fails, try form data
-            message = request.form.get("message", "")
-    else:  # GET
-        message = request.args.get("message", "")
+    data = request.get_json()
+    message = data.get("message", "")
 
     if not message:
         return jsonify({"error": "Message is required"}), 400
 
     try:
         def generate():
-            # Signal the start of streaming with headers
-            yield "retry: 1000\n"
-            yield "event: message\n"
-
-            # Show thinking indicator
-            yield f"data: [Thinking...]\n\n"
-
             # Create a queue for communication between threads
             result_queue = queue.Queue()
 
             # Start thinking in a separate thread
             thread = threading.Thread(target=thinking_process, args=(message, result_queue))
-            thread.daemon = True  # Make thread die when main thread exits
             thread.start()
 
             # Stream results as they become available
@@ -245,8 +147,7 @@ def chat():
                     new_part = result[len(previous_text):]
                     previous_text = result
                     if new_part:
-                        yield f"data: {json.dumps(new_part)}\n\n"
-                        time.sleep(0.01)  # Small delay for more natural typing effect
+                        yield f"data: {new_part}\n\n"
 
                 except queue.Empty:
                     # Timeout occurred
@@ -255,15 +156,7 @@ def chat():
 
             yield "data: [DONE]\n\n"
 
-        return Response(
-            stream_with_context(generate()),
-            mimetype='text/event-stream',
-            headers={
-                'Cache-Control': 'no-cache',
-                'Connection': 'keep-alive',
-                'X-Accel-Buffering': 'no'  # Disable buffering for Nginx
-            }
-        )
+        return Response(stream_with_context(generate()), mimetype='text/event-stream')
 
     except Exception as e:
         logger.error(f"Error processing chat request: {str(e)}")
@@ -272,10 +165,6 @@ def chat():
 # Simple API for non-streaming chat (fallback)
 @app.route('/chat-simple', methods=['POST'])
 def chat_simple():
-    # Check if models are loaded
-    if model is None or tokenizer is None:
-        return jsonify({"error": "Models are still initializing. Please try again shortly."}), 503
-
     data = request.get_json()
     message = data.get("message", "")
 
@@ -283,29 +172,29 @@ def chat_simple():
         return jsonify({"error": "Message is required"}), 400
 
     try:
-        # Create prompt with system message (shorter version)
+        # Create prompt with system message
         prompt = f"""<|im_start|>system
-You are a helpful assistant.
+You are a helpful, friendly, and thoughtful AI assistant. Think carefully and provide informative, detailed responses.
 <|im_end|>
 <|im_start|>user
 {message}<|im_end|>
 <|im_start|>assistant
 """
 
-        # Handle inputs with reduced context
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-
-        # Generate answer with reduced parameters
-        with torch.no_grad():  # Disable gradient calculation
-            output = model.generate(
-                **inputs,
-                max_new_tokens=256,  # Reduced from 512
-                temperature=0.7,
-                top_p=0.9,
-                do_sample=True,
-                num_beams=1
-            )
+        # Handle inputs
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
+        inputs = {k: v.to('cpu') for k, v in inputs.items()}
+
+        # Generate answer
+        output = model.generate(
+            **inputs,
+            max_new_tokens=512,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True,
+            num_beams=1,
+            no_repeat_ngram_size=3
+        )
 
         # Decode and format answer
         answer = tokenizer.decode(output[0], skip_special_tokens=True)
@@ -322,16 +211,9 @@ You are a helpful assistant.
 
 if __name__ == "__main__":
     try:
-        # Start the Flask app in a separate thread
-        flask_thread = threading.Thread(target=lambda: app.run(host="0.0.0.0", port=7860))
-        flask_thread.daemon = True
-        flask_thread.start()
-
-        # Initialize models in the main thread
-        logger.info("Starting Flask application")
+        # Initialize models at startup
        initialize_models()
-
-        # Keep the main thread alive
-        flask_thread.join()
+        logger.info("Starting Flask application")
+        app.run(host="0.0.0.0", port=7860)
     except Exception as e:
         logger.critical(f"Failed to start application: {str(e)}")