Spaces:

nananie143
/

q25COTapi

Sleeping

App Files Files Community

nananie143 committed on Jan 12

Commit

f871a33

verified ·

1 Parent(s): c139197

Create app.py

Browse files

Files changed (1) hide show

app.py +272 -0

app.py ADDED Viewed

	@@ -0,0 +1,272 @@

+import gradio as gr
+from langchain.llms import LlamaCpp
+import os
+import json
+import torch
+import logging
+from typing import Optional, List, Dict, Any
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+import uvicorn
+import time
+from threading import Lock
+# Configure logging: INFO level on the root logger, plus a module-level
+# logger that the rest of this file uses for status and error messages.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+class ChatCompletionRequest(BaseModel):
+    """Request body accepted by the ``/v1/chat/completions`` endpoint.
+
+    The field names follow the OpenAI chat-completions request shape. Of
+    these, the handler in this file forwards only ``messages``,
+    ``temperature`` and ``max_tokens`` to the model.
+    """
+    # Model name supplied by the client; the handler in this file does not read it.
+    model: str
+    # Conversation history; each entry is expected to carry "role" and "content" keys.
+    messages: List[Dict[str, str]]
+    # Sampling temperature forwarded to the model.
+    temperature: Optional[float] = 0.7
+    # Upper bound on generated tokens forwarded to the model.
+    max_tokens: Optional[int] = 2048
+    # Accepted in the request, but the handler in this file never reads it,
+    # so responses are always returned as a single JSON document.
+    stream: Optional[bool] = False
+class QwenModel:
+    """Chain-of-thought wrapper around a ``LlamaCpp`` language model.
+
+    The wrapper builds a step-by-step prompt from a chat history, runs it
+    through the underlying model, and packages the answer in an
+    OpenAI-style ``chat.completion`` dictionary.
+    """
+
+    # Sampling defaults used by the constructor and as the fallback when a
+    # caller passes ``None`` to ``generate_response``.
+    DEFAULT_TEMPERATURE = 0.7
+    DEFAULT_MAX_TOKENS = 2048
+
+    # Display label used in the prompt for each recognised chat role.
+    _ROLE_LABELS = {"system": "System", "user": "Human", "assistant": "Assistant"}
+
+    def __init__(self, model_path: str):
+        """Load the model, offloading layers to the GPU when CUDA is available.
+
+        Args:
+            model_path: Value handed to ``LlamaCpp`` as ``model_path``.
+
+        Raises:
+            Exception: Whatever the detection or model loading raised; the
+                error is logged and re-raised unchanged.
+        """
+        try:
+            # Check for GPU availability
+            self.has_gpu = torch.cuda.is_available()
+            self.device_count = torch.cuda.device_count() if self.has_gpu else 0
+            logger.info(f"GPU available: {self.has_gpu}, Device count: {self.device_count}")
+            # Configure model parameters based on available hardware
+            n_gpu_layers = 40 if self.has_gpu else 0
+            logger.info(f"Using {'GPU' if self.has_gpu else 'CPU'} for inference")
+            self.llm = LlamaCpp(
+                model_path=model_path,
+                n_gpu_layers=n_gpu_layers,
+                n_ctx=4096,
+                n_batch=512 if self.has_gpu else 128,  # Reduced batch size for CPU
+                verbose=True,
+                temperature=self.DEFAULT_TEMPERATURE,
+                max_tokens=self.DEFAULT_MAX_TOKENS,
+                top_p=0.95,
+                top_k=50,
+                f16_kv=self.has_gpu,  # Only use f16 when GPU is available
+                use_mlock=True,  # Pin memory for better performance
+                use_mmap=True,
+            )
+            # Serialises calls into the model across concurrent API requests
+            self.lock = Lock()
+        except Exception as e:
+            logger.exception("Failed to initialize model: %s", e)
+            raise
+
+    def generate_cot_prompt(self, messages: List[Dict[str, str]]) -> str:
+        """Build a chain-of-thought prompt from the message history.
+
+        Messages whose role is not ``system``, ``user`` or ``assistant`` are
+        left out of the transcript. Missing ``role``/``content`` keys are
+        treated as empty strings instead of raising ``KeyError``.
+
+        Args:
+            messages: Chat history as dictionaries with ``role`` and
+                ``content`` keys.
+
+        Returns:
+            The prompt text ending with the "Step-by-step solution" heading.
+
+        Raises:
+            ValueError: If the most recent user message is missing or empty.
+        """
+        conversation = []
+        last_user_msg = None
+        for msg in messages:
+            role = msg.get("role", "")
+            content = msg.get("content", "")
+            label = self._ROLE_LABELS.get(role)
+            if label is None:
+                continue
+            conversation.append(f"{label}: {content}")
+            if role == "user":
+                # Keep overwriting so the final value is the latest user turn
+                last_user_msg = content
+        if not last_user_msg:
+            raise ValueError("No user message found in the conversation")
+        history = "\n".join(conversation)
+        cot_template = f"""Previous conversation:
+{history}
+Let's approach the latest question step-by-step:
+1. Understanding the question:
+   {last_user_msg}
+2. Breaking down components:
+   - Key elements to consider
+   - Specific information requested
+   - Relevant constraints
+3. Reasoning process:
+   - Systematic approach
+   - Applicable knowledge
+   - Potential challenges
+4. Step-by-step solution:
+"""
+        return cot_template
+
+    def process_response(self, response: str) -> str:
+        """Trim the raw model output and make sure it starts with a heading.
+
+        Args:
+            response: Text returned by the model.
+
+        Returns:
+            The stripped text, prefixed with ``"Step-by-step solution:"`` when
+            it does not already start with ``"Step"``. A non-string input
+            yields the literal ``"Error processing response"``.
+        """
+        if not isinstance(response, str):
+            logger.error(
+                "Error processing response: expected str, got %s",
+                type(response).__name__,
+            )
+            return "Error processing response"
+        response = response.strip()
+        # Add structural markers for better readability
+        if not response.startswith("Step"):
+            response = "Step-by-step solution:\n" + response
+        return response
+
+    def generate_response(self,
+                         messages: List[Dict[str, str]],
+                         temperature: Optional[float] = 0.7,
+                         max_tokens: Optional[int] = 2048) -> Dict[str, Any]:
+        """Generate a response using chain-of-thought reasoning.
+
+        Args:
+            messages: Chat history as dictionaries with ``role`` and
+                ``content`` keys.
+            temperature: Sampling temperature; ``None`` falls back to
+                ``DEFAULT_TEMPERATURE``.
+            max_tokens: Generation limit; ``None`` falls back to
+                ``DEFAULT_MAX_TOKENS``.
+
+        Returns:
+            An OpenAI-style ``chat.completion`` dictionary. The ``usage``
+            figures are whitespace-separated word counts, not real tokenizer
+            counts. ``system_info`` reports the device and the wall-clock
+            time spent in the model call.
+
+        Raises:
+            HTTPException: Status 400 when the conversation has no usable
+                user message, status 500 for any other failure.
+        """
+        if temperature is None:
+            temperature = self.DEFAULT_TEMPERATURE
+        if max_tokens is None:
+            max_tokens = self.DEFAULT_MAX_TOKENS
+
+        # Prompt building touches no shared state, so it stays outside the lock.
+        try:
+            full_prompt = self.generate_cot_prompt(messages)
+        except ValueError as e:
+            logger.error("Invalid conversation: %s", e)
+            raise HTTPException(status_code=400, detail=str(e)) from e
+        except Exception as e:
+            logger.exception("Error generating response: %s", e)
+            raise HTTPException(status_code=500, detail=str(e)) from e
+
+        try:
+            with self.lock:  # Only one request may drive the model at a time
+                start_time = time.time()
+                response = self.llm(
+                    full_prompt,
+                    temperature=temperature,
+                    max_tokens=max_tokens
+                )
+                end_time = time.time()
+        except Exception as e:
+            logger.exception("Error generating response: %s", e)
+            raise HTTPException(status_code=500, detail=str(e)) from e
+
+        processed_response = self.process_response(response)
+        prompt_tokens = len(full_prompt.split())
+        completion_tokens = len(processed_response.split())
+        now = time.time()
+        # Format response in OpenAI-compatible structure
+        return {
+            "id": f"chatcmpl-{int(now * 1000)}",
+            "object": "chat.completion",
+            "created": int(now),
+            "model": "qwen-2.5-14b",
+            "choices": [{
+                "index": 0,
+                "message": {
+                    "role": "assistant",
+                    "content": processed_response
+                },
+                "finish_reason": "stop"
+            }],
+            "usage": {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens
+            },
+            "system_info": {
+                "device": "gpu" if self.has_gpu else "cpu",
+                "processing_time": round(end_time - start_time, 2)
+            }
+        }
+# Initialize FastAPI: the application object that the route and startup
+# decorators below attach to, and that main() serves with uvicorn.
+app = FastAPI(title="Qwen 2.5 API")
+def create_gradio_interface(model: QwenModel):
+    """Create and configure the Gradio interface.
+
+    Args:
+        model: Loaded model wrapper used to answer each submitted prompt.
+
+    Returns:
+        A ``gr.Interface`` with a text input, temperature and max-token
+        sliders, and a text output.
+    """
+    def predict(message: str,
+                temperature: float,
+                max_tokens: int) -> str:
+        """Answer a single user message and return only the reply text.
+
+        Raises:
+            gr.Error: When the input is blank or generation fails, so the
+                reason is shown in the UI instead of an opaque server error.
+        """
+        if not message or not message.strip():
+            raise gr.Error("Please enter a question or task.")
+        messages = [{"role": "user", "content": message}]
+        try:
+            response = model.generate_response(
+                messages,
+                temperature=temperature,
+                # Cast so the model always receives an integer token limit.
+                max_tokens=int(max_tokens)
+            )
+        except HTTPException as e:
+            # generate_response reports failures as HTTPException, which the
+            # Gradio UI cannot present meaningfully.
+            raise gr.Error(str(e.detail)) from e
+        return response["choices"][0]["message"]["content"]
+    iface = gr.Interface(
+        fn=predict,
+        inputs=[
+            gr.Textbox(
+                label="Input",
+                placeholder="Enter your question or task here...",
+                lines=5
+            ),
+            gr.Slider(
+                minimum=0.1,
+                maximum=1.0,
+                value=0.7,
+                label="Temperature",
+                info="Higher values make the output more random"
+            ),
+            gr.Slider(
+                minimum=64,
+                maximum=4096,
+                value=2048,
+                step=64,
+                label="Max Tokens",
+                info="Maximum length of the generated response"
+            )
+        ],
+        outputs=gr.Textbox(label="Response", lines=10),
+        title=f"Qwen 2.5 14B Instruct Model ({'GPU' if model.has_gpu else 'CPU'} Mode)",
+        description="""This is a Qwen 2.5 14B model interface with chain-of-thought prompting.
+        The model will break down complex problems and solve them step by step.""",
+        examples=[
+            ["Explain how photosynthesis works", 0.7, 2048],
+            ["Solve the quadratic equation: x² + 5x + 6 = 0", 0.7, 1024],
+            ["What are the implications of Moore's Law for future computing?", 0.8, 2048]
+        ]
+    )
+    return iface
+# Global model instance shared by the API endpoint; populated either by
+# main() or by the startup hook below, whichever runs first.
+model = None
+@app.on_event("startup")
+async def startup_event():
+    """Initialize the model on startup unless it has already been loaded.
+
+    main() may have created the model before uvicorn fires this hook.
+    Loading it again would keep a second copy of the weights in memory and
+    leave the Gradio interface and the API endpoint on different instances.
+    """
+    global model
+    if model is not None:
+        logger.info("Model already initialized; skipping startup load")
+        return
+    model_path = "G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF"
+    model = QwenModel(model_path)
+    logger.info("Model initialized successfully")
+@app.post("/v1/chat/completions")
+async def create_chat_completion(request: ChatCompletionRequest):
+    """OpenAI-compatible chat completions endpoint.
+
+    Only ``messages``, ``temperature`` and ``max_tokens`` are forwarded to
+    the model; ``request.model`` and ``request.stream`` are not used here.
+
+    Returns:
+        A ``JSONResponse`` wrapping the completion dictionary.
+
+    Raises:
+        HTTPException: Status 503 while the model is not loaded; otherwise
+            whatever the model raised, or status 500 for unexpected errors.
+    """
+    # Imported locally so this handler does not need a new file-level import.
+    from fastapi.concurrency import run_in_threadpool
+
+    if model is None:
+        raise HTTPException(status_code=503, detail="Model is not loaded yet")
+    try:
+        # Generation is blocking; run it in a worker thread so the event
+        # loop keeps serving other requests in the meantime.
+        response = await run_in_threadpool(
+            model.generate_response,
+            request.messages,
+            temperature=request.temperature,
+            max_tokens=request.max_tokens
+        )
+    except HTTPException:
+        # Keep the status code and detail chosen by the model layer.
+        raise
+    except Exception as e:
+        logger.exception("Unhandled error in chat completion: %s", e)
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    return JSONResponse(content=response)
+def main():
+    """Load the model, attach the Gradio UI to the FastAPI app and serve it.
+
+    The UI is mounted at ``/`` and the combined application is served by
+    uvicorn on ``0.0.0.0:7860``.
+
+    Raises:
+        Exception: Any start-up failure is logged and re-raised.
+    """
+    global model
+    # Model path
+    # NOTE(review): this looks like a Hugging Face repo id rather than a path
+    # to a local .gguf file — confirm it is what LlamaCpp expects here.
+    model_path = "G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF"
+    try:
+        # Initialize the model if not already initialized
+        if model is None:
+            model = QwenModel(model_path)
+        # Create the Gradio interface
+        interface = create_gradio_interface(model)
+        # Attach the interface through Gradio's mounting helper rather than
+        # reading interface.app directly.
+        server = gr.mount_gradio_app(app, interface, path="/")
+        # Launch with uvicorn
+        uvicorn.run(
+            server,
+            host="0.0.0.0",
+            port=7860,
+            log_level="info"
+        )
+    except Exception as e:
+        logger.exception("Application failed to start: %s", e)
+        raise
+# Start the server only when this file is executed as a script.
+if __name__ == "__main__":
+    main()