Spaces:

nananie143
/

q25COTapi

Sleeping

App Files Files Community

nananie143 commited on Jan 12

Commit

627e1c6

verified ·

1 Parent(s): 7eb9343

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -177

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import gradio as gr
-from langchain.llms import LlamaCpp
 import os
 import json
 import torch
@@ -11,6 +11,10 @@ from pydantic import BaseModel
 import uvicorn
 import time
 from threading import Lock
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -23,6 +27,31 @@ class ChatCompletionRequest(BaseModel):
     max_tokens: Optional[int] = 2048
     stream: Optional[bool] = False
 class QwenModel:
     def __init__(self, model_path: str):
         """Initialize the Qwen model with automatic device detection."""
@@ -32,12 +61,19 @@ class QwenModel:
             self.device_count = torch.cuda.device_count() if self.has_gpu else 0
             logger.info(f"GPU available: {self.has_gpu}, Device count: {self.device_count}")
             # Configure model parameters based on available hardware
             n_gpu_layers = 40 if self.has_gpu else 0
             logger.info(f"Using {'GPU' if self.has_gpu else 'CPU'} for inference")
             self.llm = LlamaCpp(
-                model_path=model_path,
                 n_gpu_layers=n_gpu_layers,
                 n_ctx=4096,
                 n_batch=512 if self.has_gpu else 128,  # Reduced batch size for CPU
@@ -47,7 +83,7 @@ class QwenModel:
                 top_p=0.95,
                 top_k=50,
                 f16_kv=self.has_gpu,  # Only use f16 when GPU is available
-                use_mlock=True,  # Pin memory for better performance
                 use_mmap=True,
             )
@@ -58,194 +94,37 @@ class QwenModel:
             logger.error(f"Failed to initialize model: {str(e)}")
             raise
-    def generate_cot_prompt(self, messages: List[Dict[str, str]]) -> str:
-        """Generate a chain-of-thought prompt from message history."""
-        conversation = []
-        for msg in messages:
-            role = msg.get("role", "")
-            content = msg.get("content", "")
-            if role == "system":
-                conversation.append(f"System: {content}")
-            elif role == "user":
-                conversation.append(f"Human: {content}")
-            elif role == "assistant":
-                conversation.append(f"Assistant: {content}")
-        last_user_msg = next((msg["content"] for msg in reversed(messages)
-                            if msg["role"] == "user"), None)
-        if not last_user_msg:
-            raise ValueError("No user message found in the conversation")
-        cot_template = f"""Previous conversation:
-{chr(10).join(conversation)}
-Let's approach the latest question step-by-step:
-1. Understanding the question:
-   {last_user_msg}
-2. Breaking down components:
-   - Key elements to consider
-   - Specific information requested
-   - Relevant constraints
-3. Reasoning process:
-   - Systematic approach
-   - Applicable knowledge
-   - Potential challenges
-4. Step-by-step solution:
-"""
-        return cot_template
-    def process_response(self, response: str) -> str:
-        """Process and format the model's response."""
-        try:
-            response = response.strip()
-            # Add structural markers for better readability
-            if not response.startswith("Step"):
-                response = "Step-by-step solution:\n" + response
-            return response
-        except Exception as e:
-            logger.error(f"Error processing response: {str(e)}")
-            return "Error processing response"
-    def generate_response(self,
-                         messages: List[Dict[str, str]],
-                         temperature: float = 0.7,
-                         max_tokens: int = 2048) -> Dict[str, Any]:
-        """Generate a response using chain-of-thought reasoning."""
-        try:
-            with self.lock:  # Thread safety for concurrent API requests
-                # Generate the CoT prompt
-                full_prompt = self.generate_cot_prompt(messages)
-                # Get response from model
-                start_time = time.time()
-                response = self.llm(
-                    full_prompt,
-                    temperature=temperature,
-                    max_tokens=max_tokens
-                )
-                end_time = time.time()
-                # Process response
-                processed_response = self.process_response(response)
-                # Format response in OpenAI-compatible structure
-                return {
-                    "id": f"chatcmpl-{int(time.time()*1000)}",
-                    "object": "chat.completion",
-                    "created": int(time.time()),
-                    "model": "qwen-2.5-14b",
-                    "choices": [{
-                        "index": 0,
-                        "message": {
-                            "role": "assistant",
-                            "content": processed_response
-                        },
-                        "finish_reason": "stop"
-                    }],
-                    "usage": {
-                        "prompt_tokens": len(full_prompt.split()),
-                        "completion_tokens": len(processed_response.split()),
-                        "total_tokens": len(full_prompt.split()) + len(processed_response.split())
-                    },
-                    "system_info": {
-                        "device": "gpu" if self.has_gpu else "cpu",
-                        "processing_time": round(end_time - start_time, 2)
-                    }
-                }
-        except Exception as e:
-            logger.error(f"Error generating response: {str(e)}")
-            raise HTTPException(status_code=500, detail=str(e))
-# Initialize FastAPI
 app = FastAPI(title="Qwen 2.5 API")
-def create_gradio_interface(model: QwenModel):
-    """Create and configure the Gradio interface."""
-    def predict(message: str,
-                temperature: float,
-                max_tokens: int) -> str:
-        messages = [{"role": "user", "content": message}]
-        response = model.generate_response(
-            messages,
-            temperature=temperature,
-            max_tokens=max_tokens
-        )
-        return response["choices"][0]["message"]["content"]
-    iface = gr.Interface(
-        fn=predict,
-        inputs=[
-            gr.Textbox(
-                label="Input",
-                placeholder="Enter your question or task here...",
-                lines=5
-            ),
-            gr.Slider(
-                minimum=0.1,
-                maximum=1.0,
-                value=0.7,
-                label="Temperature",
-                info="Higher values make the output more random"
-            ),
-            gr.Slider(
-                minimum=64,
-                maximum=4096,
-                value=2048,
-                step=64,
-                label="Max Tokens",
-                info="Maximum length of the generated response"
-            )
-        ],
-        outputs=gr.Textbox(label="Response", lines=10),
-        title=f"Qwen 2.5 14B Instruct Model ({'GPU' if model.has_gpu else 'CPU'} Mode)",
-        description="""This is a Qwen 2.5 14B model interface with chain-of-thought prompting.
-        The model will break down complex problems and solve them step by step.""",
-        examples=[
-            ["Explain how photosynthesis works", 0.7, 2048],
-            ["Solve the quadratic equation: x² + 5x + 6 = 0", 0.7, 1024],
-            ["What are the implications of Moore's Law for future computing?", 0.8, 2048]
-        ]
-    )
-    return iface
 # Global model instance
 model = None
-@app.on_event("startup")
-async def startup_event():
-    """Initialize the model on startup."""
     global model
-    model_path = "G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF"
-    model = QwenModel(model_path)
-    logger.info("Model initialized successfully")
-@app.post("/v1/chat/completions")
-async def create_chat_completion(request: ChatCompletionRequest):
-    """OpenAI-compatible chat completions endpoint."""
     try:
-        response = model.generate_response(
-            request.messages,
-            temperature=request.temperature,
-            max_tokens=request.max_tokens
-        )
-        return JSONResponse(content=response)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
 def main():
     """Main function to initialize and launch the application."""
     try:
         global model
         # Model path
-        model_path = "G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF"
         # Initialize the model if not already initialized
         if model is None:

 import gradio as gr
+from langchain_community.llms import LlamaCpp  # Updated import
 import os
 import json
 import torch
 import uvicorn
 import time
 from threading import Lock
+import requests
+from pathlib import Path
+from tqdm import tqdm
+from contextlib import asynccontextmanager
 # Configure logging
 logging.basicConfig(level=logging.INFO)
     max_tokens: Optional[int] = 2048
     stream: Optional[bool] = False
+def download_model(model_url: str, local_path: Path) -> Path:
+    """Download the model file if it doesn't exist locally."""
+    if local_path.exists():
+        logger.info(f"Model already exists at {local_path}")
+        return local_path
+    logger.info(f"Downloading model from {model_url}")
+    local_path.parent.mkdir(parents=True, exist_ok=True)
+    response = requests.get(model_url, stream=True)
+    total_size = int(response.headers.get('content-length', 0))
+    with open(local_path, 'wb') as file, tqdm(
+        desc=local_path.name,
+        total=total_size,
+        unit='iB',
+        unit_scale=True,
+        unit_divisor=1024,
+    ) as pbar:
+        for data in response.iter_content(chunk_size=1024):
+            size = file.write(data)
+            pbar.update(size)
+    return local_path
 class QwenModel:
     def __init__(self, model_path: str):
         """Initialize the Qwen model with automatic device detection."""
             self.device_count = torch.cuda.device_count() if self.has_gpu else 0
             logger.info(f"GPU available: {self.has_gpu}, Device count: {self.device_count}")
+            # Ensure model path exists
+            model_path = Path(model_path)
+            if not model_path.exists():
+                # If model doesn't exist locally, download it
+                model_url = "https://huggingface.co/G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF/resolve/main/model.gguf"
+                model_path = download_model(model_url, model_path)
             # Configure model parameters based on available hardware
             n_gpu_layers = 40 if self.has_gpu else 0
             logger.info(f"Using {'GPU' if self.has_gpu else 'CPU'} for inference")
             self.llm = LlamaCpp(
+                model_path=str(model_path),
                 n_gpu_layers=n_gpu_layers,
                 n_ctx=4096,
                 n_batch=512 if self.has_gpu else 128,  # Reduced batch size for CPU
                 top_p=0.95,
                 top_k=50,
                 f16_kv=self.has_gpu,  # Only use f16 when GPU is available
+                use_mlock=True,
                 use_mmap=True,
             )
             logger.error(f"Failed to initialize model: {str(e)}")
             raise
+    # ... [rest of the QwenModel class methods remain the same] ...
+# Initialize FastAPI with lifespan
 app = FastAPI(title="Qwen 2.5 API")
 # Global model instance
 model = None
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Lifespan context manager for FastAPI startup and shutdown events."""
     global model
     try:
+        model_path = Path("models/qwen-2.5-14b-gguf")
+        model = QwenModel(model_path)
+        logger.info("Model initialized successfully")
+        yield
+    finally:
+        # Cleanup code (if needed)
+        pass
+app = FastAPI(lifespan=lifespan)
+# ... [rest of the FastAPI routes and main function remain the same] ...
 def main():
     """Main function to initialize and launch the application."""
     try:
         global model
         # Model path
+        model_path = Path("models/qwen-2.5-14b-gguf")
         # Initialize the model if not already initialized
         if model is None: