import gradio as gr
from langchain_community.llms import LlamaCpp
import os
import json
import torch
import logging
from typing import Optional, List, Dict, Any
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import uvicorn
import time
from threading import Lock
import requests
from pathlib import Path
from tqdm import tqdm
from contextlib import asynccontextmanager
from huggingface_hub import hf_hub_download

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[Dict[str, str]]
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 2048
    stream: Optional[bool] = False


def download_model_from_hf():
    """Download the model file from Hugging Face."""
    try:
        logger.info("Downloading model from Hugging Face Hub...")

        # Create the models directory if it doesn't exist
        model_dir = Path("models")
        model_dir.mkdir(exist_ok=True)

        # Download the model using huggingface_hub
        local_path = hf_hub_download(
            repo_id="G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF",
            filename="model.gguf",
            local_dir=model_dir,
            local_dir_use_symlinks=False,
        )

        return Path(local_path)
    except Exception as e:
        logger.error(f"Error downloading model: {str(e)}")
        raise


class QwenModel:
    def __init__(self):
        """Initialize the Qwen model with automatic device detection."""
        try:
            # Check for GPU availability
            self.has_gpu = torch.cuda.is_available()
            self.device_count = torch.cuda.device_count() if self.has_gpu else 0
            logger.info(f"GPU available: {self.has_gpu}, Device count: {self.device_count}")

            # Download or get the model
            model_path = download_model_from_hf()
            logger.info(f"Model path: {model_path}")

            # Configure model parameters based on available hardware
            n_gpu_layers = 40 if self.has_gpu else 0
            logger.info(f"Using {'GPU' if self.has_gpu else 'CPU'} for inference")

            self.llm = LlamaCpp(
                model_path=str(model_path),
                n_gpu_layers=n_gpu_layers,
                n_ctx=4096,
                n_batch=512 if self.has_gpu else 128,
                verbose=True,
                temperature=0.7,
                max_tokens=2048,
                top_p=0.95,
                top_k=50,
                f16_kv=self.has_gpu,
                use_mlock=True,
                use_mmap=True,
            )

            # Thread lock for concurrent API requests
            self.lock = Lock()

        except Exception as e:
            logger.error(f"Failed to initialize model: {str(e)}")
            raise

    # ... [rest of the QwenModel class methods remain the same] ...


# Global model instance
model = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context manager for FastAPI startup and shutdown events."""
    global model
    try:
        # Initialize the model on startup unless main() already did so
        if model is None:
            model = QwenModel()
            logger.info("Model initialized successfully")
        yield
    finally:
        # Cleanup code (if needed)
        pass


# Initialize FastAPI once, with both the title and the lifespan handler
app = FastAPI(title="Qwen 2.5 API", lifespan=lifespan)

# ... [rest of the FastAPI routes remain the same] ...


def main():
    """Main function to initialize and launch the application."""
    global model
    try:
        # Initialize the model if not already initialized
        if model is None:
            model = QwenModel()

        # Create the Gradio interface
        interface = create_gradio_interface(model)

        # Mount the Gradio interface onto the FastAPI app
        gr.mount_gradio_app(app, interface, path="/")

        # Launch with uvicorn
        uvicorn.run(
            app,
            host="0.0.0.0",
            port=7860,
            log_level="info",
        )

    except Exception as e:
        logger.error(f"Application failed to start: {str(e)}")
        raise


if __name__ == "__main__":
    main()
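
For reference, a minimal client sketch for exercising the API once the server is running. The FastAPI route definitions are elided in the listing above, so the endpoint path (/v1/chat/completions, OpenAI-style) and the model name used below are assumptions for illustration only; the payload mirrors the ChatCompletionRequest model defined in the script.

import requests

payload = {
    "model": "qwen2.5-14b-instruct",  # placeholder name; use whatever the server expects
    "messages": [{"role": "user", "content": "Hello, who are you?"}],
    "temperature": 0.7,
    "max_tokens": 256,
}

# Assumed endpoint path; adjust if the (elided) FastAPI routes differ
response = requests.post(
    "http://localhost:7860/v1/chat/completions",
    json=payload,
    timeout=600,
)
response.raise_for_status()
print(response.json())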