import gradio as gr
from langchain_community.llms import LlamaCpp
import os
import json
import torch
import logging
from typing import Optional, List, Dict, Any
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import uvicorn
import time
from threading import Lock
import requests
from pathlib import Path
from tqdm import tqdm
from contextlib import asynccontextmanager
from huggingface_hub import hf_hub_download

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[Dict[str, str]]
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 2048
    stream: Optional[bool] = False
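
# Example request body this schema accepts (illustrative values only, not from the
# original listing):
# {
#     "model": "qwen2.5-14b-instruct",
#     "messages": [{"role": "user", "content": "Hello"}],
#     "temperature": 0.7,
#     "max_tokens": 512,
#     "stream": false
# }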

def download_model_from_hf():
    """Download the model file from Hugging Face."""
    try:
        logger.info("Downloading model from Hugging Face Hub...")
        # Create models directory if it doesn't exist
        model_dir = Path("models")
        model_dir.mkdir(exist_ok=True)
        # Download the model using huggingface_hub
        local_path = hf_hub_download(
            repo_id="G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF",
            filename="model.gguf",
            local_dir=model_dir,
            local_dir_use_symlinks=False
        )
        return Path(local_path)
    except Exception as e:
        logger.error(f"Error downloading model: {str(e)}")
        raise

class QwenModel:
    def __init__(self):
        """Initialize the Qwen model with automatic device detection."""
        try:
            # Check for GPU availability
            self.has_gpu = torch.cuda.is_available()
            self.device_count = torch.cuda.device_count() if self.has_gpu else 0
            logger.info(f"GPU available: {self.has_gpu}, Device count: {self.device_count}")
            # Download or get the model
            model_path = download_model_from_hf()
            logger.info(f"Model path: {model_path}")
            # Configure model parameters based on available hardware
            n_gpu_layers = 40 if self.has_gpu else 0
            logger.info(f"Using {'GPU' if self.has_gpu else 'CPU'} for inference")
            self.llm = LlamaCpp(
                model_path=str(model_path),
                n_gpu_layers=n_gpu_layers,
                n_ctx=4096,
                n_batch=512 if self.has_gpu else 128,
                verbose=True,
                temperature=0.7,
                max_tokens=2048,
                top_p=0.95,
                top_k=50,
                f16_kv=self.has_gpu,
                use_mlock=True,
                use_mmap=True,
            )
            # Thread lock for concurrent API requests
            self.lock = Lock()
        except Exception as e:
            logger.error(f"Failed to initialize model: {str(e)}")
            raise

    # ... [rest of the QwenModel class methods remain the same] ...
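
    # Illustrative sketch only: the elided methods are assumed to include a chat
    # helper roughly like the one below, which the API layer can call. The name
    # `chat` and the naive prompt format are assumptions, not the original code.
    def chat(self, messages: List[Dict[str, str]], temperature: float = 0.7,
             max_tokens: int = 2048) -> str:
        """Render the chat history into a single prompt and run inference."""
        prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
        prompt += "\nassistant:"
        # LlamaCpp is not thread-safe, so serialize concurrent requests
        with self.lock:
            return self.llm.invoke(
                prompt,
                temperature=temperature,
                max_tokens=max_tokens,
            )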

# Global model instance
model = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context manager for FastAPI startup and shutdown events."""
    global model
    try:
        model = QwenModel()
        logger.info("Model initialized successfully")
        yield
    finally:
        # Cleanup code (if needed)
        pass

# Initialize FastAPI with the lifespan handler (create the app only once)
app = FastAPI(title="Qwen 2.5 API", lifespan=lifespan)

# ... [rest of the FastAPI routes remain the same] ...
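
# Illustrative sketch only: the elided routes are assumed to include an
# OpenAI-compatible chat completions endpoint along these lines. The path, the
# response shape, and the call to the hypothetical `model.chat` helper sketched
# above are assumptions, not the original code.
@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    if model is None:
        raise HTTPException(status_code=503, detail="Model is not loaded yet")
    try:
        text = model.chat(
            request.messages,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
        )
        return JSONResponse({
            "object": "chat.completion",
            "model": request.model,
            "created": int(time.time()),
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop",
            }],
        })
    except Exception as e:
        logger.error(f"Inference failed: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))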

def main():
    """Main function to initialize and launch the application."""
    try:
        global model
        # Initialize the model if not already initialized
        if model is None:
            model = QwenModel()
        # Create the Gradio interface
        interface = create_gradio_interface(model)
        # Mount the Gradio interface onto the FastAPI app
        gr.mount_gradio_app(app, interface, path="/")
        # Launch with uvicorn
        uvicorn.run(
            app,
            host="0.0.0.0",
            port=7860,
            log_level="info"
        )
    except Exception as e:
        logger.error(f"Application failed to start: {str(e)}")
        raise

if __name__ == "__main__":
    main()
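
# Usage sketch (assumes the illustrative /v1/chat/completions route above):
#   curl http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "qwen2.5-14b", "messages": [{"role": "user", "content": "Hello"}]}'
# The Gradio UI is served at http://localhost:7860/ once the Space is running.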