import gradio as gr
from langchain_community.llms import LlamaCpp
import os
import json
import torch
import logging
from typing import Optional, List, Dict, Any
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import uvicorn
import time
from threading import Lock
import requests
from pathlib import Path
from tqdm import tqdm
from contextlib import asynccontextmanager
from huggingface_hub import hf_hub_download

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[Dict[str, str]]
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 2048
    stream: Optional[bool] = False
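
# Example request body this schema accepts (illustrative values only, not from the
# original listing):
# {
#     "model": "qwen2.5-14b-instruct",
#     "messages": [{"role": "user", "content": "Hello"}],
#     "temperature": 0.7,
#     "max_tokens": 512,
#     "stream": false
# }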

def download_model_from_hf():
    """Download the model file from Hugging Face."""
    try:
        logger.info("Downloading model from Hugging Face Hub...")
        # Create models directory if it doesn't exist
        model_dir = Path("models")
        model_dir.mkdir(exist_ok=True)
        # Download the model using huggingface_hub
        local_path = hf_hub_download(
            repo_id="G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF",
            filename="model.gguf",
            local_dir=model_dir,
            local_dir_use_symlinks=False
        )
        return Path(local_path)
    except Exception as e:
        logger.error(f"Error downloading model: {str(e)}")
        raise

class QwenModel:
    def __init__(self):
        """Initialize the Qwen model with automatic device detection."""
        try:
            # Check for GPU availability
            self.has_gpu = torch.cuda.is_available()
            self.device_count = torch.cuda.device_count() if self.has_gpu else 0
            logger.info(f"GPU available: {self.has_gpu}, Device count: {self.device_count}")
            # Download or get the model
            model_path = download_model_from_hf()
            logger.info(f"Model path: {model_path}")
            # Configure model parameters based on available hardware
            n_gpu_layers = 40 if self.has_gpu else 0
            logger.info(f"Using {'GPU' if self.has_gpu else 'CPU'} for inference")
            self.llm = LlamaCpp(
                model_path=str(model_path),
                n_gpu_layers=n_gpu_layers,
                n_ctx=4096,
                n_batch=512 if self.has_gpu else 128,
                verbose=True,
                temperature=0.7,
                max_tokens=2048,
                top_p=0.95,
                top_k=50,
                f16_kv=self.has_gpu,
                use_mlock=True,
                use_mmap=True,
            )
            # Thread lock for concurrent API requests
            self.lock = Lock()
        except Exception as e:
            logger.error(f"Failed to initialize model: {str(e)}")
            raise

    # ... [rest of the QwenModel class methods remain the same] ...
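
    # Illustrative sketch only: the elided methods are assumed to include a chat
    # helper roughly like the one below, which the API layer can call. The name
    # `chat` and the naive prompt format are assumptions, not the original code.
    def chat(self, messages: List[Dict[str, str]], temperature: float = 0.7,
             max_tokens: int = 2048) -> str:
        """Render the chat history into a single prompt and run inference."""
        prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
        prompt += "\nassistant:"
        # LlamaCpp is not thread-safe, so serialize concurrent requests
        with self.lock:
            return self.llm.invoke(
                prompt,
                temperature=temperature,
                max_tokens=max_tokens,
            )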

# Global model instance
model = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context manager for FastAPI startup and shutdown events."""
    global model
    try:
        model = QwenModel()
        logger.info("Model initialized successfully")
        yield
    finally:
        # Cleanup code (if needed)
        pass

# Initialize FastAPI with the lifespan handler (create the app only once)
app = FastAPI(title="Qwen 2.5 API", lifespan=lifespan)

# ... [rest of the FastAPI routes remain the same] ...
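
# Illustrative sketch only: the elided routes are assumed to include an
# OpenAI-compatible chat completions endpoint along these lines. The path, the
# response shape, and the call to the hypothetical `model.chat` helper sketched
# above are assumptions, not the original code.
@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    if model is None:
        raise HTTPException(status_code=503, detail="Model is not loaded yet")
    try:
        text = model.chat(
            request.messages,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
        )
        return JSONResponse({
            "object": "chat.completion",
            "model": request.model,
            "created": int(time.time()),
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop",
            }],
        })
    except Exception as e:
        logger.error(f"Inference failed: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))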

def main():
    """Main function to initialize and launch the application."""
    try:
        global model
        # Initialize the model if not already initialized
        if model is None:
            model = QwenModel()
        # Create the Gradio interface
        interface = create_gradio_interface(model)
        # Mount the Gradio interface onto the FastAPI app
        gr.mount_gradio_app(app, interface, path="/")
        # Launch with uvicorn
        uvicorn.run(
            app,
            host="0.0.0.0",
            port=7860,
            log_level="info"
        )
    except Exception as e:
        logger.error(f"Application failed to start: {str(e)}")
        raise

if __name__ == "__main__":
    main()
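
# Usage sketch (assumes the illustrative /v1/chat/completions route above):
#   curl http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "qwen2.5-14b", "messages": [{"role": "user", "content": "Hello"}]}'
# The Gradio UI is served at http://localhost:7860/ once the Space is running.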