import gradio as gr
from langchain_community.llms import LlamaCpp
import os
import json
import torch
import logging
from typing import Optional, List, Dict, Any
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import uvicorn
import time
from threading import Lock
from contextlib import asynccontextmanager
from pathlib import Path
from huggingface_hub import hf_hub_download, list_repo_files
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class ChatCompletionRequest(BaseModel):
model: str
messages: List[Dict[str, str]]
temperature: Optional[float] = 0.7
max_tokens: Optional[int] = 2048
stream: Optional[bool] = False
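# For reference, a request body matching this schema (illustrative values only):
#   {"model": "qwen2.5-14b-instruct",
#    "messages": [{"role": "user", "content": "Hello"}],
#    "temperature": 0.7, "max_tokens": 256}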
def get_model_filename():
"""Get the correct model filename from the repository."""
try:
logger.info("Listing repository files...")
files = list_repo_files("G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF")
# Filter for GGUF files
gguf_files = [f for f in files if f.endswith('.gguf')]
if not gguf_files:
raise ValueError("No GGUF model files found in repository")
logger.info(f"Found model files: {gguf_files}")
return gguf_files[0]
except Exception as e:
logger.error(f"Error listing repository files: {str(e)}")
raise
def download_model_from_hf():
"""Download the model file from Hugging Face."""
try:
logger.info("Downloading model from Hugging Face Hub...")
# Create models directory if it doesn't exist
model_dir = Path("models")
model_dir.mkdir(exist_ok=True)
# Get the correct filename
model_filename = get_model_filename()
logger.info(f"Using model file: {model_filename}")
# Download the model using huggingface_hub
local_path = hf_hub_download(
repo_id="G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF",
filename=model_filename,
local_dir=model_dir,
local_dir_use_symlinks=False
)
return Path(local_path)
except Exception as e:
logger.error(f"Error downloading model: {str(e)}")
raise
class QwenModel:
def __init__(self):
"""Initialize the Qwen model with automatic device detection."""
try:
# Check for GPU availability
self.has_gpu = torch.cuda.is_available()
self.device_count = torch.cuda.device_count() if self.has_gpu else 0
logger.info(f"GPU available: {self.has_gpu}, Device count: {self.device_count}")
# Download or get the model
model_path = download_model_from_hf()
logger.info(f"Model path: {model_path}")
# Configure model parameters based on available hardware
n_gpu_layers = 40 if self.has_gpu else 0
logger.info(f"Using {'GPU' if self.has_gpu else 'CPU'} for inference")
# Adjust memory settings for CPU
n_batch = 512 if self.has_gpu else 64 # Reduced batch size for CPU
            n_ctx = 4096 if self.has_gpu else 2048  # Reduced context window for CPU
self.llm = LlamaCpp(
model_path=str(model_path),
n_gpu_layers=n_gpu_layers,
n_ctx=n_ctx,
n_batch=n_batch,
verbose=True,
temperature=0.7,
max_tokens=2048,
top_p=0.95,
top_k=50,
f16_kv=self.has_gpu,
use_mlock=True,
use_mmap=True,
seed=42, # For reproducibility
repeat_penalty=1.1, # Prevent repetitive outputs
                rope_freq_scale=1.0,  # Linear RoPE scaling factor for long-context handling
)
# Thread lock for concurrent API requests
self.lock = Lock()
except Exception as e:
logger.error(f"Failed to initialize model: {str(e)}")
raise
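    # --- Illustrative sketch only: the real methods are elided below ("rest of the
    # QwenModel class methods"). The name generate_chat and the naive prompt
    # format are assumptions, not the original implementation. ---
    def generate_chat(self, messages: List[Dict[str, str]]) -> str:
        """Flatten chat messages into a single prompt and run the GGUF model."""
        prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
        prompt += "\nassistant:"
        with self.lock:  # llama.cpp handles one request at a time
            return self.llm.invoke(prompt)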
# ... [rest of the QwenModel class methods remain the same] ...
# Global model instance, created during FastAPI startup via the lifespan handler
model = None
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Lifespan context manager for FastAPI startup and shutdown events."""
global model
try:
model = QwenModel()
logger.info("Model initialized successfully")
yield
finally:
# Cleanup code (if needed)
pass
# Initialize FastAPI with the lifespan handler
app = FastAPI(title="Qwen 2.5 API", lifespan=lifespan)
# ... [rest of the FastAPI routes remain the same] ...
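# --- Illustrative sketches only: the real routes and Gradio interface are elided
# above. The endpoint path, the response shape, and the use of the hypothetical
# QwenModel.generate_chat method sketched earlier are assumptions. ---
@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """OpenAI-style chat completion endpoint backed by the local GGUF model."""
    if model is None:
        raise HTTPException(status_code=503, detail="Model not initialized")
    reply = model.generate_chat(request.messages)
    return JSONResponse({
        "object": "chat.completion",
        "created": int(time.time()),
        "model": request.model,
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": reply},
            "finish_reason": "stop",
        }],
    })

def create_gradio_interface(qwen_model: QwenModel):
    """Minimal stand-in for the elided Gradio chat UI, wired to the shared model."""
    def respond(message, history):
        return qwen_model.generate_chat([{"role": "user", "content": message}])
    return gr.ChatInterface(respond, title="Qwen 2.5 Chat")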
def main():
"""Main function to initialize and launch the application."""
try:
global model
# Initialize the model if not already initialized
if model is None:
model = QwenModel()
        # Create the Gradio interface and mount it onto the FastAPI app
        interface = create_gradio_interface(model)
        gr.mount_gradio_app(app, interface, path="/")
# Launch with uvicorn
uvicorn.run(
app,
host="0.0.0.0",
port=7860,
log_level="info"
)
except Exception as e:
logger.error(f"Application failed to start: {str(e)}")
raise
if __name__ == "__main__":
main()
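# Example client call (illustrative; assumes the /v1/chat/completions route
# sketched above and the server running locally on port 7860):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/v1/chat/completions",
#       json={"model": "qwen2.5-14b-instruct",
#             "messages": [{"role": "user", "content": "Hello!"}]},
#   )
#   print(resp.json()["choices"][0]["message"]["content"])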