import datetime
import logging
import os

import gradio as gr
import requests
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse

# Initialize FastAPI
app = FastAPI()

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
API_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B"
headers = {
    "Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}",
    "Content-Type": "application/json",
}


def format_chat_response(response_text, prompt_tokens=0, completion_tokens=0):
    """Wrap raw model output in an OpenAI-style chat.completion envelope."""
    now = datetime.datetime.now()
    return {
        "id": f"chatcmpl-{now.strftime('%Y%m%d%H%M%S')}",
        "object": "chat.completion",
        "created": int(now.timestamp()),
        "model": "Qwen/Qwen2.5-Coder-32B",
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": response_text},
                "finish_reason": "stop",
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }


async def query_model(payload):
    """Forward a payload to the Hugging Face Inference API.

    Note: requests.post is a blocking call; acceptable for a simple Space,
    but an async HTTP client would avoid stalling the event loop under load.
    """
    try:
        response = requests.post(API_URL, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"Request failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/status")
async def status():
    try:
        response_text = "it's working"
        return JSONResponse(content=format_chat_response(response_text))
    except Exception as e:
        logger.error(f"Status check failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/v1/chat/completions")
async def chat_completion(request: Request):
    try:
        data = await request.json()
        messages = data.get("messages", [])
        if not messages:
            raise HTTPException(status_code=400, detail="Messages are required")

        payload = {
            "inputs": {"messages": messages},
            "parameters": {
                "max_new_tokens": data.get("max_tokens", 2048),
                "temperature": data.get("temperature", 0.7),
                "top_p": data.get("top_p", 0.95),
                "do_sample": True,
            },
        }

        response = await query_model(payload)
        if isinstance(response, dict) and "error" in response:
            raise HTTPException(status_code=500, detail=response["error"])

        response_text = response[0]["generated_text"]
        return JSONResponse(content=format_chat_response(response_text))
    except HTTPException as e:
        logger.error(f"Chat completion failed: {e.detail}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        raise HTTPException(status_code=500, detail=str(e))


def generate_response(messages):
    """Synchronous helper used by the Gradio UI."""
    payload = {
        "inputs": {"messages": messages},
        "parameters": {
            "max_new_tokens": 2048,
            "temperature": 0.7,
            "top_p": 0.95,
            "do_sample": True,
        },
    }
    try:
        response = requests.post(API_URL, headers=headers, json=payload)
        response.raise_for_status()
        result = response.json()
        if isinstance(result, dict) and "error" in result:
            return f"Error: {result['error']}"
        return result[0]["generated_text"]
    except requests.exceptions.RequestException as e:
        logger.error(f"Request failed: {e}")
        return f"Error: {e}"


def chat_interface(message, history):
    """Gradio ChatInterface callback.

    With type="messages", Gradio calls this with the new user message and the
    running history (a list of {"role", "content"} dicts) and expects the
    assistant reply as a string.
    """
    messages = list(history) + [{"role": "user", "content": message}]
    try:
        return generate_response(messages)
    except Exception as e:
        return f"Error: {e}"
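# Note on the assumed upstream contract: the parsing in query_model and
# generate_response expects the classic HF text-generation response shape,
# i.e. a JSON list such as
#
#     [{"generated_text": "def fizzbuzz(n): ..."}]
#
# plus an {"error": "..."} dict while the model is loading or on failure.
# The nested {"inputs": {"messages": ...}} payload is this app's own
# convention; if the hosted endpoint only accepts a plain string prompt,
# the messages would need to be flattened into one string before sending.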
# Create Gradio interface
def gradio_app():
    return gr.ChatInterface(chat_interface, type="messages")


# Mount both FastAPI and Gradio
app = gr.mount_gradio_app(app, gradio_app(), path="/")

# For running with uvicorn directly
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
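# --- Example usage (sketch) ---
# A minimal client-side sketch, assuming the server is running locally on
# port 7860 (the default used above). The path, request body, and response
# envelope mirror the /v1/chat/completions handler defined in this file.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/v1/chat/completions",
#       json={
#           "messages": [{"role": "user", "content": "Write FizzBuzz in Python."}],
#           "max_tokens": 256,
#           "temperature": 0.2,
#       },
#       timeout=120,
#   )
#   resp.raise_for_status()
#   print(resp.json()["choices"][0]["message"]["content"])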