File size: 4,705 Bytes
a2280d2
922765a
37e4010
 
d6b0a9b
 
922765a
64ecb84
cce0194
37e4010
cce0194
 
922765a
 
 
 
64ecb84
 
 
 
 
d6b0a9b
64ecb84
d6b0a9b
 
404e508
d6b0a9b
37e4010
 
 
 
d6b0a9b
37e4010
 
 
 
 
 
 
 
 
 
 
 
cce0194
37e4010
cce0194
d6b0a9b
922765a
 
 
 
 
 
 
d6b0a9b
4b77577
 
 
 
922765a
4b77577
922765a
 
 
37e4010
 
cce0194
37e4010
 
922765a
 
 
d6b0a9b
 
 
 
 
 
 
 
 
 
 
404e508
d6b0a9b
37e4010
d6b0a9b
922765a
404e508
d6b0a9b
37e4010
922765a
 
 
 
cce0194
922765a
c9bc402
 
a2280d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c9bc402
 
 
a2280d2
 
 
 
 
 
 
 
 
c9bc402
 
 
 
 
a2280d2
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import asyncio
import datetime
import logging
import os
import uuid

import gradio as gr
import requests
import toml
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import JSONResponse

# Initialize FastAPI
app = FastAPI()

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load config
# TOML documents are UTF-8 by specification, so the encoding is pinned
# explicitly instead of relying on the platform's default locale encoding.
# The relative path is resolved against the current working directory.
with open("config.toml", encoding="utf-8") as f:
    config = toml.load(f)

# The config must provide an [llm] table with `api_url` and `api_key`;
# a missing key raises KeyError at import time.
API_URL = config["llm"]["api_url"]
headers = {
    "Authorization": f"Bearer {config['llm']['api_key']}",
    "Content-Type": "application/json",
}

def format_chat_response(response_text, prompt_tokens=0, completion_tokens=0,
                         model="Qwen/Qwen2.5-Coder-32B"):
    """Wrap generated text in a chat-completion style response dict.

    Args:
        response_text: Text placed in the single assistant message.
        prompt_tokens: Prompt token count reported under ``usage``.
        completion_tokens: Completion token count reported under ``usage``.
        model: Model name reported in the ``model`` field.

    Returns:
        A dict with ``id``, ``object``, ``created``, ``model``, ``choices``
        and ``usage`` keys. ``total_tokens`` is the sum of the two counts.
    """
    # Read the clock once so ``id`` and ``created`` describe the same instant.
    now = datetime.datetime.now()
    # A second-resolution timestamp alone collides when two responses are
    # built within the same second; the random suffix keeps ids unique.
    completion_id = f"chatcmpl-{now.strftime('%Y%m%d%H%M%S')}-{uuid.uuid4().hex[:12]}"
    return {
        "id": completion_id,
        "object": "chat.completion",
        "created": int(now.timestamp()),
        "model": model,
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": response_text,
            },
            "finish_reason": "stop",
        }],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }

async def query_model(payload, timeout=120):
    """POST ``payload`` to the configured model API and return the parsed JSON.

    Args:
        payload: JSON-serialisable request body sent to ``API_URL``.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a
            stalled upstream cannot hang the request forever.

    Returns:
        The decoded JSON body of the upstream response.

    Raises:
        HTTPException: With status 500 when the request fails, the upstream
            returns an error status, or the body is not valid JSON.
    """
    def _post():
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()

    try:
        # ``requests`` is blocking; run it in the default executor so the
        # event loop keeps serving other requests while we wait.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _post)
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        logger.error(f"Request failed: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e

@app.get("/status")
async def status():
    """Liveness endpoint: reply with a fixed message in chat-completion format.

    Raises:
        HTTPException: With status 500 if building the response fails.
    """
    try:
        body = format_chat_response("it's working")
        return JSONResponse(content=body)
    except Exception as exc:
        logger.error(f"Status check failed: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))

@app.post("/v1/chat/completions")
async def chat_completion(request: Request):
    """Handle a chat-completion request by forwarding it to the model API.

    Reads ``messages`` (required) plus optional ``max_tokens``,
    ``temperature`` and ``top_p`` from the JSON body, forwards them via
    ``query_model`` and wraps the generated text with
    ``format_chat_response``.

    Raises:
        HTTPException: 400 when the body is not a JSON object or has no
            messages; 500 when the upstream call fails, reports an error,
            or returns an unexpected shape.
    """
    try:
        # A malformed body is the client's fault: answer 400 instead of
        # letting the decode error fall through to the generic 500 handler.
        try:
            data = await request.json()
        except ValueError as e:
            raise HTTPException(status_code=400, detail="Request body must be valid JSON") from e
        if not isinstance(data, dict):
            raise HTTPException(status_code=400, detail="Request body must be a JSON object")

        messages = data.get("messages", [])
        if not messages:
            raise HTTPException(status_code=400, detail="Messages are required")

        payload = {
            "inputs": {
                "messages": messages
            },
            "parameters": {
                "max_new_tokens": data.get("max_tokens", 2048),
                "temperature": data.get("temperature", 0.7),
                "top_p": data.get("top_p", 0.95),
                "do_sample": True
            }
        }

        response = await query_model(payload)

        if isinstance(response, dict) and "error" in response:
            raise HTTPException(status_code=500, detail=response["error"])

        # Guard the shape so an empty list or a missing key yields a clear
        # message rather than the bare text of a KeyError/IndexError.
        try:
            response_text = response[0]["generated_text"]
        except (KeyError, IndexError, TypeError) as e:
            raise HTTPException(
                status_code=500,
                detail="Unexpected response format from model API",
            ) from e

        return JSONResponse(content=format_chat_response(response_text))
    except HTTPException as e:
        logger.error(f"Chat completion failed: {e.detail}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        raise HTTPException(status_code=500, detail=str(e))

def generate_response(messages, timeout=120):
    """Send ``messages`` to the model API and return the generated text.

    Failures are reported as a string starting with ``"Error: "`` rather
    than raised, so the caller can display them directly.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts sent as
            the conversation.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a
            stalled upstream cannot hang the caller forever.

    Returns:
        The generated text, or an ``"Error: ..."`` string on failure.
    """
    payload = {
        "inputs": {
            "messages": messages
        },
        "parameters": {
            "max_new_tokens": 2048,
            "temperature": 0.7,
            "top_p": 0.95,
            "do_sample": True
        }
    }

    try:
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        result = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        logger.error(f"Request failed: {e}")
        return f"Error: {e}"

    if isinstance(result, dict) and "error" in result:
        return f"Error: {result['error']}"

    # Keep the error-as-string contract when the payload has an unexpected
    # shape (empty list, missing key, non-indexable value).
    try:
        return result[0]["generated_text"]
    except (KeyError, IndexError, TypeError):
        logger.error("Unexpected response format from model API: %r", result)
        return "Error: unexpected response format from model API"

def chat_interface(messages, history=None):
    """Produce model replies for the Gradio chat UI.

    ``gr.ChatInterface`` calls its function with two arguments, the new
    user message and the chat history, so ``history`` must be accepted.
    A plain string is treated as one message, not iterated per character.

    Args:
        messages: Either a single user message (``str``) or an iterable of
            user messages.
        history: Prior conversation turns. Entries that are dicts with
            ``role`` and ``content`` keys are forwarded to the model; other
            entries are ignored. Only used when ``messages`` is a string.

    Returns:
        For a string message, the assistant reply as a string. For an
        iterable of messages, a list of alternating user/assistant message
        dicts, each message answered independently.
    """
    if isinstance(messages, str):
        conversation = [
            {"role": turn["role"], "content": turn["content"]}
            for turn in (history or [])
            if isinstance(turn, dict) and "role" in turn and "content" in turn
        ]
        conversation.append({"role": "user", "content": messages})
        try:
            return generate_response(conversation)
        except Exception as e:
            # UI boundary: show the failure in the chat instead of crashing.
            logger.error(f"Chat interface failed: {e}")
            return f"Error: {str(e)}"

    chat_history = []
    for message in messages:
        try:
            response = generate_response([{"role": "user", "content": message}])
        except Exception as e:
            response = f"Error: {str(e)}"
        chat_history.append({"role": "user", "content": message})
        chat_history.append({"role": "assistant", "content": response})
    return chat_history

# Create Gradio interface
def gradio_app():
    """Build and return the Gradio chat UI that wraps ``chat_interface``."""
    interface = gr.ChatInterface(fn=chat_interface, type="messages")
    return interface

# Mount both FastAPI and Gradio
# Rebinds `app` to the value returned by mounting the Gradio UI on the
# FastAPI instance at path "/". This runs after the route decorators above,
# so `app` is the object an ASGI server should import from this module.
app = gr.mount_gradio_app(app, gradio_app(), path="/")

# For running with uvicorn directly
# Only executed when this file is run as a script; uvicorn is imported
# lazily so importing the module does not require it.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)