OjciecTadeusz committed
Commit 595ab95 · verified · 1 Parent(s): 8345d88

Update app.py

Files changed (1): app.py +142 -197
app.py CHANGED
@@ -1,209 +1,154 @@
- from fastapi import FastAPI
- from pydantic import BaseModel
- from huggingface_hub import InferenceClient
- import uvicorn
-

  app = FastAPI()

- client = InferenceClient("Qwen/Qwen2.5-Coder-32B-Instruct")
-
- class Item(BaseModel):
-     prompt: str
-     history: list
-     system_prompt: str
-     temperature: float = 0.0
-     max_new_tokens: int = 1048
-     top_p: float = 0.15
-     repetition_penalty: float = 1.0
-
- def format_prompt(message, history):
-     prompt = "<s>"
-     for user_prompt, bot_response in history:
-         prompt += f"[INST] {user_prompt} [/INST]"
-         prompt += f" {bot_response}</s> "
-     prompt += f"[INST] {message} [/INST]"
-     return prompt
-
- def generate(item: Item):
-     temperature = float(item.temperature)
-     if temperature < 1e-2:
-         temperature = 1e-2
-     top_p = float(item.top_p)
-
-     generate_kwargs = dict(
-         temperature=temperature,
-         max_new_tokens=item.max_new_tokens,
-         top_p=top_p,
-         repetition_penalty=item.repetition_penalty,
-         do_sample=True,
-         seed=42,
-     )
-
-     formatted_prompt = format_prompt(f"{item.system_prompt}, {item.prompt}", item.history)
-     stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
-     output = ""
-
-     for response in stream:
-         output += response.token.text
-     return output
-
- @app.post("/generate/")
- async def generate_text(item: Item):
-     return {"response": generate(item)}
-
-
-
- # import gradio as gr
- # from fastapi import FastAPI, Request, HTTPException
- # from fastapi.responses import JSONResponse
- # import datetime
- # import requests
- # import os
- # import logging
- # import toml
-
- # # Initialize FastAPI
- # app = FastAPI()
-
- # # Configure logging
- # logging.basicConfig(level=logging.INFO)
- # logger = logging.getLogger(__name__)
-
- # # Load config
- # with open("config.toml") as f:
- #     config = toml.load(f)
-
- # #API_URL = os.getenv('API_URL')
- # #API_TOKEN = os.getenv('API_TOKEN')
- # # API_URL = 'https://ojciectadeusz-fastapi-inference-qwen2-5-coder-32-a0ab504.hf.space/v1/chat/completions'
- # API_URL = 'https://ojciectadeusz-fastapi-inference-qwen2.5-coder-32b-instruct.hf.space/v1/chat/completions'
- # headers = {
- #     "Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}",
- #     "Content-Type": "application/json"
- # }
-
- # def format_chat_response(response_text, prompt_tokens=0, completion_tokens=0):
- #     return {
- #         "id": f"chatcmpl-{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}",
- #         "object": "chat.completion",
- #         "created": int(datetime.datetime.now().timestamp()),
- #         "model": "Qwen/Qwen2.5-Coder-32B",
- #         "choices": [{
- #             "index": 0,
- #             "message": {
- #                 "role": "assistant",
- #                 "content": response_text
- #             },
- #             "finish_reason": "stop"
- #         }],
- #         "usage": {
- #             "prompt_tokens": prompt_tokens,
- #             "completion_tokens": completion_tokens,
- #             "total_tokens": prompt_tokens + completion_tokens
- #         }
- #     }
-
- # async def query_model(payload):
- #     try:
- #         response = requests.post(API_URL, headers=headers, json=payload)
- #         response.raise_for_status()
- #         return response.json()
- #     except requests.exceptions.RequestException as e:
- #         logger.error(f"Request failed: {e}")
- #         raise HTTPException(status_code=500, detail=str(e))
-
- # @app.get("/status")
- # async def status():
- #     try:
-
- #         response_text = os.getenv('HF_API_TOKEN') + "it's working"
- #         return JSONResponse(content=format_chat_response(response_text))
- #     except Exception as e:
- #         logger.error(f"Status check failed: {e}")
- #         raise HTTPException(status_code=500, detail=str(e))
-
- # @app.post("/v1/chat/completions")
- # async def chat_completion(request: Request):
- #     try:
- #         data = await request.json()
- #         messages = data.get("messages", [])
- #         if not messages:
- #             raise HTTPException(status_code=400, detail="Messages are required")
-
- #         payload = {
- #             "inputs": {
- #                 "messages": messages
- #             },
- #             "parameters": {
- #                 "max_new_tokens": data.get("max_tokens", 2048),
- #                 "temperature": data.get("temperature", 0.7),
- #                 "top_p": data.get("top_p", 0.95),
- #                 "do_sample": True
- #             }
- #         }
-
- #         response = await query_model(payload)
-
- #         if isinstance(response, dict) and "error" in response:
- #             raise HTTPException(status_code=500, detail=response["error"])
-
- #         response_text = response[0]["generated_text"]
-
- #         return JSONResponse(content=format_chat_response(response_text))
- #     except HTTPException as e:
- #         logger.error(f"Chat completion failed: {e.detail}")
- #         raise e
- #     except Exception as e:
- #         logger.error(f"Unexpected error: {e}")
- #         raise HTTPException(status_code=500, detail=str(e))
-
- # def generate_response(messages):
- #     payload = {
- #         "inputs": {
- #             "messages": messages
- #         },
- #         "parameters": {
- #             "max_new_tokens": 2048,
- #             "temperature": 0.7,
- #             "top_p": 0.95,
- #             "do_sample": True
- #         }
- #     }
-
- #     try:
- #         response = requests.post(API_URL, headers=headers, json=payload)
- #         response.raise_for_status()
- #         result = response.json()
-
- #         if isinstance(result, dict) and "error" in result:
- #             return f"Error: {result['error']}"
-
- #         return result[0]["generated_text"]
- #     except requests.exceptions.RequestException as e:
- #         logger.error(f"Request failed: {e}")
- #         return f"Error: {e}"
-
- # def chat_interface(messages):
- #     chat_history = []
- #     for message in messages:
- #         try:
- #             response = generate_response([{"role": "user", "content": message}])
- #             chat_history.append({"role": "user", "content": message})
- #             chat_history.append({"role": "assistant", "content": response})
- #         except Exception as e:
- #             chat_history.append({"role": "user", "content": message})
- #             chat_history.append({"role": "assistant", "content": f"Error: {str(e)}"})
- #     return chat_history
-
- # # Create Gradio interface
- # def gradio_app():
- #     return gr.ChatInterface(chat_interface, type="messages")
-
- # # Mount both FastAPI and Gradio
- # app = gr.mount_gradio_app(app, gradio_app(), path="/")
-
- # # For running with uvicorn directly
- # if __name__ == "__main__":
- #     import uvicorn
- #     uvicorn.run(app, host="0.0.0.0", port=7860)
+
+ import gradio as gr
+ from fastapi import FastAPI, Request, HTTPException
+ from fastapi.responses import JSONResponse
+ import datetime
+ import requests
+ import os
+ import logging
+ import toml

+ # Initialize FastAPI
  app = FastAPI()

+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Load config
+ with open("config.toml") as f:
+     config = toml.load(f)
+
+ #API_URL = os.getenv('API_URL')
+ #API_TOKEN = os.getenv('API_TOKEN')
+ # API_URL = 'https://ojciectadeusz-fastapi-inference-qwen2-5-coder-32-a0ab504.hf.space/v1/chat/completions'
+ API_URL = 'https://ojciectadeusz-fastapi-inference-qwen2.5-coder-32b-instruct.hf.space/v1/chat/completions'
+ headers = {
+     "Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}",
+     "Content-Type": "application/json"
+ }
+
+ def format_chat_response(response_text, prompt_tokens=0, completion_tokens=0):
+     return {
+         "id": f"chatcmpl-{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}",
+         "object": "chat.completion",
+         "created": int(datetime.datetime.now().timestamp()),
+         "model": "Qwen/Qwen2.5-Coder-32B",
+         "choices": [{
+             "index": 0,
+             "message": {
+                 "role": "assistant",
+                 "content": response_text
+             },
+             "finish_reason": "stop"
+         }],
+         "usage": {
+             "prompt_tokens": prompt_tokens,
+             "completion_tokens": completion_tokens,
+             "total_tokens": prompt_tokens + completion_tokens
+         }
+     }
+
+ async def query_model(payload):
+     try:
+         response = requests.post(API_URL, headers=headers, json=payload)
+         response.raise_for_status()
+         return response.json()
+     except requests.exceptions.RequestException as e:
+         logger.error(f"Request failed: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.get("/status")
+ async def status():
+     try:
+
+         response_text = os.getenv('HF_API_TOKEN') + "it's working"
+         return JSONResponse(content=format_chat_response(response_text))
+     except Exception as e:
+         logger.error(f"Status check failed: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.post("/v1/chat/completions")
+ async def chat_completion(request: Request):
+     try:
+         data = await request.json()
+         messages = data.get("messages", [])
+         if not messages:
+             raise HTTPException(status_code=400, detail="Messages are required")
+
+         payload = {
+             "inputs": {
+                 "messages": messages
+             },
+             "parameters": {
+                 "max_new_tokens": data.get("max_tokens", 2048),
+                 "temperature": data.get("temperature", 0.7),
+                 "top_p": data.get("top_p", 0.95),
+                 "do_sample": True
+             }
+         }
+
+         response = await query_model(payload)
+
+         if isinstance(response, dict) and "error" in response:
+             raise HTTPException(status_code=500, detail=response["error"])
+
+         response_text = response[0]["generated_text"]
+
+         return JSONResponse(content=format_chat_response(response_text))
+     except HTTPException as e:
+         logger.error(f"Chat completion failed: {e.detail}")
+         raise e
+     except Exception as e:
+         logger.error(f"Unexpected error: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+ def generate_response(messages):
+     payload = {
+         "inputs": {
+             "messages": messages
+         },
+         "parameters": {
+             "max_new_tokens": 2048,
+             "temperature": 0.7,
+             "top_p": 0.95,
+             "do_sample": True
+         }
+     }
+
+     try:
+         response = requests.post(API_URL, headers=headers, json=payload)
+         response.raise_for_status()
+         result = response.json()
+
+         if isinstance(result, dict) and "error" in result:
+             return f"Error: {result['error']}"
+
+         return result[0]["generated_text"]
+     except requests.exceptions.RequestException as e:
+         logger.error(f"Request failed: {e}")
+         return f"Error: {e}"
+
+ def chat_interface(messages):
+     chat_history = []
+     for message in messages:
+         try:
+             response = generate_response([{"role": "user", "content": message}])
+             chat_history.append({"role": "user", "content": message})
+             chat_history.append({"role": "assistant", "content": response})
+         except Exception as e:
+             chat_history.append({"role": "user", "content": message})
+             chat_history.append({"role": "assistant", "content": f"Error: {str(e)}"})
+     return chat_history
+
+ # Create Gradio interface
+ def gradio_app():
+     return gr.ChatInterface(chat_interface, type="messages")
+
+ # Mount both FastAPI and Gradio
+ app = gr.mount_gradio_app(app, gradio_app(), path="/")
+
+ # For running with uvicorn directly
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=7860)
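
For reference, a minimal client sketch (not part of the commit) for the /v1/chat/completions endpoint added above. It assumes the app is running locally via the uvicorn.run(app, host="0.0.0.0", port=7860) call at the end of the file; the prompt text and parameter values are illustrative, while the payload field names and defaults mirror what chat_completion reads from the request body:

import requests

# Hypothetical local client; assumes the app is serving on port 7860.
resp = requests.post(
    "http://localhost:7860/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Write a hello-world in Python."}],
        "max_tokens": 256,    # optional; the handler defaults to 2048
        "temperature": 0.7,   # optional; handler default
        "top_p": 0.95,        # optional; handler default
    },
    timeout=120,
)
resp.raise_for_status()
body = resp.json()
# format_chat_response() wraps the generated text in an OpenAI-style
# chat.completion object, so the reply sits under choices[0].message.content.
print(body["choices"][0]["message"]["content"])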