measure retriever and reader time
app.py CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
 import logging
 import asyncio
 import os
+import time
 from uuid import uuid4
 from datetime import datetime, timedelta
 from pathlib import Path
@@ -139,8 +140,13 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
     vectorstore = vectorstores["docling"]
 
     ##------------------------------get context----------------------------------------------
+
+    ### adding for assessing computation time
+    start_time = time.time()
     context_retrieved = get_context(vectorstore=vectorstore,query=query,reports=reports,
                                     sources=sources,subtype=subtype)
+    end_time = time.time()
+    print("Time for retriever:",end_time - start_time)
     context_retrieved_formatted = "||".join(doc.page_content for doc in context_retrieved)
     context_retrieved_lst = [doc.page_content for doc in context_retrieved]
 
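The hunk above brackets the retriever call with an ad-hoc start/end pair. If more stages get timed later, a small helper keeps the pattern in one place; a minimal sketch using only the standard library (the name `timed` is illustrative, not part of app.py):

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Print wall-clock time for the wrapped block, in the same style as the
    # print("Time for retriever:", ...) call added in the hunk above.
    start = time.time()
    try:
        yield
    finally:
        print(f"Time for {label}:", time.time() - start)

# Usage, mirroring the retriever measurement:
# with timed("retriever"):
#     context_retrieved = get_context(vectorstore=vectorstore, query=query,
#                                     reports=reports, sources=sources, subtype=subtype)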
@@ -186,18 +192,19 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
         "session_duration_seconds": session_duration,
         "client_location": session_data['location_info'],
         "platform": session_data['platform_info'],
-
-
-
-        "subtype": subtype,
+        "system_prompt": SYSTEM_PROMPT,
+        "sources": sources,
+        "reports": reports,
+        "subtype": subtype,
         #"year": year,
         "question": query,
         "retriever": model_config.get('retriever','MODEL'),
         "endpoint_type": model_config.get('reader','TYPE'),
         "reader": model_config.get('reader','NVIDIA_MODEL'),
-
+        "docs": [doc.page_content for doc in context_retrieved],
     }
 
+
     if model_config.get('reader','TYPE') == 'NVIDIA':
         chat_model = nvidia_client()
         async def process_stream():
@@ -226,6 +233,8 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
 
     elif model_config.get('reader','TYPE') == 'DEDICATED':
         chat_model = dedicated_endpoint()
+        ### adding for assessing computation time
+        start_time = time.time()
         async def process_stream():
             # Without nonlocal, Python would create a new local variable answer_yet inside process_stream(),
             # instead of modifying the one from the outer scope.
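The context lines above explain why process_stream() needs `nonlocal`: assigning to a name inside a nested function makes that name local unless declared otherwise. A self-contained illustration of the rule:

def outer():
    answer_yet = ""
    def inner(token):
        nonlocal answer_yet   # without this, `answer_yet += token` would raise
        answer_yet += token   # UnboundLocalError: assignment makes the name local
    inner("hello")
    return answer_yet         # -> "hello"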
@@ -236,11 +245,15 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
                     answer_yet += token
                     parsed_answer = parse_output_llm_with_sources(answer_yet)
                     history[-1] = (query, parsed_answer)
+                    logs_data["answer"] = parsed_answer
                     yield [tuple(x) for x in history], docs_html, logs_data, session_id
-
+            end_time = time.time()
+            print("Time for reader:",end_time - start_time)
+
         # Stream the response updates
         async for update in process_stream():
             yield update
+
 
     else:
         chat_model = serverless_api() # TESTING: ADAPTED FOR HF INFERENCE API (needs to be reverted for production version)
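Note that start_time in the DEDICATED branch is set when the branch is entered, so the printed "Time for reader" covers endpoint setup plus the entire token stream. A sketch that also separates out time-to-first-token, assuming the same async-iterator shape as process_stream() (names here are illustrative, not from app.py):

import time

async def timed_stream(token_iter):
    # Wrap any async token iterator; report first-token latency and total time.
    start = time.perf_counter()   # perf_counter() is monotonic, unlike time.time()
    first = None
    async for token in token_iter:
        if first is None:
            first = time.perf_counter()
        yield token
    end = time.perf_counter()
    print("Time to first token:", (first if first is not None else end) - start)
    print("Time for reader:", end - start)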
@@ -276,6 +289,7 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
 
         async for update in process_stream():
             yield update
+
 
         # logging the event
         try: