ppsingh committed
Commit 9a13482 · verified · Parent: 0d7e1df

measure retriever and reader time

Files changed (1): app.py (+20 -6)
app.py CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
 import logging
 import asyncio
 import os
+import time
 from uuid import uuid4
 from datetime import datetime, timedelta
 from pathlib import Path
@@ -139,8 +140,13 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
     vectorstore = vectorstores["docling"]
 
     ##------------------------------get context----------------------------------------------
+
+    ### adding for assessing computation time
+    start_time = time.time()
     context_retrieved = get_context(vectorstore=vectorstore,query=query,reports=reports,
                                     sources=sources,subtype=subtype)
+    end_time = time.time()
+    print("Time for retriever:",end_time - start_time)
     context_retrieved_formatted = "||".join(doc.page_content for doc in context_retrieved)
     context_retrieved_lst = [doc.page_content for doc in context_retrieved]
 
@@ -186,18 +192,19 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
         "session_duration_seconds": session_duration,
         "client_location": session_data['location_info'],
         "platform": session_data['platform_info'],
-        # "system_prompt": SYSTEM_PROMPT, #REMOVED FOR TESTING
-        # "sources": sources, #REMOVED FOR TESTING
-        # "reports": reports, #REMOVED FOR TESTING
-        "subtype": subtype, #REMOVED FOR TESTING
+        "system_prompt": SYSTEM_PROMPT,
+        "sources": sources,
+        "reports": reports,
+        "subtype": subtype,
         #"year": year,
         "question": query,
         "retriever": model_config.get('retriever','MODEL'),
         "endpoint_type": model_config.get('reader','TYPE'),
         "reader": model_config.get('reader','NVIDIA_MODEL'),
-        # "docs": [doc.page_content for doc in context_retrieved], #REMOVED FOR TESTING
+        "docs": [doc.page_content for doc in context_retrieved],
     }
 
+
     if model_config.get('reader','TYPE') == 'NVIDIA':
         chat_model = nvidia_client()
         async def process_stream():
@@ -226,6 +233,8 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
 
     elif model_config.get('reader','TYPE') == 'DEDICATED':
         chat_model = dedicated_endpoint()
+        ### adding for assessing computation time
+        start_time = time.time()
         async def process_stream():
             # Without nonlocal, Python would create a new local variable answer_yet inside process_stream(),
             # instead of modifying the one from the outer scope.
@@ -236,11 +245,15 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
                 answer_yet += token
                 parsed_answer = parse_output_llm_with_sources(answer_yet)
                 history[-1] = (query, parsed_answer)
+                logs_data["answer"] = parsed_answer
                 yield [tuple(x) for x in history], docs_html, logs_data, session_id
-
+            end_time = time.time()
+            print("Time for reader:",end_time - start_time)
+
         # Stream the response updates
         async for update in process_stream():
             yield update
+
 
     else:
         chat_model = serverless_api() # TESTING: ADAPTED FOR HF INFERENCE API (needs to be reverted for production version)
@@ -276,6 +289,7 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
 
         async for update in process_stream():
             yield update
+
 
     # logging the event
     try:
 
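The instrumentation this commit adds is plain wall-clock bracketing: record `time.time()` before the retriever call, record it again after, and print the difference. A minimal sketch of the same pattern as a reusable helper; `timed` is a hypothetical name, and for interval measurement `time.perf_counter()` is generally preferable to `time.time()` because it is monotonic:

```python
import logging
import time

logger = logging.getLogger(__name__)

def timed(label, fn, *args, **kwargs):
    """Call fn, log how long it took under the given label, and return its result."""
    start = time.perf_counter()  # monotonic clock, unaffected by system clock adjustments
    result = fn(*args, **kwargs)
    logger.info("Time for %s: %.3f s", label, time.perf_counter() - start)
    return result

# Usage against the retriever call from the diff (get_context is the app's own helper):
# context_retrieved = timed("retriever", get_context, vectorstore=vectorstore, query=query,
#                           reports=reports, sources=sources, subtype=subtype)
```

Routing the measurement through `logging` rather than bare `print` would also keep it consistent with the `import logging` already at the top of app.py.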
 
 
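The `nonlocal` comment carried through the diff describes a real Python scoping rule: assigning to a name inside a nested function makes that name local unless you declare otherwise. A self-contained illustration, with names chosen to echo the diff but not taken from app.py:

```python
def outer():
    answer_yet = ""

    def append_broken(token):
        # Assigning here makes answer_yet local to append_broken, so the
        # read implied by += fails: calling this raises UnboundLocalError.
        answer_yet += token

    def append_ok(token):
        nonlocal answer_yet  # rebind the answer_yet defined in outer()
        answer_yet += token

    append_ok("hello")
    return answer_yet

print(outer())  # prints: hello
```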
 
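For the reader, `start_time` is set before `process_stream()` is defined and the elapsed time is printed inside it after the token loop drains, so the reported figure spans the full stream from request to last token. A runnable sketch of that end-to-end timing around an async generator; `fake_token_stream` is a stand-in for the chat model's streaming call, not part of app.py:

```python
import asyncio
import time

async def fake_token_stream():
    """Stand-in for the chat model's token stream."""
    for token in ["Hello", ", ", "world", "!"]:
        await asyncio.sleep(0.05)  # simulate per-token latency
        yield token

async def main():
    answer_yet = ""
    start_time = time.time()  # clock starts before the first token is requested
    async for token in fake_token_stream():
        answer_yet += token   # accumulate the partial answer, as process_stream() does
    print("Time for reader:", time.time() - start_time)
    print("Answer:", answer_yet)

asyncio.run(main())
```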