measure retriever and reader time
app.py CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
 import logging
 import asyncio
 import os
+import time
 from uuid import uuid4
 from datetime import datetime, timedelta
 from pathlib import Path
@@ -139,8 +140,13 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
     vectorstore = vectorstores["docling"]
 
     ##------------------------------get context----------------------------------------------
+
+    ### adding for assessing computation time
+    start_time = time.time()
     context_retrieved = get_context(vectorstore=vectorstore,query=query,reports=reports,
                                     sources=sources,subtype=subtype)
+    end_time = time.time()
+    print("Time for retriever:",end_time - start_time)
     context_retrieved_formatted = "||".join(doc.page_content for doc in context_retrieved)
     context_retrieved_lst = [doc.page_content for doc in context_retrieved]
 
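The hunk above brackets the retriever call with an ad-hoc start/end pair. If more stages get timed later, a small helper keeps the pattern in one place; a minimal sketch using only the standard library (the name `timed` is illustrative, not part of app.py):

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Print wall-clock time for the wrapped block, in the same style as the
    # print("Time for retriever:", ...) call added in the hunk above.
    start = time.time()
    try:
        yield
    finally:
        print(f"Time for {label}:", time.time() - start)

# Usage, mirroring the retriever measurement:
# with timed("retriever"):
#     context_retrieved = get_context(vectorstore=vectorstore, query=query,
#                                     reports=reports, sources=sources, subtype=subtype)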
@@ -186,18 +192,19 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
         "session_duration_seconds": session_duration,
         "client_location": session_data['location_info'],
         "platform": session_data['platform_info'],
-
-
-
-        "subtype": subtype,
+        "system_prompt": SYSTEM_PROMPT,
+        "sources": sources,
+        "reports": reports,
+        "subtype": subtype,
         #"year": year,
         "question": query,
         "retriever": model_config.get('retriever','MODEL'),
         "endpoint_type": model_config.get('reader','TYPE'),
         "reader": model_config.get('reader','NVIDIA_MODEL'),
-
+        "docs": [doc.page_content for doc in context_retrieved],
     }
 
+
     if model_config.get('reader','TYPE') == 'NVIDIA':
         chat_model = nvidia_client()
         async def process_stream():
@@ -226,6 +233,8 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
 
     elif model_config.get('reader','TYPE') == 'DEDICATED':
         chat_model = dedicated_endpoint()
+        ### adding for assessing computation time
+        start_time = time.time()
         async def process_stream():
             # Without nonlocal, Python would create a new local variable answer_yet inside process_stream(),
             # instead of modifying the one from the outer scope.
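The context lines above explain why process_stream() needs `nonlocal`: assigning to a name inside a nested function makes that name local unless declared otherwise. A self-contained illustration of the rule:

def outer():
    answer_yet = ""
    def inner(token):
        nonlocal answer_yet   # without this, `answer_yet += token` would raise
        answer_yet += token   # UnboundLocalError: assignment makes the name local
    inner("hello")
    return answer_yet         # -> "hello"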
@@ -236,11 +245,15 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
                     answer_yet += token
                     parsed_answer = parse_output_llm_with_sources(answer_yet)
                     history[-1] = (query, parsed_answer)
+                    logs_data["answer"] = parsed_answer
                     yield [tuple(x) for x in history], docs_html, logs_data, session_id
-
+            end_time = time.time()
+            print("Time for reader:",end_time - start_time)
+
         # Stream the response updates
         async for update in process_stream():
             yield update
+
 
     else:
         chat_model = serverless_api() # TESTING: ADAPTED FOR HF INFERENCE API (needs to be reverted for production version)
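Note that start_time in the DEDICATED branch is set when the branch is entered, so the printed "Time for reader" covers endpoint setup plus the entire token stream. A sketch that also separates out time-to-first-token, assuming the same async-iterator shape as process_stream() (names here are illustrative, not from app.py):

import time

async def timed_stream(token_iter):
    # Wrap any async token iterator; report first-token latency and total time.
    start = time.perf_counter()   # perf_counter() is monotonic, unlike time.time()
    first = None
    async for token in token_iter:
        if first is None:
            first = time.perf_counter()
        yield token
    end = time.perf_counter()
    print("Time to first token:", (first if first is not None else end) - start)
    print("Time for reader:", end - start)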
@@ -276,6 +289,7 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
 
         async for update in process_stream():
             yield update
+
 
         # logging the event
         try: