Spaces:

GIZ
/

audit_assistant

Running on T4

App Files Files Community

ppsingh committed on Dec 4, 2024

Commit

dc8b4b0

1 Parent(s): d201098

add new chunks

Browse files

Files changed (9) hide show

app.py +3 -3
auditqa/__pycache__/__init__.cpython-310.pyc +0 -0
auditqa/__pycache__/process_chunks.cpython-310.pyc +0 -0
auditqa/__pycache__/reader.cpython-310.pyc +0 -0
auditqa/__pycache__/reports.cpython-310.pyc +0 -0
auditqa/__pycache__/retriever.cpython-310.pyc +0 -0
auditqa/__pycache__/sample_questions.cpython-310.pyc +0 -0
auditqa/__pycache__/utils.cpython-310.pyc +0 -0
auditqa/process_chunks.py +54 -0

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ from pathlib import Path
 from huggingface_hub import CommitScheduler
 from auditqa.sample_questions import QUESTIONS
 from auditqa.reports import files, report_list
-from auditqa.process_chunks import load_chunks, getconfig, get_local_qdrant
 from auditqa.retriever import get_context
 from auditqa.reader import nvidia_client, dedicated_endpoint
 from auditqa.utils import make_html_source, parse_output_llm_with_sources, save_logs, get_message_template
@@ -40,9 +40,9 @@ scheduler = CommitScheduler(
 # We need to create the local vectorstore collection once using load_chunks
 # vectorstore collections are stored on persistent storage, so this needs to be run only once
 # hence, uncomment the line below only when creating the collection for the first time
-#vectorstores = load_chunks()
 # once the vector embeddings are created we will use the qdrant client to access them
-vectorstores = get_local_qdrant()
 #####---------------------CHAT-----------------------------------------------------
 def start_chat(query,history):

 from huggingface_hub import CommitScheduler
 from auditqa.sample_questions import QUESTIONS
 from auditqa.reports import files, report_list
+from auditqa.process_chunks import load_chunks, getconfig, get_local_qdrant, load_new_chunks
 from auditqa.retriever import get_context
 from auditqa.reader import nvidia_client, dedicated_endpoint
 from auditqa.utils import make_html_source, parse_output_llm_with_sources, save_logs, get_message_template
 # We need to create the local vectorstore collection once using load_chunks
 # vectorstore collections are stored on persistent storage, so this needs to be run only once
 # hence, uncomment the line below only when creating the collection for the first time
+vectorstores = load_new_chunks()
 # once the vector embeddings are created we will use the qdrant client to access them
+#vectorstores = get_local_qdrant()
 #####---------------------CHAT-----------------------------------------------------
 def start_chat(query,history):

auditqa/__pycache__/__init__.cpython-310.pyc CHANGED Viewed

Binary files a/auditqa/__pycache__/__init__.cpython-310.pyc and b/auditqa/__pycache__/__init__.cpython-310.pyc differ

auditqa/__pycache__/process_chunks.cpython-310.pyc CHANGED Viewed

Binary files a/auditqa/__pycache__/process_chunks.cpython-310.pyc and b/auditqa/__pycache__/process_chunks.cpython-310.pyc differ

auditqa/__pycache__/reader.cpython-310.pyc CHANGED Viewed

Binary files a/auditqa/__pycache__/reader.cpython-310.pyc and b/auditqa/__pycache__/reader.cpython-310.pyc differ

auditqa/__pycache__/reports.cpython-310.pyc CHANGED Viewed

Binary files a/auditqa/__pycache__/reports.cpython-310.pyc and b/auditqa/__pycache__/reports.cpython-310.pyc differ

auditqa/__pycache__/retriever.cpython-310.pyc CHANGED Viewed

Binary files a/auditqa/__pycache__/retriever.cpython-310.pyc and b/auditqa/__pycache__/retriever.cpython-310.pyc differ

auditqa/__pycache__/sample_questions.cpython-310.pyc CHANGED Viewed

Binary files a/auditqa/__pycache__/sample_questions.cpython-310.pyc and b/auditqa/__pycache__/sample_questions.cpython-310.pyc differ

auditqa/__pycache__/utils.cpython-310.pyc CHANGED Viewed

Binary files a/auditqa/__pycache__/utils.cpython-310.pyc and b/auditqa/__pycache__/utils.cpython-310.pyc differ

auditqa/process_chunks.py CHANGED Viewed

@@ -113,6 +113,60 @@ def load_chunks():
     print("vector embeddings done")
     return qdrant_collections
 def get_local_qdrant():
     """once the local qdrant server is created this is used to make the connection to exisitng server"""
     config = getconfig("./model_params.cfg")

     print("vector embeddings done")
     return qdrant_collections
+def load_new_chunks():
+    """
+    this method reads through the files and report_list to create the vector database
+    """
+    #  we iterate through the files which contain information about its
+    # 'source'=='category', 'subtype', these are used in UI for document selection
+    #  which will be used later for filtering database
+    config = getconfig("./model_params.cfg")
+    files = pd.read_json("./axa_processed_chunks_update.json")
+    all_documents= []
+    # iterate through 'source'
+    for i in range(len(files)):
+        # load the chunks
+        try:
+            doc_processed = open_file(path_to_data + "/chunks/"+ os.path.basename(files.loc[i,'chunks_filepath']))
+            doc_processed = doc_processed['paragraphs']
+        except Exception as e:
+            print("Exception: ", e)
+        print("chunks in subtype:", files.loc[0,'filename'], "are:",len(doc_processed))
+        # add metadata information
+        for doc in doc_processed:
+            all_documents.append(Document(page_content= doc['content'],
+                        metadata={"source": files.loc[i,'category'],
+                                "subtype":os.path.splitext(files.loc[i,'filename'])[0],
+                                "year":files.loc[i,'year'],
+                                "filename":files.loc[0,'filename'],
+                                "page":doc['metadata']['page'],
+                                "headings":doc['metadata']['headings']}))
+    # convert list of list to flat list
+    print("length of chunks:",len(all_documents))
+    # define embedding model
+    embeddings = HuggingFaceEmbeddings(
+        model_kwargs = {'device': device},
+        encode_kwargs = {'normalize_embeddings': bool(int(config.get('retriever','NORMALIZE')))},
+        model_name=config.get('retriever','MODEL')
+    )
+    # placeholder for collection
+    qdrant_collections = {}
+    qdrant_collections['allreports'] = Qdrant.from_documents(
+                all_documents,
+                embeddings,
+                path="/data/local_qdrant",
+                collection_name='allreports',
+            )
+    print(qdrant_collections)
+    print("vector embeddings done")
+    return qdrant_collections
 def get_local_qdrant():
     """once the local qdrant server is created this is used to make the connection to exisitng server"""
     config = getconfig("./model_params.cfg")