Spaces:

GIZ
/

audit_assistant

Running on T4

App Files Files Community

ppsingh committed on Aug 6, 2024

Commit

6702158

verified ·

1 Parent(s): 1a3a52c

Update auditqa/doc_process.py

Browse files

Files changed (1) hide show

auditqa/doc_process.py +25 -7

auditqa/doc_process.py CHANGED Viewed

@@ -8,13 +8,15 @@ from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInf
 from langchain_community.vectorstores import Qdrant
 from auditqa.reports import files, report_list
 device = 'cuda' if cuda.is_available() else 'cpu'
-#from dotenv import load_dotenv
-#load_dotenv()
-#HF_token = os.environ["HF_TOKEN"]
 path_to_data = "./data/pdf/"
 def process_pdf():
     docs = {}
     for file in report_list:
         try:
@@ -22,6 +24,7 @@ def process_pdf():
         except Exception as e:
             print("Exception: ", e)
     # text splitter based on the tokenizer of a model of your choosing
     # to make texts fit exactly a transformer's context window size
     # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
@@ -34,35 +37,49 @@ def process_pdf():
             strip_whitespace=True,
             separators=["\n\n", "\n"],
     )
     all_documents = {}
     categories = list(files.keys())
     for category in categories:
         print(category)
         all_documents[category] = []
         subtypes = list(files[category].keys())
         for subtype in subtypes:
             print(subtype)
             for file in files[category][subtype]:
                 doc_processed = text_splitter.split_documents(docs[file])
                 for doc in doc_processed:
                     doc.metadata["source"] = category
                     doc.metadata["subtype"] = subtype
                     doc.metadata["year"] = file[-4:]
                 all_documents[category].append(doc_processed)
     for key, docs_processed in all_documents.items():
         docs_processed = [item for sublist in docs_processed for item in sublist]
         all_documents[key] = docs_processed
     embeddings = HuggingFaceEmbeddings(
         model_kwargs = {'device': device},
         encode_kwargs = {'normalize_embeddings': True},
         model_name="BAAI/bge-small-en-v1.5"
     )
     qdrant_collections = {}
     for file,value in all_documents.items():
         print("emebddings for:",file)
         qdrant_collections[file] = Qdrant.from_documents(
@@ -71,6 +88,7 @@ def process_pdf():
             location=":memory:",
             collection_name=file,
         )
     print("done")
     return qdrant_collections

 from langchain_community.vectorstores import Qdrant
 from auditqa.reports import files, report_list
 device = 'cuda' if cuda.is_available() else 'cpu'
+# path to the pdf files
 path_to_data = "./data/pdf/"
 def process_pdf():
+    """
+    this method reads through the files and report_list to create the vector database
+    """
+    # load all the files using PyMuPDFLoader
     docs = {}
     for file in report_list:
         try:
         except Exception as e:
             print("Exception: ", e)
     # text splitter based on the tokenizer of a model of your choosing
     # to make texts fit exactly a transformer's context window size
     # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
             strip_whitespace=True,
             separators=["\n\n", "\n"],
     )
+    # Iterate through the files, which are grouped by 'source' (category) and
+    # 'subtype'; these are used in the UI for document selection
+    # and will be used later for filtering the database
     all_documents = {}
     categories = list(files.keys())
+    # iterate through 'source'
     for category in categories:
         print(category)
         all_documents[category] = []
         subtypes = list(files[category].keys())
+        # iterate through 'subtype' within the source
+        # example: source/category == 'District' has subtypes which are district names
         for subtype in subtypes:
             print(subtype)
             for file in files[category][subtype]:
+                # create the chunks
                 doc_processed = text_splitter.split_documents(docs[file])
+                # add metadata information
                 for doc in doc_processed:
                     doc.metadata["source"] = category
                     doc.metadata["subtype"] = subtype
                     doc.metadata["year"] = file[-4:]
                 all_documents[category].append(doc_processed)
+    # convert list of list to flat list
     for key, docs_processed in all_documents.items():
         docs_processed = [item for sublist in docs_processed for item in sublist]
         all_documents[key] = docs_processed
+    all_documents['allreports'] = [sublist for key,sublist in all_documents.items()]
+    all_documents['allreports'] = [item for sublist in all_documents['allreports'] for item in sublist]
+    # define embedding model
     embeddings = HuggingFaceEmbeddings(
         model_kwargs = {'device': device},
         encode_kwargs = {'normalize_embeddings': True},
         model_name="BAAI/bge-small-en-v1.5"
     )
+    # placeholder for collection
     qdrant_collections = {}
     for file,value in all_documents.items():
         print("emebddings for:",file)
         qdrant_collections[file] = Qdrant.from_documents(
             location=":memory:",
             collection_name=file,
         )
     print("done")
     return qdrant_collections