audit_assistant

Sleeping

ppsingh committed on Jul 22, 2024

Commit

1593f40

verified ·

1 Parent(s): 6240195

Update auditqa/doc_process.py

Files changed (1) hide show

auditqa/doc_process.py CHANGED Viewed

@@ -31,29 +31,37 @@ def process_pdf():
     text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
             AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
             chunk_size=chunk_size,
-            chunk_overlap=int(chunk_size / 10),
             add_start_index=True,
             strip_whitespace=True,
             separators=["\n\n", "\n"],
     )
-    all_documents = {}
     for file,value in docs.items():
         doc_processed = text_splitter.split_documents(value)
         for doc in doc_processed:
             doc.metadata["source"] = file
             doc.metadata["year"] = file[-4:]
-        all_documents[file] = doc_processed
-    print(all_documents.keys())
     embeddings = HuggingFaceEmbeddings(
         model_kwargs = {'device': 'cpu'},
         encode_kwargs = {'normalize_embeddings': True},
         model_name="BAAI/bge-small-en-v1.5"
     )
     qdrant_collections = {}
     for file,value in all_documents.items():
         print("emebddings for:",file)
         qdrant_collections[file] = Qdrant.from_documents(

     text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
             AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
             chunk_size=chunk_size,
+            chunk_overlap=int(chunk_size / 20),
             add_start_index=True,
             strip_whitespace=True,
             separators=["\n\n", "\n"],
     )
+    all_documents = {'Consolidated':[], 'MWTS':[]}
     for file,value in docs.items():
         doc_processed = text_splitter.split_documents(value)
         for doc in doc_processed:
             doc.metadata["source"] = file
             doc.metadata["year"] = file[-4:]
+        for key in all_documents:
+            if key in file:
+                print(key)
+                all_documents[key].append(doc_processed)
+    for key, docs_processed in all_documents.items():
+        docs_processed = [item for sublist in docs_processed for item in sublist]
+        all_documents[key] = docs_processed
     embeddings = HuggingFaceEmbeddings(
         model_kwargs = {'device': 'cpu'},
         encode_kwargs = {'normalize_embeddings': True},
         model_name="BAAI/bge-small-en-v1.5"
     )
     qdrant_collections = {}
     for file,value in all_documents.items():
         print("emebddings for:",file)
         qdrant_collections[file] = Qdrant.from_documents(