Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update auditqa/doc_process.py
Browse files- auditqa/doc_process.py +25 -7
auditqa/doc_process.py
CHANGED
@@ -8,13 +8,15 @@ from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInf
|
|
8 |
from langchain_community.vectorstores import Qdrant
|
9 |
from auditqa.reports import files, report_list
|
10 |
device = 'cuda' if cuda.is_available() else 'cpu'
|
11 |
-
#from dotenv import load_dotenv
|
12 |
-
#load_dotenv()
|
13 |
|
14 |
-
#
|
15 |
path_to_data = "./data/pdf/"
|
16 |
|
17 |
def process_pdf():
|
|
|
|
|
|
|
|
|
18 |
docs = {}
|
19 |
for file in report_list:
|
20 |
try:
|
@@ -22,6 +24,7 @@ def process_pdf():
|
|
22 |
except Exception as e:
|
23 |
print("Exception: ", e)
|
24 |
|
|
|
25 |
# text splitter based on the tokenizer of a model of your choosing
|
26 |
# to make texts fit exactly a transformer's context window size
|
27 |
# langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
|
@@ -34,35 +37,49 @@ def process_pdf():
|
|
34 |
strip_whitespace=True,
|
35 |
separators=["\n\n", "\n"],
|
36 |
)
|
|
|
|
|
|
|
37 |
all_documents = {}
|
38 |
categories = list(files.keys())
|
|
|
39 |
for category in categories:
|
40 |
print(category)
|
41 |
all_documents[category] = []
|
42 |
subtypes = list(files[category].keys())
|
|
|
|
|
43 |
for subtype in subtypes:
|
44 |
print(subtype)
|
45 |
for file in files[category][subtype]:
|
|
|
|
|
46 |
doc_processed = text_splitter.split_documents(docs[file])
|
|
|
|
|
47 |
for doc in doc_processed:
|
48 |
doc.metadata["source"] = category
|
49 |
doc.metadata["subtype"] = subtype
|
50 |
doc.metadata["year"] = file[-4:]
|
51 |
|
52 |
all_documents[category].append(doc_processed)
|
53 |
-
|
|
|
54 |
for key, docs_processed in all_documents.items():
|
55 |
docs_processed = [item for sublist in docs_processed for item in sublist]
|
56 |
all_documents[key] = docs_processed
|
57 |
-
|
|
|
|
|
58 |
embeddings = HuggingFaceEmbeddings(
|
59 |
model_kwargs = {'device': device},
|
60 |
encode_kwargs = {'normalize_embeddings': True},
|
61 |
model_name="BAAI/bge-small-en-v1.5"
|
62 |
)
|
63 |
-
|
64 |
qdrant_collections = {}
|
65 |
-
|
|
|
66 |
for file,value in all_documents.items():
|
67 |
print("emebddings for:",file)
|
68 |
qdrant_collections[file] = Qdrant.from_documents(
|
@@ -71,6 +88,7 @@ def process_pdf():
|
|
71 |
location=":memory:",
|
72 |
collection_name=file,
|
73 |
)
|
|
|
74 |
print("done")
|
75 |
return qdrant_collections
|
76 |
|
|
|
8 |
from langchain_community.vectorstores import Qdrant
|
9 |
from auditqa.reports import files, report_list
|
10 |
device = 'cuda' if cuda.is_available() else 'cpu'
|
|
|
|
|
11 |
|
12 |
+
# path to the pdf files
|
13 |
path_to_data = "./data/pdf/"
|
14 |
|
15 |
def process_pdf():
|
16 |
+
"""
|
17 |
+
This function reads through `files` and `report_list` to create the vector database.
|
18 |
+
"""
|
19 |
+
# load all the files using PyMuPDFLoader
|
20 |
docs = {}
|
21 |
for file in report_list:
|
22 |
try:
|
|
|
24 |
except Exception as e:
|
25 |
print("Exception: ", e)
|
26 |
|
27 |
+
|
28 |
# text splitter based on the tokenizer of a model of your choosing
|
29 |
# to make texts fit exactly a transformer's context window size
|
30 |
# langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
|
|
|
37 |
strip_whitespace=True,
|
38 |
separators=["\n\n", "\n"],
|
39 |
)
|
40 |
+
# we iterate through `files`, which records for each file its
|
41 |
+
# 'source' (i.e. category) and 'subtype'; these are used in the UI for document selection
|
42 |
+
# and will be used later for filtering the database
|
43 |
all_documents = {}
|
44 |
categories = list(files.keys())
|
45 |
+
# iterate through 'source'
|
46 |
for category in categories:
|
47 |
print(category)
|
48 |
all_documents[category] = []
|
49 |
subtypes = list(files[category].keys())
|
50 |
+
# iterate through 'subtype' within the source
|
51 |
+
# for example, the source/category 'District' has the district names as its subtypes
|
52 |
for subtype in subtypes:
|
53 |
print(subtype)
|
54 |
for file in files[category][subtype]:
|
55 |
+
|
56 |
+
# create the chunks
|
57 |
doc_processed = text_splitter.split_documents(docs[file])
|
58 |
+
|
59 |
+
# add metadata information
|
60 |
for doc in doc_processed:
|
61 |
doc.metadata["source"] = category
|
62 |
doc.metadata["subtype"] = subtype
|
63 |
doc.metadata["year"] = file[-4:]
|
64 |
|
65 |
all_documents[category].append(doc_processed)
|
66 |
+
|
67 |
+
# flatten each category's list of lists into a single flat list
|
68 |
for key, docs_processed in all_documents.items():
|
69 |
docs_processed = [item for sublist in docs_processed for item in sublist]
|
70 |
all_documents[key] = docs_processed
|
71 |
+
all_documents['allreports'] = [sublist for key,sublist in all_documents.items()]
|
72 |
+
all_documents['allreports'] = [item for sublist in all_documents['allreports'] for item in sublist]
|
73 |
+
# define embedding model
|
74 |
embeddings = HuggingFaceEmbeddings(
|
75 |
model_kwargs = {'device': device},
|
76 |
encode_kwargs = {'normalize_embeddings': True},
|
77 |
model_name="BAAI/bge-small-en-v1.5"
|
78 |
)
|
79 |
+
# placeholder for the collections (one per key in all_documents)
|
80 |
qdrant_collections = {}
|
81 |
+
|
82 |
+
|
83 |
for file,value in all_documents.items():
|
84 |
print("emebddings for:",file)
|
85 |
qdrant_collections[file] = Qdrant.from_documents(
|
|
|
88 |
location=":memory:",
|
89 |
collection_name=file,
|
90 |
)
|
91 |
+
|
92 |
print("done")
|
93 |
return qdrant_collections
|
94 |
|