Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update auditqa/doc_process.py
Browse files - auditqa/doc_process.py +5 -3
auditqa/doc_process.py
CHANGED
@@ -45,17 +45,18 @@ def process_pdf():
|
|
45 |
categories = list(files.keys())
|
46 |
# iterate through 'source'
|
47 |
for category in categories:
|
48 |
-
print(category)
|
49 |
all_documents[category] = []
|
50 |
subtypes = list(files[category].keys())
|
51 |
# iterate through 'subtype' within the source
|
52 |
# example source/category == 'District', has subtypes which is district names
|
53 |
for subtype in subtypes:
|
54 |
-
print(subtype)
|
55 |
for file in files[category][subtype]:
|
56 |
|
57 |
# create the chunks
|
58 |
doc_processed = text_splitter.split_documents(docs[file])
|
|
|
59 |
|
60 |
# add metadata information
|
61 |
for doc in doc_processed:
|
@@ -69,6 +70,7 @@ def process_pdf():
|
|
69 |
# convert list of list to flat list
|
70 |
for key, docs_processed in all_documents.items():
|
71 |
docs_processed = [item for sublist in docs_processed for item in sublist]
|
|
|
72 |
all_documents[key] = docs_processed
|
73 |
all_documents['allreports'] = [sublist for key,sublist in all_documents.items()]
|
74 |
all_documents['allreports'] = [item for sublist in all_documents['allreports'] for item in sublist]
|
@@ -90,7 +92,7 @@ def process_pdf():
|
|
90 |
location=":memory:",
|
91 |
collection_name=file,
|
92 |
)
|
93 |
-
|
94 |
print("vector embeddings done")
|
95 |
return qdrant_collections
|
96 |
|
|
|
45 |
categories = list(files.keys())
|
46 |
# iterate through 'source'
|
47 |
for category in categories:
|
48 |
+
print("documents splitting in source:",category)
|
49 |
all_documents[category] = []
|
50 |
subtypes = list(files[category].keys())
|
51 |
# iterate through 'subtype' within the source
|
52 |
# example source/category == 'District', has subtypes which is district names
|
53 |
for subtype in subtypes:
|
54 |
+
print("document splitting for subtype:",subtype)
|
55 |
for file in files[category][subtype]:
|
56 |
|
57 |
# create the chunks
|
58 |
doc_processed = text_splitter.split_documents(docs[file])
|
59 |
+
print("chunks in subtype:",subtype, "are:",len(doc_processed))
|
60 |
|
61 |
# add metadata information
|
62 |
for doc in doc_processed:
|
|
|
70 |
# convert list of list to flat list
|
71 |
for key, docs_processed in all_documents.items():
|
72 |
docs_processed = [item for sublist in docs_processed for item in sublist]
|
73 |
+
print("length of chunks in source:",key, "are:",len(docs_processed))
|
74 |
all_documents[key] = docs_processed
|
75 |
all_documents['allreports'] = [sublist for key,sublist in all_documents.items()]
|
76 |
all_documents['allreports'] = [item for sublist in all_documents['allreports'] for item in sublist]
|
|
|
92 |
location=":memory:",
|
93 |
collection_name=file,
|
94 |
)
|
95 |
+
print(qdrant_collections)
|
96 |
print("vector embeddings done")
|
97 |
return qdrant_collections
|
98 |
|