Spaces: Running on CPU Upgrade
add new chunks
- app.py +3 -3
- auditqa/__pycache__/__init__.cpython-310.pyc +0 -0
- auditqa/__pycache__/process_chunks.cpython-310.pyc +0 -0
- auditqa/__pycache__/reader.cpython-310.pyc +0 -0
- auditqa/__pycache__/reports.cpython-310.pyc +0 -0
- auditqa/__pycache__/retriever.cpython-310.pyc +0 -0
- auditqa/__pycache__/sample_questions.cpython-310.pyc +0 -0
- auditqa/__pycache__/utils.cpython-310.pyc +0 -0
- auditqa/process_chunks.py +54 -0
app.py
CHANGED
@@ -9,7 +9,7 @@ from pathlib import Path
 from huggingface_hub import CommitScheduler
 from auditqa.sample_questions import QUESTIONS
 from auditqa.reports import files, report_list
-from auditqa.process_chunks import load_chunks, getconfig, get_local_qdrant
+from auditqa.process_chunks import load_chunks, getconfig, get_local_qdrant, load_new_chunks
 from auditqa.retriever import get_context
 from auditqa.reader import nvidia_client, dedicated_endpoint
 from auditqa.utils import make_html_source, parse_output_llm_with_sources, save_logs, get_message_template
@@ -40,9 +40,9 @@ scheduler = CommitScheduler(
 # We need to create the local vectorstore collection once using load_chunks
 # vectorestore colection are stored on persistent storage so this needs to be run only once
 # hence, comment out line below when creating for first time
-
+vectorstores = load_new_chunks()
 # once the vectore embeddings are created we will use qdrant client to access these
-vectorstores = get_local_qdrant()
+#vectorstores = get_local_qdrant()
 
 #####---------------------CHAT-----------------------------------------------------
 def start_chat(query,history):
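The hunks above wire the new load_new_chunks() into app.py: for this run the collection is rebuilt from the updated chunk file, and the line that merely connects to the previously persisted collection is commented out. The comments describe a manual toggle between the two lines. As a minimal sketch only (not part of this commit), the same switch could be keyed off the persistent Qdrant path that load_new_chunks() writes to:

from pathlib import Path

from auditqa.process_chunks import load_new_chunks, get_local_qdrant

# Path used by Qdrant.from_documents in this commit (persistent storage on the Space).
QDRANT_PATH = Path("/data/local_qdrant")

if QDRANT_PATH.exists():
    # embeddings already persisted: connect to the existing collection
    vectorstores = get_local_qdrant()
else:
    # first run: build the collection and persist it
    vectorstores = load_new_chunks()

This is only an illustration of the toggle the comments describe; the commit itself switches the lines by hand because new chunks have to be re-embedded even though the storage path already exists.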
auditqa/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/auditqa/__pycache__/__init__.cpython-310.pyc and b/auditqa/__pycache__/__init__.cpython-310.pyc differ

auditqa/__pycache__/process_chunks.cpython-310.pyc
CHANGED
Binary files a/auditqa/__pycache__/process_chunks.cpython-310.pyc and b/auditqa/__pycache__/process_chunks.cpython-310.pyc differ

auditqa/__pycache__/reader.cpython-310.pyc
CHANGED
Binary files a/auditqa/__pycache__/reader.cpython-310.pyc and b/auditqa/__pycache__/reader.cpython-310.pyc differ

auditqa/__pycache__/reports.cpython-310.pyc
CHANGED
Binary files a/auditqa/__pycache__/reports.cpython-310.pyc and b/auditqa/__pycache__/reports.cpython-310.pyc differ

auditqa/__pycache__/retriever.cpython-310.pyc
CHANGED
Binary files a/auditqa/__pycache__/retriever.cpython-310.pyc and b/auditqa/__pycache__/retriever.cpython-310.pyc differ

auditqa/__pycache__/sample_questions.cpython-310.pyc
CHANGED
Binary files a/auditqa/__pycache__/sample_questions.cpython-310.pyc and b/auditqa/__pycache__/sample_questions.cpython-310.pyc differ

auditqa/__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/auditqa/__pycache__/utils.cpython-310.pyc and b/auditqa/__pycache__/utils.cpython-310.pyc differ
auditqa/process_chunks.py
CHANGED
@@ -113,6 +113,60 @@ def load_chunks():
     print("vector embeddings done")
     return qdrant_collections
 
+def load_new_chunks():
+    """
+    this method reads through the files and report_list to create the vector database
+    """
+
+    # we iterate through the files which contain information about its
+    # 'source'=='category', 'subtype', these are used in UI for document selection
+    # which will be used later for filtering database
+    config = getconfig("./model_params.cfg")
+    files = pd.read_json("./axa_processed_chunks_update.json")
+    all_documents= []
+    # iterate through 'source'
+    for i in range(len(files)):
+        # load the chunks
+        try:
+            doc_processed = open_file(path_to_data + "/chunks/"+ os.path.basename(files.loc[i,'chunks_filepath']))
+            doc_processed = doc_processed['paragraphs']
+
+        except Exception as e:
+            print("Exception: ", e)
+        print("chunks in subtype:", files.loc[0,'filename'], "are:", len(doc_processed))
+
+        # add metadata information
+
+        for doc in doc_processed:
+            all_documents.append(Document(page_content= doc['content'],
+                                          metadata={"source": files.loc[i,'category'],
+                                                    "subtype": os.path.splitext(files.loc[i,'filename'])[0],
+                                                    "year": files.loc[i,'year'],
+                                                    "filename": files.loc[0,'filename'],
+                                                    "page": doc['metadata']['page'],
+                                                    "headings": doc['metadata']['headings']}))
+
+    # convert list of list to flat list
+    print("length of chunks:", len(all_documents))
+
+    # define embedding model
+    embeddings = HuggingFaceEmbeddings(
+        model_kwargs = {'device': device},
+        encode_kwargs = {'normalize_embeddings': bool(int(config.get('retriever','NORMALIZE')))},
+        model_name=config.get('retriever','MODEL')
+    )
+    # placeholder for collection
+    qdrant_collections = {}
+    qdrant_collections['allreports'] = Qdrant.from_documents(
+        all_documents,
+        embeddings,
+        path="/data/local_qdrant",
+        collection_name='allreports',
+    )
+    print(qdrant_collections)
+    print("vector embeddings done")
+    return qdrant_collections
+
 def get_local_qdrant():
     """once the local qdrant server is created this is used to make the connection to exisitng server"""
     config = getconfig("./model_params.cfg")
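The added load_new_chunks() mirrors the existing load_chunks(): it reads the updated chunk file, attaches source/subtype/year/filename/page/headings metadata to each Document, embeds with the model named in model_params.cfg, and persists a single 'allreports' collection under /data/local_qdrant. A rough usage sketch of how that metadata could then drive filtered retrieval is below; the filter values are placeholders and the exact filter format depends on the installed langchain / qdrant-client versions, so this is illustrative rather than part of the commit.

from auditqa.process_chunks import load_new_chunks

# Build (or rebuild) the collection from the updated chunks file.
vectorstores = load_new_chunks()

# Retrieve with a metadata filter on the fields attached above.
# "2023" and "consolidated" are hypothetical values, not taken from the commit.
retriever = vectorstores["allreports"].as_retriever(
    search_kwargs={
        "k": 5,
        "filter": {"year": "2023", "source": "consolidated"},
    }
)
docs = retriever.get_relevant_documents("What were the main audit findings?")
for d in docs:
    print(d.metadata["filename"], d.metadata["page"])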