ppsingh commited on
Commit
dc8b4b0
1 Parent(s): d201098

add new chunks

Browse files
app.py CHANGED
@@ -9,7 +9,7 @@ from pathlib import Path
9
  from huggingface_hub import CommitScheduler
10
  from auditqa.sample_questions import QUESTIONS
11
  from auditqa.reports import files, report_list
12
- from auditqa.process_chunks import load_chunks, getconfig, get_local_qdrant
13
  from auditqa.retriever import get_context
14
  from auditqa.reader import nvidia_client, dedicated_endpoint
15
  from auditqa.utils import make_html_source, parse_output_llm_with_sources, save_logs, get_message_template
@@ -40,9 +40,9 @@ scheduler = CommitScheduler(
40
  # We need to create the local vectorstore collection once using load_chunks
41
  # vectorstore collections are stored on persistent storage, so this needs to be run only once
42
  # hence, uncomment the line below only when creating the collection for the first time
43
- #vectorstores = load_chunks()
44
  # once the vector embeddings are created, we use the qdrant client to access them
45
- vectorstores = get_local_qdrant()
46
 
47
  #####---------------------CHAT-----------------------------------------------------
48
  def start_chat(query,history):
 
9
  from huggingface_hub import CommitScheduler
10
  from auditqa.sample_questions import QUESTIONS
11
  from auditqa.reports import files, report_list
12
+ from auditqa.process_chunks import load_chunks, getconfig, get_local_qdrant, load_new_chunks
13
  from auditqa.retriever import get_context
14
  from auditqa.reader import nvidia_client, dedicated_endpoint
15
  from auditqa.utils import make_html_source, parse_output_llm_with_sources, save_logs, get_message_template
 
40
  # We need to create the local vectorstore collection once using load_chunks
41
  # vectorstore collections are stored on persistent storage, so this needs to be run only once
42
  # hence, uncomment the line below only when creating the collection for the first time
43
+ vectorstores = load_new_chunks()
44
  # once the vector embeddings are created, we use the qdrant client to access them
45
+ #vectorstores = get_local_qdrant()
46
 
47
  #####---------------------CHAT-----------------------------------------------------
48
  def start_chat(query,history):
auditqa/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/auditqa/__pycache__/__init__.cpython-310.pyc and b/auditqa/__pycache__/__init__.cpython-310.pyc differ
 
auditqa/__pycache__/process_chunks.cpython-310.pyc CHANGED
Binary files a/auditqa/__pycache__/process_chunks.cpython-310.pyc and b/auditqa/__pycache__/process_chunks.cpython-310.pyc differ
 
auditqa/__pycache__/reader.cpython-310.pyc CHANGED
Binary files a/auditqa/__pycache__/reader.cpython-310.pyc and b/auditqa/__pycache__/reader.cpython-310.pyc differ
 
auditqa/__pycache__/reports.cpython-310.pyc CHANGED
Binary files a/auditqa/__pycache__/reports.cpython-310.pyc and b/auditqa/__pycache__/reports.cpython-310.pyc differ
 
auditqa/__pycache__/retriever.cpython-310.pyc CHANGED
Binary files a/auditqa/__pycache__/retriever.cpython-310.pyc and b/auditqa/__pycache__/retriever.cpython-310.pyc differ
 
auditqa/__pycache__/sample_questions.cpython-310.pyc CHANGED
Binary files a/auditqa/__pycache__/sample_questions.cpython-310.pyc and b/auditqa/__pycache__/sample_questions.cpython-310.pyc differ
 
auditqa/__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/auditqa/__pycache__/utils.cpython-310.pyc and b/auditqa/__pycache__/utils.cpython-310.pyc differ
 
auditqa/process_chunks.py CHANGED
@@ -113,6 +113,60 @@ def load_chunks():
113
  print("vector embeddings done")
114
  return qdrant_collections
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  def get_local_qdrant():
117
  """once the local qdrant server is created this is used to make the connection to exisitng server"""
118
  config = getconfig("./model_params.cfg")
 
113
  print("vector embeddings done")
114
  return qdrant_collections
115
 
116
def load_new_chunks():
    """
    Build the 'allreports' vector collection from the updated chunk manifest.

    The manifest "./axa_processed_chunks_update.json" is read with pandas; each
    row must provide the columns 'chunks_filepath', 'filename', 'category' and
    'year'. For every row, the chunk file with the same basename is loaded from
    path_to_data + "/chunks/" and every entry of its 'paragraphs' list becomes
    one Document. The metadata ('source', 'subtype', 'year', 'filename',
    'page', 'headings') is used in the UI for document selection and later for
    filtering the database.

    Rows whose chunk file cannot be loaded are reported and skipped, so a
    single bad file neither aborts the build nor leaks the previous file's
    chunks into the collection.

    Returns:
        dict: {'allreports': Qdrant vector store persisted at /data/local_qdrant}
    """
    config = getconfig("./model_params.cfg")
    files = pd.read_json("./axa_processed_chunks_update.json")
    chunks_dir = path_to_data + "/chunks/"
    all_documents = []

    # iterate over the manifest rows; using the index itself keeps the .loc
    # lookups valid even if the frame does not carry a default 0..n-1 index
    for i in files.index:
        filename = files.loc[i, 'filename']

        # load the chunks
        try:
            doc_processed = open_file(
                chunks_dir + os.path.basename(files.loc[i, 'chunks_filepath'])
            )
            doc_processed = doc_processed['paragraphs']
        except Exception as e:
            print("Exception: ", e)
            # without this, doc_processed would be undefined (first row) or
            # still hold the previous file's paragraphs
            print("skipping file:", filename)
            continue

        print("chunks in subtype:", filename, "are:", len(doc_processed))

        # add metadata information (taken from the current row, not row 0)
        for doc in doc_processed:
            all_documents.append(
                Document(
                    page_content=doc['content'],
                    metadata={
                        "source": files.loc[i, 'category'],
                        "subtype": os.path.splitext(filename)[0],
                        "year": files.loc[i, 'year'],
                        "filename": filename,
                        "page": doc['metadata']['page'],
                        "headings": doc['metadata']['headings'],
                    },
                )
            )

    print("length of chunks:", len(all_documents))

    # define embedding model
    embeddings = HuggingFaceEmbeddings(
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': bool(int(config.get('retriever', 'NORMALIZE')))},
        model_name=config.get('retriever', 'MODEL'),
    )

    # placeholder for collection
    qdrant_collections = {}
    qdrant_collections['allreports'] = Qdrant.from_documents(
        all_documents,
        embeddings,
        path="/data/local_qdrant",
        collection_name='allreports',
    )
    print(qdrant_collections)
    print("vector embeddings done")
    return qdrant_collections
169
+
170
  def get_local_qdrant():
171
  """once the local qdrant server is created this is used to make the connection to exisitng server"""
172
  config = getconfig("./model_params.cfg")