Shreyas094 committed
Commit 0d9d94c · verified · 1 Parent(s): 8142ec6

Update app.py

Files changed (1):
  1. app.py +64 -20
app.py CHANGED
@@ -111,9 +111,9 @@ def update_vectors(files, parser):
         return "Please upload at least one PDF file.", display_documents()
 
     embed = get_embeddings()
-    total_chunks = 0
-
     all_data = []
+    seen_contents = set()
+
     for file in files:
         logging.info(f"Processing file: {file.name}")
         try:
@@ -122,8 +122,14 @@ def update_vectors(files, parser):
                 logging.warning(f"No chunks loaded from {file.name}")
                 continue
             logging.info(f"Loaded {len(data)} chunks from {file.name}")
-            all_data.extend(data)
-            total_chunks += len(data)
+
+            for chunk in data:
+                if chunk.page_content not in seen_contents:
+                    all_data.append(chunk)
+                    seen_contents.add(chunk.page_content)
+                else:
+                    logging.warning(f"Duplicate content detected in {file.name}, skipping...")
+
             if not any(doc["name"] == file.name for doc in uploaded_documents):
                 uploaded_documents.append({"name": file.name, "selected": True})
                 logging.info(f"Added new document to uploaded_documents: {file.name}")
@@ -132,8 +138,6 @@ def update_vectors(files, parser):
         except Exception as e:
             logging.error(f"Error processing file {file.name}: {str(e)}")
 
-    logging.info(f"Total chunks processed: {total_chunks}")
-
     if not all_data:
         logging.warning("No valid data extracted from uploaded files")
         return "No valid data could be extracted from the uploaded files. Please check the file contents and try again.", display_documents()
@@ -153,6 +157,17 @@ def update_vectors(files, parser):
 
         database.save_local("faiss_database")
         logging.info("FAISS database saved")
+
+        # Check the database after updating
+        check_faiss_database()
+
+        # Analyze document similarity
+        analyze_document_similarity()
+
+        # Test document retrieval
+        test_document_retrieval("Tell me about the contents of the 8K filing")
+        test_document_retrieval("What information is in the 10Q report?")
+
     except Exception as e:
         logging.error(f"Error updating FAISS database: {str(e)}")
         return f"Error updating vector store: {str(e)}", display_documents()
@@ -160,7 +175,28 @@ def update_vectors(files, parser):
     save_documents(uploaded_documents)
     logging.info(f"Updated documents saved. Total documents: {len(uploaded_documents)}")
 
-    return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", display_documents()
+    return f"Vector store updated successfully. Processed {len(all_data)} chunks from {len(files)} files using {parser}.", display_documents()
+
+from sklearn.metrics.pairwise import cosine_similarity
+
+def analyze_document_similarity():
+    embed = get_embeddings()
+    database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
+
+    docs = list(database.docstore.docs.values())
+    embeddings = [database.embedding_function(doc.page_content) for doc in docs]
+
+    similarity_matrix = cosine_similarity(embeddings)
+
+    for i in range(len(docs)):
+        for j in range(i+1, len(docs)):
+            similarity = similarity_matrix[i][j]
+            logging.info(f"Similarity between {docs[i].metadata['source']} and {docs[j].metadata['source']}: {similarity}")
+            if similarity > 0.9:  # Adjust this threshold as needed
+                logging.warning(f"High similarity detected between {docs[i].metadata['source']} and {docs[j].metadata['source']}")
+
+# Call this after updating the vector store
+analyze_document_similarity()
 
 def delete_documents(selected_docs):
     global uploaded_documents
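Note: as committed, analyze_document_similarity reads database.docstore.docs and calls database.embedding_function on each chunk, and it is also invoked at module level before any index necessarily exists. With current LangChain FAISS wrappers the docstore is typically an InMemoryDocstore whose contents live in the private _dict attribute, and embeddings come from the Embeddings object. A sketch of an equivalent check under those assumptions (not the committed code):

from sklearn.metrics.pairwise import cosine_similarity

def analyze_document_similarity():
    embed = get_embeddings()
    database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)

    # Assumes LangChain's InMemoryDocstore; _dict is private API.
    docs = list(database.docstore._dict.values())
    if len(docs) < 2:
        return

    # One batched embedding call instead of one call per chunk.
    vectors = embed.embed_documents([doc.page_content for doc in docs])
    similarity_matrix = cosine_similarity(vectors)

    for i in range(len(docs)):
        for j in range(i + 1, len(docs)):
            if similarity_matrix[i][j] > 0.9:  # same threshold as the commit
                logging.warning(
                    f"High similarity ({similarity_matrix[i][j]:.2f}) between "
                    f"{docs[i].metadata['source']} and {docs[j].metadata['source']}"
                )

Dropping the module-level analyze_document_similarity() call also avoids failing at import time when faiss_database has not been created yet.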
@@ -490,13 +526,17 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
         return
 
     try:
-        retriever = database.as_retriever()
+        retriever = database.as_retriever(search_kwargs={"k": 10})  # Increase k to retrieve more documents
         logging.info(f"Retrieving relevant documents for query: {query}")
         relevant_docs = retriever.get_relevant_documents(query)
         logging.info(f"Number of relevant documents retrieved: {len(relevant_docs)}")
 
-        # Filter relevant_docs based on selected documents
-        filtered_docs = [doc for doc in relevant_docs if doc.metadata["source"] in selected_docs]
+        for i, doc in enumerate(relevant_docs):
+            logging.info(f"Relevant document {i+1}: {doc.metadata['source']}, Score: {doc.metadata.get('score', 'N/A')}")
+            logging.info(f"Relevant document {i+1} content preview: {doc.page_content[:100]}...")
+
+        # Filter relevant_docs based on selected documents, but keep original order
+        filtered_docs = [doc for doc in relevant_docs if any(selected_doc in doc.metadata["source"] for selected_doc in selected_docs)]
         logging.info(f"Number of filtered documents: {len(filtered_docs)}")
 
         if not filtered_docs:
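Note: this hunk raises k to 10 but still filters by source only after retrieval, so all ten hits can come from unselected files and filtered_docs can still end up empty. If filtering at query time is preferred, LangChain's FAISS store accepts a metadata filter through search_kwargs; exact support depends on the installed langchain/langchain-community version (dict filters are long-standing, callable filters are newer), so treat this as a sketch:

# Sketch: restrict the similarity search itself to the selected documents,
# so every retrieved chunk already comes from a selected source.
retriever = database.as_retriever(
    search_kwargs={
        "k": 10,
        "filter": lambda metadata: metadata.get("source") in selected_docs,
    }
)
relevant_docs = retriever.get_relevant_documents(query)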
@@ -505,24 +545,28 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
             return
 
         for i, doc in enumerate(filtered_docs):
-            logging.info(f"Document {i+1} source: {doc.metadata['source']}")
-            logging.info(f"Document {i+1} content preview: {doc.page_content[:100]}...")
+            logging.info(f"Filtered document {i+1} source: {doc.metadata['source']}")
+            logging.info(f"Filtered document {i+1} content preview: {doc.page_content[:100]}...")
 
-        context_str = "\n".join([doc.page_content for doc in filtered_docs])
+        context_str = "\n\n".join([f"Document: {doc.metadata['source']}\n{doc.page_content}" for doc in filtered_docs])
         logging.info(f"Total context length: {len(context_str)}")
 
+        prompt = f"""You are analyzing multiple financial documents. The following documents have been selected: {', '.join(selected_docs)}
+
+Using the following context from the selected PDF documents:
+
+{context_str}
+
+Please provide a detailed and complete response that answers the following user question, making sure to consider information from all selected documents: '{query}'
+
+If the information is not found in the provided context, please state that clearly."""
+
         if model == "@cf/meta/llama-3.1-8b-instruct":
             logging.info("Using Cloudflare API")
-            # Use Cloudflare API with the retrieved context
-            for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
+            for response in get_response_from_cloudflare(prompt=prompt, context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
                 yield response
         else:
             logging.info("Using Hugging Face API")
-            # Use Hugging Face API
-            prompt = f"""Using the following context from the PDF documents:
-            {context_str}
-            Write a detailed and complete response that answers the following user question: '{query}'"""
-
             client = InferenceClient(model, token=huggingface_token)
 
             response = ""
 