NaimaAqeel committed on
Commit
ac5f15c
·
verified ·
1 Parent(s): 187fb24

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -25
app.py CHANGED
@@ -59,12 +59,12 @@ llm = HuggingFaceEndpoint(
59
  # Initialize the HuggingFace embeddings
60
  embedding = HuggingFaceEmbeddings()
61
 
62
- # Load or create FAISS index
63
  index_path = "faiss_index.pkl"
64
  document_texts_path = "document_texts.pkl"
65
-
66
  document_texts = []
67
 
 
68
  if os.path.exists(index_path) and os.path.exists(document_texts_path):
69
  try:
70
  with open(index_path, "rb") as f:
@@ -76,8 +76,7 @@ if os.path.exists(index_path) and os.path.exists(document_texts_path):
76
  except Exception as e:
77
  print(f"Error loading FAISS index or document texts: {e}")
78
  else:
79
- # Create a new FAISS index if it doesn't exist
80
- index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
81
  with open(index_path, "wb") as f:
82
  pickle.dump(index, f)
83
  print("Created new FAISS index and saved to faiss_index.pkl")
@@ -86,7 +85,7 @@ def upload_files(files):
86
  global index, document_texts
87
  try:
88
  for file in files:
89
- file_path = file.name # Get the file path from the NamedString object
90
  if file_path.endswith('.pdf'):
91
  text = extract_text_from_pdf(file_path)
92
  elif file_path.endswith('.docx'):
@@ -94,23 +93,22 @@ def upload_files(files):
94
  else:
95
  return "Unsupported file format"
96
 
97
- print(f"Extracted text: {text[:100]}...") # Debug: Show the first 100 characters of the extracted text
98
 
99
- # Process the text and update FAISS index
100
  sentences = text.split("\n")
101
- embeddings = embedding_model.encode(sentences)
102
- print(f"Embeddings shape: {embeddings.shape}") # Debug: Show the shape of the embeddings
103
  index.add(np.array(embeddings))
104
- document_texts.extend(sentences) # Store sentences for retrieval
105
 
106
- # Save the updated index and documents
107
  with open(index_path, "wb") as f:
108
  pickle.dump(index, f)
109
  print("Saved updated FAISS index to faiss_index.pkl")
110
  with open(document_texts_path, "wb") as f:
111
  pickle.dump(document_texts, f)
112
  print("Saved updated document texts to document_texts.pkl")
113
-
114
  return "Files processed successfully"
115
  except Exception as e:
116
  print(f"Error processing files: {e}")
@@ -118,30 +116,28 @@ def upload_files(files):
118
 
119
  def query_text(text):
120
  try:
121
- print(f"Query text: {text}") # Debug: Show the query text
 
 
122
 
123
- # Encode the query text
124
- query_embedding = embedding_model.encode([text])
125
- print(f"Query embedding shape: {query_embedding.shape}") # Debug: Show the shape of the query embedding
126
-
127
- # Search the FAISS index
128
  D, I = index.search(np.array(query_embedding), k=5)
129
- print(f"Distances: {D}, Indices: {I}") # Debug: Show the distances and indices of the search results
130
-
131
  top_documents = []
132
  for idx in I[0]:
133
- if idx != -1 and idx < len(document_texts): # Ensure that a valid index is found
134
- top_documents.append(document_texts[idx]) # Append the actual sentences for the response
135
  else:
136
  print(f"Invalid index found: {idx}")
137
- return top_documents
 
138
  except Exception as e:
139
  print(f"Error querying text: {e}")
140
  return f"Error querying text: {e}"
141
 
142
- # Create Gradio interface
143
  with gr.Blocks() as demo:
144
- gr.Markdown("## Document Upload and Query System")
145
 
146
  with gr.Tab("Upload Files"):
147
  upload = gr.File(file_count="multiple", label="Upload PDF or DOCX files")
 
59
  # Initialize the HuggingFace embeddings
60
  embedding = HuggingFaceEmbeddings()
61
 
62
+ # FAISS index and storage paths
63
  index_path = "faiss_index.pkl"
64
  document_texts_path = "document_texts.pkl"
 
65
  document_texts = []
66
 
67
+ # Load or create FAISS index using cosine similarity (Inner Product + Normalized vectors)
68
  if os.path.exists(index_path) and os.path.exists(document_texts_path):
69
  try:
70
  with open(index_path, "rb") as f:
 
76
  except Exception as e:
77
  print(f"Error loading FAISS index or document texts: {e}")
78
  else:
79
+ index = faiss.IndexFlatIP(embedding_model.get_sentence_embedding_dimension())
 
80
  with open(index_path, "wb") as f:
81
  pickle.dump(index, f)
82
  print("Created new FAISS index and saved to faiss_index.pkl")
 
85
  global index, document_texts
86
  try:
87
  for file in files:
88
+ file_path = file.name
89
  if file_path.endswith('.pdf'):
90
  text = extract_text_from_pdf(file_path)
91
  elif file_path.endswith('.docx'):
 
93
  else:
94
  return "Unsupported file format"
95
 
96
+ print(f"Extracted text: {text[:100]}...")
97
 
 
98
  sentences = text.split("\n")
99
+ embeddings = embedding_model.encode(sentences, normalize_embeddings=True) # Cosine similarity step
100
+ print(f"Embeddings shape: {embeddings.shape}")
101
  index.add(np.array(embeddings))
102
+ document_texts.extend(sentences)
103
 
104
+ # Save updated index and texts
105
  with open(index_path, "wb") as f:
106
  pickle.dump(index, f)
107
  print("Saved updated FAISS index to faiss_index.pkl")
108
  with open(document_texts_path, "wb") as f:
109
  pickle.dump(document_texts, f)
110
  print("Saved updated document texts to document_texts.pkl")
111
+
112
  return "Files processed successfully"
113
  except Exception as e:
114
  print(f"Error processing files: {e}")
 
116
 
117
  def query_text(text):
118
  try:
119
+ print(f"Query text: {text}")
120
+ query_embedding = embedding_model.encode([text], normalize_embeddings=True) # Cosine similarity step
121
+ print(f"Query embedding shape: {query_embedding.shape}")
122
 
 
 
 
 
 
123
  D, I = index.search(np.array(query_embedding), k=5)
124
+ print(f"Distances: {D}, Indices: {I}")
125
+
126
  top_documents = []
127
  for idx in I[0]:
128
+ if idx != -1 and idx < len(document_texts):
129
+ top_documents.append(document_texts[idx])
130
  else:
131
  print(f"Invalid index found: {idx}")
132
+
133
+ return "\n\n".join(top_documents)
134
  except Exception as e:
135
  print(f"Error querying text: {e}")
136
  return f"Error querying text: {e}"
137
 
138
+ # Gradio Interface
139
  with gr.Blocks() as demo:
140
+ gr.Markdown("## Document Upload and Query System with Cosine Similarity")
141
 
142
  with gr.Tab("Upload Files"):
143
  upload = gr.File(file_count="multiple", label="Upload PDF or DOCX files")