Spaces:

NaimaAqeel
/

Chatbot

Runtime error

App Files Files Community

NaimaAqeel committed on Apr 19

Commit

ac5f15c

verified ·

1 Parent(s): 187fb24

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -25

app.py CHANGED Viewed

@@ -59,12 +59,12 @@ llm = HuggingFaceEndpoint(
 # Initialize the HuggingFace embeddings
 embedding = HuggingFaceEmbeddings()
-# Load or create FAISS index
 index_path = "faiss_index.pkl"
 document_texts_path = "document_texts.pkl"
 document_texts = []
 if os.path.exists(index_path) and os.path.exists(document_texts_path):
     try:
         with open(index_path, "rb") as f:
@@ -76,8 +76,7 @@ if os.path.exists(index_path) and os.path.exists(document_texts_path):
     except Exception as e:
         print(f"Error loading FAISS index or document texts: {e}")
 else:
-    # Create a new FAISS index if it doesn't exist
-    index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
     with open(index_path, "wb") as f:
         pickle.dump(index, f)
         print("Created new FAISS index and saved to faiss_index.pkl")
@@ -86,7 +85,7 @@ def upload_files(files):
     global index, document_texts
     try:
         for file in files:
-            file_path = file.name  # Get the file path from the NamedString object
             if file_path.endswith('.pdf'):
                 text = extract_text_from_pdf(file_path)
             elif file_path.endswith('.docx'):
@@ -94,23 +93,22 @@ def upload_files(files):
             else:
                 return "Unsupported file format"
-            print(f"Extracted text: {text[:100]}...")  # Debug: Show the first 100 characters of the extracted text
-            # Process the text and update FAISS index
             sentences = text.split("\n")
-            embeddings = embedding_model.encode(sentences)
-            print(f"Embeddings shape: {embeddings.shape}")  # Debug: Show the shape of the embeddings
             index.add(np.array(embeddings))
-            document_texts.extend(sentences)  # Store sentences for retrieval
-        # Save the updated index and documents
         with open(index_path, "wb") as f:
             pickle.dump(index, f)
             print("Saved updated FAISS index to faiss_index.pkl")
         with open(document_texts_path, "wb") as f:
             pickle.dump(document_texts, f)
             print("Saved updated document texts to document_texts.pkl")
         return "Files processed successfully"
     except Exception as e:
         print(f"Error processing files: {e}")
@@ -118,30 +116,28 @@ def upload_files(files):
 def query_text(text):
     try:
-        print(f"Query text: {text}")  # Debug: Show the query text
-        # Encode the query text
-        query_embedding = embedding_model.encode([text])
-        print(f"Query embedding shape: {query_embedding.shape}")  # Debug: Show the shape of the query embedding
-        # Search the FAISS index
         D, I = index.search(np.array(query_embedding), k=5)
-        print(f"Distances: {D}, Indices: {I}")  # Debug: Show the distances and indices of the search results
         top_documents = []
         for idx in I[0]:
-            if idx != -1 and idx < len(document_texts):  # Ensure that a valid index is found
-                top_documents.append(document_texts[idx])  # Append the actual sentences for the response
             else:
                 print(f"Invalid index found: {idx}")
-        return top_documents
     except Exception as e:
         print(f"Error querying text: {e}")
         return f"Error querying text: {e}"
-# Create Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("## Document Upload and Query System")
     with gr.Tab("Upload Files"):
         upload = gr.File(file_count="multiple", label="Upload PDF or DOCX files")

 # Initialize the HuggingFace embeddings
 embedding = HuggingFaceEmbeddings()
+# FAISS index and storage paths
 index_path = "faiss_index.pkl"
 document_texts_path = "document_texts.pkl"
 document_texts = []
+# Load or create FAISS index using cosine similarity (Inner Product + Normalized vectors)
 if os.path.exists(index_path) and os.path.exists(document_texts_path):
     try:
         with open(index_path, "rb") as f:
     except Exception as e:
         print(f"Error loading FAISS index or document texts: {e}")
 else:
+    index = faiss.IndexFlatIP(embedding_model.get_sentence_embedding_dimension())
     with open(index_path, "wb") as f:
         pickle.dump(index, f)
         print("Created new FAISS index and saved to faiss_index.pkl")
     global index, document_texts
     try:
         for file in files:
+            file_path = file.name
             if file_path.endswith('.pdf'):
                 text = extract_text_from_pdf(file_path)
             elif file_path.endswith('.docx'):
             else:
                 return "Unsupported file format"
+            print(f"Extracted text: {text[:100]}...")
             sentences = text.split("\n")
+            embeddings = embedding_model.encode(sentences, normalize_embeddings=True)  # Cosine similarity step
+            print(f"Embeddings shape: {embeddings.shape}")
             index.add(np.array(embeddings))
+            document_texts.extend(sentences)
+        # Save updated index and texts
         with open(index_path, "wb") as f:
             pickle.dump(index, f)
             print("Saved updated FAISS index to faiss_index.pkl")
         with open(document_texts_path, "wb") as f:
             pickle.dump(document_texts, f)
             print("Saved updated document texts to document_texts.pkl")
         return "Files processed successfully"
     except Exception as e:
         print(f"Error processing files: {e}")
 def query_text(text):
     try:
+        print(f"Query text: {text}")
+        query_embedding = embedding_model.encode([text], normalize_embeddings=True)  # Cosine similarity step
+        print(f"Query embedding shape: {query_embedding.shape}")
         D, I = index.search(np.array(query_embedding), k=5)
+        print(f"Distances: {D}, Indices: {I}")
         top_documents = []
         for idx in I[0]:
+            if idx != -1 and idx < len(document_texts):
+                top_documents.append(document_texts[idx])
             else:
                 print(f"Invalid index found: {idx}")
+        return "\n\n".join(top_documents)
     except Exception as e:
         print(f"Error querying text: {e}")
         return f"Error querying text: {e}"
+# Gradio Interface
 with gr.Blocks() as demo:
+    gr.Markdown("## Document Upload and Query System with Cosine Similarity")
     with gr.Tab("Upload Files"):
         upload = gr.File(file_count="multiple", label="Upload PDF or DOCX files")