Update app.py
app.py CHANGED
@@ -3,13 +3,13 @@ import gradio as gr
 import fitz  # PyMuPDF for PDF text extraction
 from docx import Document  # python-docx for DOCX text extraction
 from sentence_transformers import SentenceTransformer
-from langchain_community.vectorstores import FAISS
-from langchain_community.embeddings import HuggingFaceEmbeddings
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from nltk.tokenize import sent_tokenize
 import torch
 import pickle
 import nltk
+import faiss
+import numpy as np
 
 # Download NLTK punkt tokenizer data if not already downloaded
 nltk.download('punkt', quiet=True)
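Note on the import change: the commit drops LangChain's FAISS wrapper in favour of the faiss library used directly with numpy. As a point of reference (a minimal sketch, not part of the commit; the sample vectors are made up, and 384 is the output size of all-MiniLM-L6-v2), a flat L2 index is typically built and queried like this, with float32 arrays of shape (n, dim):

import faiss
import numpy as np

dim = 384                                    # all-MiniLM-L6-v2 produces 384-dim embeddings
index = faiss.IndexFlatL2(dim)               # exact (brute-force) L2 index

vectors = np.random.rand(10, dim).astype("float32")   # faiss expects float32, shape (n, dim)
index.add(vectors)                            # add all rows in one call

query = np.random.rand(1, dim).astype("float32")       # queries are also (n, dim)
distances, ids = index.search(query, 3)       # returns L2 distances and row ids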
@@ -40,17 +40,33 @@ def extract_text_from_docx(docx_path):
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 
 # Initialize the HuggingFaceEmbeddings for LangChain
-
+# Since we're not using it directly for index, initialization may be skipped here
 
 # Initialize the FAISS index
-
-
-
-
-
-
-
-
+class FAISSIndex:
+    def __init__(self, dimension):
+        self.dimension = dimension
+        self.index = faiss.IndexFlatL2(dimension)
+
+    def add_sentences(self, sentences, embeddings):
+        # Ensure embeddings are numpy arrays
+        embeddings = np.array(embeddings)
+
+        # Check if embeddings and sentences have the same length
+        assert len(embeddings) == len(sentences), "Number of embeddings should match number of sentences"
+
+        # Add each sentence embedding to the index
+        for emb in embeddings:
+            self.index.add(np.expand_dims(emb, axis=0))
+
+    def similarity_search(self, query_embedding, k=5):
+        # Search for similar embeddings in the index
+        D, I = self.index.search(query_embedding, k)
+        return [{"text": str(i), "score": float(d)} for i, d in zip(I[0], D[0])]
+
+# Initialize the FAISS index instance
+index_dimension = 512  # Dimensionality of SentenceTransformer embeddings
+faiss_index = FAISSIndex(index_dimension)
 
 def preprocess_text(text):
     sentences = sent_tokenize(text)
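Two things worth flagging in the new FAISSIndex. First, index_dimension is hard-coded to 512, but all-MiniLM-L6-v2 returns 384-dimensional vectors, so adding them to IndexFlatL2(512) would fail; the dimension can be read from the model instead. Second, the index stores only vectors, so similarity_search returns FAISS row ids rendered as strings rather than the retrieved text that retrieved_passages later expects. A minimal sketch of one way to address both (hypothetical code, not what the commit does) keeps the original sentences next to the index:

class FAISSIndex:
    """Flat L2 index that keeps the original sentences so searches can return text."""

    def __init__(self, dimension):
        self.index = faiss.IndexFlatL2(dimension)
        self.sentences = []  # row i of the index corresponds to self.sentences[i]

    def add_sentences(self, sentences, embeddings):
        embeddings = np.asarray(embeddings, dtype="float32")
        assert len(embeddings) == len(sentences), "one embedding per sentence"
        self.index.add(embeddings)          # faiss accepts the whole (n, dim) batch at once
        self.sentences.extend(sentences)

    def similarity_search(self, query_embedding, k=5):
        query = np.asarray(query_embedding, dtype="float32")
        D, I = self.index.search(query, k)
        return [{"text": self.sentences[i], "score": float(d)} for i, d in zip(I[0], D[0])]

# The model's true output size can be read at runtime instead of hard-coding 512:
# embedding_model.get_sentence_embedding_dimension() returns 384 for all-MiniLM-L6-v2.

Since faiss takes a whole float32 batch in a single add() call, the per-row loop in the committed version is not needed, though it is not incorrect.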
@@ -72,13 +88,14 @@ def upload_files(files):
     # Preprocess text
     sentences = preprocess_text(text)
 
-    # Encode sentences
+    # Encode sentences
     embeddings = embedding_model.encode(sentences)
-
-
+
+    # Add sentences to FAISS index
+    faiss_index.add_sentences(sentences, embeddings)
 
     # Save the updated index
-    with open(
+    with open("faiss_index.pkl", "wb") as f:
         pickle.dump(faiss_index, f)
 
     return {"message": "Files processed successfully"}
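On persistence: FAISSIndex wraps a SWIG-backed faiss object, and depending on the faiss version pickling the whole instance may not round-trip cleanly. faiss ships its own serializer, which is the usual route. A hedged sketch (the file names and the sentences attribute come from the sketch above, not from the commit):

# Persist the raw faiss index with faiss's own serializer rather than pickle,
# and keep the sentence list in a separate pickle if needed.
faiss.write_index(faiss_index.index, "faiss_index.faiss")
with open("faiss_sentences.pkl", "wb") as f:
    pickle.dump(faiss_index.sentences, f)

# Reloading later:
index = faiss.read_index("faiss_index.faiss")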
@@ -97,7 +114,7 @@ def process_and_query(state, files, question):
     question_embedding = embedding_model.encode([question])
 
     # Search the FAISS index for similar passages
-    retrieved_results = faiss_index.similarity_search(
+    retrieved_results = faiss_index.similarity_search(question_embedding, k=5)  # Retrieve top 5 passages
     retrieved_passages = [result['text'] for result in retrieved_results]
 
     # Initialize RAG generator model
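For the query path: encode([question]) already returns a float32 array of shape (1, dim), which is what IndexFlatL2.search expects, so the new similarity_search call lines up shape-wise; whether result['text'] holds actual passages still depends on how rows are mapped back to sentences. A small usage sketch (the question is made up):

question_embedding = embedding_model.encode(["What does the uploaded document say about refunds?"])
results = faiss_index.similarity_search(question_embedding, k=5)
retrieved_passages = [r["text"] for r in results]   # real text only if the index stores sentences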
@@ -136,4 +153,3 @@ with gr.Blocks() as demo:
     query_button.click(fn=process_and_query, inputs=[query], outputs=query_output)
 
 demo.launch()
-