Spaces:

NaimaAqeel
/

Chatbot

Runtime error

App Files Files Community

NaimaAqeel committed on Jun 4, 2024

Commit

ba470cd

verified ·

1 Parent(s): 261cad3

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -53

app.py CHANGED Viewed

@@ -14,16 +14,23 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
 # Function to extract text from a PDF file
 def extract_text_from_pdf(pdf_path):
     text = ""
-    doc = fitz.open(pdf_path)
-    for page_num in range(len(doc)):
-        page = doc.load_page(page_num)
-        text += page.get_text()
     return text
 # Function to extract text from a Word document
 def extract_text_from_docx(docx_path):
-    doc = Document(docx_path)
-    text = "\n".join([para.text for para in doc.paragraphs])
     return text
 # Initialize the embedding model
@@ -47,20 +54,22 @@ embedding = HuggingFaceEmbeddings()
 index_path = "faiss_index.pkl"
 document_texts_path = "document_texts.pkl"
 if os.path.exists(index_path):
-    with open(index_path, "rb") as f:
-        index = pickle.load(f)
-        print("Loaded FAISS index from faiss_index.pkl")
-    if os.path.exists(document_texts_path):
-        with open(document_texts_path, "rb") as f:
-            document_texts = pickle.load(f)
-            print("Loaded document texts from document_texts.pkl")
-    else:
-        document_texts = []
 else:
     # Create a new FAISS index if it doesn't exist
     index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
-    document_texts = []
     with open(index_path, "wb") as f:
         pickle.dump(index, f)
         print("Created new FAISS index and saved to faiss_index.pkl")
@@ -68,49 +77,60 @@ else:
 def upload_files(files):
     global index, document_texts
     for file in files:
-        content = file.read()
-        if file.name.endswith('.pdf'):
-            with open("temp.pdf", "wb") as f:
-                f.write(content)
-            text = extract_text_from_pdf("temp.pdf")
-        elif file.name.endswith('.docx'):
-            with open("temp.docx", "wb") as f:
-                f.write(content)
-            text = extract_text_from_docx("temp.docx")
-        else:
-            return "Unsupported file format"
-        # Process the text and update FAISS index
-        sentences = text.split("\n")
-        embeddings = embedding_model.encode(sentences)
-        index.add(np.array(embeddings))
-        document_texts.append(text)
     # Save the updated index and documents
-    with open(index_path, "wb") as f:
-        pickle.dump(index, f)
-        print("Saved updated FAISS index to faiss_index.pkl")
-    with open(document_texts_path, "wb") as f:
-        pickle.dump(document_texts, f)
-        print("Saved updated document texts to document_texts.pkl")
     return "Files processed successfully"
 def query_text(text):
-    # Encode the query text
-    query_embedding = embedding_model.encode([text])
-    # Search the FAISS index
-    D, I = index.search(np.array(query_embedding), k=5)
-    top_documents = []
-    for idx in I[0]:
-        if idx != -1 and idx < len(document_texts):  # Ensure that a valid index is found
-            top_documents.append(document_texts[idx])
-        else:
-            print(f"Invalid index found: {idx}")
-    return top_documents
 # Create Gradio interface
 with gr.Blocks() as demo:
@@ -131,6 +151,7 @@ with gr.Blocks() as demo:
 demo.launch()

 # Function to extract text from a PDF file
 def extract_text_from_pdf(pdf_path):
     text = ""
+    try:
+        doc = fitz.open(pdf_path)
+        for page_num in range(len(doc)):
+            page = doc.load_page(page_num)
+            text += page.get_text()
+    except Exception as e:
+        print(f"Error extracting text from PDF: {e}")
     return text
 # Function to extract text from a Word document
 def extract_text_from_docx(docx_path):
+    text = ""
+    try:
+        doc = Document(docx_path)
+        text = "\n".join([para.text for para in doc.paragraphs])
+    except Exception as e:
+        print(f"Error extracting text from DOCX: {e}")
     return text
 # Initialize the embedding model
 index_path = "faiss_index.pkl"
 document_texts_path = "document_texts.pkl"
+document_texts = []
 if os.path.exists(index_path):
+    try:
+        with open(index_path, "rb") as f:
+            index = pickle.load(f)
+            print("Loaded FAISS index from faiss_index.pkl")
+        if os.path.exists(document_texts_path):
+            with open(document_texts_path, "rb") as f:
+                document_texts = pickle.load(f)
+                print("Loaded document texts from document_texts.pkl")
+    except Exception as e:
+        print(f"Error loading FAISS index or document texts: {e}")
 else:
     # Create a new FAISS index if it doesn't exist
     index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
     with open(index_path, "wb") as f:
         pickle.dump(index, f)
         print("Created new FAISS index and saved to faiss_index.pkl")
 def upload_files(files):
     global index, document_texts
     for file in files:
+        try:
+            content = file.read()
+            if file.name.endswith('.pdf'):
+                with open("temp.pdf", "wb") as f:
+                    f.write(content)
+                text = extract_text_from_pdf("temp.pdf")
+            elif file.name.endswith('.docx'):
+                with open("temp.docx", "wb") as f:
+                    f.write(content)
+                text = extract_text_from_docx("temp.docx")
+            else:
+                return "Unsupported file format"
+            # Process the text and update FAISS index
+            sentences = text.split("\n")
+            embeddings = embedding_model.encode(sentences)
+            index.add(np.array(embeddings))
+            document_texts.append(text)
+        except Exception as e:
+            print(f"Error processing file {file.name}: {e}")
+            return f"Error processing file {file.name}: {e}"
     # Save the updated index and documents
+    try:
+        with open(index_path, "wb") as f:
+            pickle.dump(index, f)
+            print("Saved updated FAISS index to faiss_index.pkl")
+        with open(document_texts_path, "wb") as f:
+            pickle.dump(document_texts, f)
+            print("Saved updated document texts to document_texts.pkl")
+    except Exception as e:
+        print(f"Error saving FAISS index or document texts: {e}")
+        return f"Error saving FAISS index or document texts: {e}"
     return "Files processed successfully"
 def query_text(text):
+    try:
+        # Encode the query text
+        query_embedding = embedding_model.encode([text])
+        # Search the FAISS index
+        D, I = index.search(np.array(query_embedding), k=5)
+        top_documents = []
+        for idx in I[0]:
+            if idx != -1 and idx < len(document_texts):  # Ensure that a valid index is found
+                top_documents.append(document_texts[idx])
+            else:
+                print(f"Invalid index found: {idx}")
+        return top_documents
+    except Exception as e:
+        print(f"Error querying text: {e}")
+        return f"Error querying text: {e}"
 # Create Gradio interface
 with gr.Blocks() as demo:
 demo.launch()