Spaces:

NaimaAqeel
/

Chatbot

Runtime error

App Files Files Community

NaimaAqeel committed on Jun 7, 2024

Commit

98c11b9

verified ·

1 Parent(s): 2e2f2cb

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -33

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import os
 import gradio as gr
 from docx import Document
-import fitz  # PyMuPDF for PDF text extraction
 from sentence_transformers import SentenceTransformer
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
@@ -53,24 +54,24 @@ state = {
     "sentences": []
 }
-def extract_text_from_pdf(pdf_path):
     text = ""
     try:
-        doc = fitz.open(pdf_path)
-        for page_num in range(len(doc)):
-            page = doc.load_page(page_num)
-            text += page.get_text()
     except Exception as e:
-        raise RuntimeError(f"Error extracting text from PDF '{pdf_path}': {e}")
     return text
-def extract_text_from_docx(docx_path):
     text = ""
     try:
-        doc = Document(docx_path)
         text = "\n".join([para.text for para in doc.paragraphs])
     except Exception as e:
-        raise RuntimeError(f"Error extracting text from DOCX '{docx_path}': {e}")
     return text
 def preprocess_text(text):
@@ -81,28 +82,18 @@ def upload_files(files):
     global state, faiss_index
     try:
         for file in files:
-            try:
-                if isinstance(file, str):
-                    file_path = file
-                else:
-                    file_path = file.name
-                if file_path.endswith('.pdf'):
-                    text = extract_text_from_pdf(file_path)
-                elif file_path.endswith('.docx'):
-                    text = extract_text_from_docx(file_path)
-                else:
-                    return {"error": f"Unsupported file format: {file_path}"}
-                sentences = preprocess_text(text)
-                embeddings = embedding_model.encode(sentences)
-                faiss_index.add(np.array(embeddings).astype(np.float32))  # Add embeddings
-                state["sentences"].extend(sentences)
-            except Exception as e:
-                print(f"Error processing file '{file}': {e}")
-                return {"error": str(e)}
         # Save the updated index
         faiss.write_index(faiss_index, index_path)
@@ -110,7 +101,7 @@ def upload_files(files):
         return {"message": "Files processed successfully"}
     except Exception as e:
-        print(f"General error processing files: {e}")
         return {"error": str(e)}
 def process_and_query(question):

 import os
+import io
+import PyPDF2
 import gradio as gr
 from docx import Document
 from sentence_transformers import SentenceTransformer
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
     "sentences": []
 }
+def extract_text_from_pdf(file):
     text = ""
     try:
+        pdf_data = file.read()
+        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_data))
+        pdf_pages = pdf_reader.pages
+        text = "\n\n".join(page.extract_text() for page in pdf_pages)
     except Exception as e:
+        raise RuntimeError(f"Error extracting text from PDF: {e}")
     return text
+def extract_text_from_docx(file):
     text = ""
     try:
+        doc = Document(file)
         text = "\n".join([para.text for para in doc.paragraphs])
     except Exception as e:
+        raise RuntimeError(f"Error extracting text from DOCX: {e}")
     return text
 def preprocess_text(text):
     global state, faiss_index
     try:
         for file in files:
+            if file.name.endswith('.pdf'):
+                text = extract_text_from_pdf(file)
+            elif file.name.endswith('.docx'):
+                text = extract_text_from_docx(file)
+            else:
+                return {"error": f"Unsupported file format: {file.name}"}
+            sentences = preprocess_text(text)
+            embeddings = embedding_model.encode(sentences)
+            faiss_index.add(np.array(embeddings).astype(np.float32))  # Add embeddings
+            state["sentences"].extend(sentences)
         # Save the updated index
         faiss.write_index(faiss_index, index_path)
         return {"message": "Files processed successfully"}
     except Exception as e:
+        print(f"Error processing files: {e}")
         return {"error": str(e)}
 def process_and_query(question):