NaimaAqeel committed on
Commit
98c11b9
·
verified ·
1 Parent(s): 2e2f2cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -33
app.py CHANGED
@@ -1,7 +1,8 @@
1
  import os
 
 
2
  import gradio as gr
3
  from docx import Document
4
- import fitz # PyMuPDF for PDF text extraction
5
  from sentence_transformers import SentenceTransformer
6
  from langchain_community.vectorstores import FAISS
7
  from langchain_community.embeddings import HuggingFaceEmbeddings
@@ -53,24 +54,24 @@ state = {
53
  "sentences": []
54
  }
55
 
56
- def extract_text_from_pdf(pdf_path):
57
  text = ""
58
  try:
59
- doc = fitz.open(pdf_path)
60
- for page_num in range(len(doc)):
61
- page = doc.load_page(page_num)
62
- text += page.get_text()
63
  except Exception as e:
64
- raise RuntimeError(f"Error extracting text from PDF '{pdf_path}': {e}")
65
  return text
66
 
67
- def extract_text_from_docx(docx_path):
68
  text = ""
69
  try:
70
- doc = Document(docx_path)
71
  text = "\n".join([para.text for para in doc.paragraphs])
72
  except Exception as e:
73
- raise RuntimeError(f"Error extracting text from DOCX '{docx_path}': {e}")
74
  return text
75
 
76
  def preprocess_text(text):
@@ -81,28 +82,18 @@ def upload_files(files):
81
  global state, faiss_index
82
  try:
83
  for file in files:
84
- try:
85
- if isinstance(file, str):
86
- file_path = file
87
- else:
88
- file_path = file.name
89
-
90
- if file_path.endswith('.pdf'):
91
- text = extract_text_from_pdf(file_path)
92
- elif file_path.endswith('.docx'):
93
- text = extract_text_from_docx(file_path)
94
- else:
95
- return {"error": f"Unsupported file format: {file_path}"}
96
-
97
- sentences = preprocess_text(text)
98
- embeddings = embedding_model.encode(sentences)
99
-
100
- faiss_index.add(np.array(embeddings).astype(np.float32)) # Add embeddings
101
- state["sentences"].extend(sentences)
102
-
103
- except Exception as e:
104
- print(f"Error processing file '{file}': {e}")
105
- return {"error": str(e)}
106
 
107
  # Save the updated index
108
  faiss.write_index(faiss_index, index_path)
@@ -110,7 +101,7 @@ def upload_files(files):
110
  return {"message": "Files processed successfully"}
111
 
112
  except Exception as e:
113
- print(f"General error processing files: {e}")
114
  return {"error": str(e)}
115
 
116
  def process_and_query(question):
 
1
  import os
2
+ import io
3
+ import PyPDF2
4
  import gradio as gr
5
  from docx import Document
 
6
  from sentence_transformers import SentenceTransformer
7
  from langchain_community.vectorstores import FAISS
8
  from langchain_community.embeddings import HuggingFaceEmbeddings
 
54
  "sentences": []
55
  }
56
 
57
+ def extract_text_from_pdf(file):
58
  text = ""
59
  try:
60
+ pdf_data = file.read()
61
+ pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_data))
62
+ pdf_pages = pdf_reader.pages
63
+ text = "\n\n".join(page.extract_text() for page in pdf_pages)
64
  except Exception as e:
65
+ raise RuntimeError(f"Error extracting text from PDF: {e}")
66
  return text
67
 
68
+ def extract_text_from_docx(file):
69
  text = ""
70
  try:
71
+ doc = Document(file)
72
  text = "\n".join([para.text for para in doc.paragraphs])
73
  except Exception as e:
74
+ raise RuntimeError(f"Error extracting text from DOCX: {e}")
75
  return text
76
 
77
  def preprocess_text(text):
 
82
  global state, faiss_index
83
  try:
84
  for file in files:
85
+ if file.name.endswith('.pdf'):
86
+ text = extract_text_from_pdf(file)
87
+ elif file.name.endswith('.docx'):
88
+ text = extract_text_from_docx(file)
89
+ else:
90
+ return {"error": f"Unsupported file format: {file.name}"}
91
+
92
+ sentences = preprocess_text(text)
93
+ embeddings = embedding_model.encode(sentences)
94
+
95
+ faiss_index.add(np.array(embeddings).astype(np.float32)) # Add embeddings
96
+ state["sentences"].extend(sentences)
 
 
 
 
 
 
 
 
 
 
97
 
98
  # Save the updated index
99
  faiss.write_index(faiss_index, index_path)
 
101
  return {"message": "Files processed successfully"}
102
 
103
  except Exception as e:
104
+ print(f"Error processing files: {e}")
105
  return {"error": str(e)}
106
 
107
  def process_and_query(question):