NaimaAqeel committed on
Commit
4f4ccbd
·
verified ·
1 Parent(s): 18ab537

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -26
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
  from docx import Document
3
  from sentence_transformers import SentenceTransformer
4
  import faiss
@@ -30,6 +31,18 @@ def extract_text_from_docx(docx_path):
30
  print(f"Error extracting text from DOCX: {e}")
31
  return text
32
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  # Load or create FAISS index
34
  index_path = "faiss_index.pkl"
35
  document_texts_path = "document_texts.pkl"
@@ -64,13 +77,17 @@ def upload_files(files):
64
  file_path = file.name # Get the file path from the NamedString object
65
  if file_path.endswith('.docx'):
66
  text = extract_text_from_docx(file_path)
67
-
68
- # Process the text and update FAISS index
69
- sentences = text.split("\n")
70
- sentences = [preprocess_text(sentence) for sentence in sentences if sentence.strip()]
71
- embeddings = embedding_model.encode(sentences)
72
- index.add(np.array(embeddings))
73
- document_texts.extend(sentences) # Store sentences for retrieval
 
 
 
 
74
 
75
  # Save the updated index and documents
76
  with open(index_path, "wb") as f:
@@ -110,14 +127,16 @@ def query_text(text):
110
  return f"Error querying text: {e}"
111
 
112
  def main():
113
- upload_interface = gr.Interface(
 
114
  fn=upload_files,
115
- inputs=gr.File(file_count="multiple", label="Upload DOCX files"),
116
  outputs="text",
117
- title="Upload DOCX Files",
118
- description="Upload DOCX files to process and add to the FAISS index."
119
  )
120
 
 
121
  query_interface = gr.Interface(
122
  fn=query_text,
123
  inputs="text",
@@ -126,22 +145,9 @@ def main():
126
  description="Query the indexed text and get answers from the language model."
127
  )
128
 
129
- demo = gr.TabbedInterface([upload_interface, query_interface], ["Upload Files", "Query Text"])
 
130
  demo.launch()
131
 
132
  if __name__ == "__main__":
133
  main()
134
-
135
-
136
-
137
-
138
-
139
-
140
-
141
-
142
-
143
-
144
-
145
-
146
-
147
-
 
1
  import os
2
+ import fitz # PyMuPDF
3
  from docx import Document
4
  from sentence_transformers import SentenceTransformer
5
  import faiss
 
31
  print(f"Error extracting text from DOCX: {e}")
32
  return text
33
 
34
# Function to extract text from a PDF document
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Extraction is best-effort: if the document cannot be opened or a page
    fails to load, the error is printed and whatever text was collected
    up to that point is returned (an empty string if nothing was read).

    Args:
        pdf_path: Path of the PDF file, passed directly to ``fitz.open``.

    Returns:
        The text of all successfully read pages, joined in page order.
    """
    page_texts = []
    try:
        # Open via the context manager so the document (and its file handle)
        # is closed even when a page raises part-way through.
        with fitz.open(pdf_path) as pdf_document:
            for page_num in range(pdf_document.page_count):
                page = pdf_document.load_page(page_num)
                page_texts.append(page.get_text())
    except Exception as e:
        # Deliberately broad: one unreadable upload must not abort the caller;
        # report the problem and fall through with the partial result.
        print(f"Error extracting text from PDF: {e}")
    return "".join(page_texts)
45
+
46
  # Load or create FAISS index
47
  index_path = "faiss_index.pkl"
48
  document_texts_path = "document_texts.pkl"
 
77
  file_path = file.name # Get the file path from the NamedString object
78
  if file_path.endswith('.docx'):
79
  text = extract_text_from_docx(file_path)
80
+ elif file_path.endswith('.pdf'):
81
+ text = extract_text_from_pdf(file_path)
82
+ else:
83
+ continue
84
+
85
+ # Process the text and update FAISS index
86
+ sentences = text.split("\n")
87
+ sentences = [preprocess_text(sentence) for sentence in sentences if sentence.strip()]
88
+ embeddings = embedding_model.encode(sentences)
89
+ index.add(np.array(embeddings))
90
+ document_texts.extend(sentences) # Store sentences for retrieval
91
 
92
  # Save the updated index and documents
93
  with open(index_path, "wb") as f:
 
127
  return f"Error querying text: {e}"
128
 
129
  def main():
130
+ # Gradio interface for uploading files
131
+ file_upload_interface = gr.Interface(
132
  fn=upload_files,
133
+ inputs=gr.File(file_count="multiple", label="Upload DOCX or PDF files"),
134
  outputs="text",
135
+ title="Upload Files",
136
+ description="Upload DOCX or PDF files to process and add to the FAISS index."
137
  )
138
 
139
+ # Gradio interface for querying text
140
  query_interface = gr.Interface(
141
  fn=query_text,
142
  inputs="text",
 
145
  description="Query the indexed text and get answers from the language model."
146
  )
147
 
148
+ # Create a tabbed interface
149
+ demo = gr.TabbedInterface([file_upload_interface, query_interface], ["Upload Files", "Query Text"])
150
  demo.launch()
151
 
152
  if __name__ == "__main__":
153
  main()