NaimaAqeel committed on
Commit
ba470cd
·
verified ·
1 Parent(s): 261cad3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -53
app.py CHANGED
@@ -14,16 +14,23 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
14
  # Function to extract text from a PDF file
15
  def extract_text_from_pdf(pdf_path):
16
  text = ""
17
- doc = fitz.open(pdf_path)
18
- for page_num in range(len(doc)):
19
- page = doc.load_page(page_num)
20
- text += page.get_text()
 
 
 
21
  return text
22
 
23
  # Function to extract text from a Word document
24
  def extract_text_from_docx(docx_path):
25
- doc = Document(docx_path)
26
- text = "\n".join([para.text for para in doc.paragraphs])
 
 
 
 
27
  return text
28
 
29
  # Initialize the embedding model
@@ -47,20 +54,22 @@ embedding = HuggingFaceEmbeddings()
47
  index_path = "faiss_index.pkl"
48
  document_texts_path = "document_texts.pkl"
49
 
 
 
50
  if os.path.exists(index_path):
51
- with open(index_path, "rb") as f:
52
- index = pickle.load(f)
53
- print("Loaded FAISS index from faiss_index.pkl")
54
- if os.path.exists(document_texts_path):
55
- with open(document_texts_path, "rb") as f:
56
- document_texts = pickle.load(f)
57
- print("Loaded document texts from document_texts.pkl")
58
- else:
59
- document_texts = []
 
60
  else:
61
  # Create a new FAISS index if it doesn't exist
62
  index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
63
- document_texts = []
64
  with open(index_path, "wb") as f:
65
  pickle.dump(index, f)
66
  print("Created new FAISS index and saved to faiss_index.pkl")
@@ -68,49 +77,60 @@ else:
68
  def upload_files(files):
69
  global index, document_texts
70
  for file in files:
71
- content = file.read()
72
- if file.name.endswith('.pdf'):
73
- with open("temp.pdf", "wb") as f:
74
- f.write(content)
75
- text = extract_text_from_pdf("temp.pdf")
76
- elif file.name.endswith('.docx'):
77
- with open("temp.docx", "wb") as f:
78
- f.write(content)
79
- text = extract_text_from_docx("temp.docx")
80
- else:
81
- return "Unsupported file format"
82
-
83
- # Process the text and update FAISS index
84
- sentences = text.split("\n")
85
- embeddings = embedding_model.encode(sentences)
86
- index.add(np.array(embeddings))
87
- document_texts.append(text)
 
 
 
 
88
 
89
  # Save the updated index and documents
90
- with open(index_path, "wb") as f:
91
- pickle.dump(index, f)
92
- print("Saved updated FAISS index to faiss_index.pkl")
93
- with open(document_texts_path, "wb") as f:
94
- pickle.dump(document_texts, f)
95
- print("Saved updated document texts to document_texts.pkl")
 
 
 
 
96
 
97
  return "Files processed successfully"
98
 
99
  def query_text(text):
100
- # Encode the query text
101
- query_embedding = embedding_model.encode([text])
102
-
103
- # Search the FAISS index
104
- D, I = index.search(np.array(query_embedding), k=5)
105
-
106
- top_documents = []
107
- for idx in I[0]:
108
- if idx != -1 and idx < len(document_texts): # Ensure that a valid index is found
109
- top_documents.append(document_texts[idx])
110
- else:
111
- print(f"Invalid index found: {idx}")
112
-
113
- return top_documents
 
 
 
114
 
115
  # Create Gradio interface
116
  with gr.Blocks() as demo:
@@ -131,6 +151,7 @@ with gr.Blocks() as demo:
131
  demo.launch()
132
 
133
 
 
134
 
135
 
136
 
 
14
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        The concatenated page text. On any extraction error the error is
        printed and whatever text was gathered so far (possibly "") is
        returned, matching the module's best-effort error style.
    """
    text = ""
    try:
        # Use the context manager so the document handle is closed even if
        # a page fails to render (the original leaked the open handle).
        with fitz.open(pdf_path) as doc:
            for page in doc:  # fitz.Document iterates its pages directly
                text += page.get_text()
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
    return text
25
 
26
# Function to extract text from a Word document
def extract_text_from_docx(docx_path):
    """Return the text of a .docx file, one paragraph per line.

    On failure the error is printed and an empty string is returned,
    consistent with the module's best-effort extraction helpers.
    """
    extracted = ""
    try:
        document = Document(docx_path)
        extracted = "\n".join(paragraph.text for paragraph in document.paragraphs)
    except Exception as e:
        print(f"Error extracting text from DOCX: {e}")
    return extracted
35
 
36
  # Initialize the embedding model
 
54
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"

# In-memory copies of the persisted state; populated below.
document_texts = []
index = None

if os.path.exists(index_path):
    try:
        with open(index_path, "rb") as f:
            index = pickle.load(f)
        print("Loaded FAISS index from faiss_index.pkl")
        if os.path.exists(document_texts_path):
            with open(document_texts_path, "rb") as f:
                document_texts = pickle.load(f)
            print("Loaded document texts from document_texts.pkl")
    except Exception as e:
        # Best-effort load: fall through to creating a fresh index below
        # (the original left `index` unbound here, causing a NameError later).
        print(f"Error loading FAISS index or document texts: {e}")

if index is None:
    # Create a new FAISS index if it doesn't exist (or failed to load)
    index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
    with open(index_path, "wb") as f:
        pickle.dump(index, f)
    print("Created new FAISS index and saved to faiss_index.pkl")
 
77
def upload_files(files):
    """Index uploaded PDF/DOCX files into the global FAISS index.

    Each file is written to a temporary path, its text extracted, split on
    newlines, embedded, and added to the index; the index and document
    texts are then persisted to disk.

    Args:
        files: Uploaded file objects exposing ``.read()`` and ``.name``.

    Returns:
        A status string: "Files processed successfully" on success, or an
        error/unsupported-format message on the first failure.
    """
    global index, document_texts
    for file in files:
        try:
            content = file.read()
            if file.name.endswith('.pdf'):
                temp_path = "temp.pdf"
                with open(temp_path, "wb") as f:
                    f.write(content)
                try:
                    text = extract_text_from_pdf(temp_path)
                finally:
                    # Don't litter the working directory with temp files
                    # (the original never deleted them).
                    os.remove(temp_path)
            elif file.name.endswith('.docx'):
                temp_path = "temp.docx"
                with open(temp_path, "wb") as f:
                    f.write(content)
                try:
                    text = extract_text_from_docx(temp_path)
                finally:
                    os.remove(temp_path)
            else:
                return "Unsupported file format"

            # Process the text and update FAISS index
            sentences = text.split("\n")
            embeddings = embedding_model.encode(sentences)
            index.add(np.array(embeddings))
            document_texts.append(text)
        except Exception as e:
            print(f"Error processing file {file.name}: {e}")
            return f"Error processing file {file.name}: {e}"

    # Save the updated index and documents
    try:
        with open(index_path, "wb") as f:
            pickle.dump(index, f)
        print("Saved updated FAISS index to faiss_index.pkl")
        with open(document_texts_path, "wb") as f:
            pickle.dump(document_texts, f)
        print("Saved updated document texts to document_texts.pkl")
    except Exception as e:
        print(f"Error saving FAISS index or document texts: {e}")
        return f"Error saving FAISS index or document texts: {e}"

    return "Files processed successfully"
115
 
116
def query_text(text):
    """Return up to five stored documents most similar to *text*.

    On failure the error is printed and an error string is returned
    instead of a list, matching the module's error-reporting style.
    """
    try:
        # Embed the query so it can be compared against the stored vectors
        query_embedding = embedding_model.encode([text])

        # Retrieve the five nearest neighbours from the FAISS index
        distances, neighbour_ids = index.search(np.array(query_embedding), k=5)

        top_documents = []
        for candidate in neighbour_ids[0]:
            # FAISS pads missing results with -1; also guard against ids
            # beyond the documents we actually hold.
            if candidate != -1 and candidate < len(document_texts):
                top_documents.append(document_texts[candidate])
            else:
                print(f"Invalid index found: {candidate}")
        return top_documents
    except Exception as e:
        print(f"Error querying text: {e}")
        return f"Error querying text: {e}"
134
 
135
  # Create Gradio interface
136
  with gr.Blocks() as demo:
 
151
  demo.launch()
152
 
153
 
154
+
155
 
156
 
157