NaimaAqeel committed on
Commit
a028e27
·
verified ·
1 Parent(s): bcac0e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -1
app.py CHANGED
@@ -62,4 +62,66 @@ def extract_text_from_docx(docx_path):
62
  doc = Document(docx_path)
63
  text = "\n".join([para.text for para in doc.paragraphs])
64
  except Exception as e:
65
- print
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  doc = Document(docx_path)
63
  text = "\n".join([para.text for para in doc.paragraphs])
64
  except Exception as e:
65
+ print(f"DOCX error: {e}")
66
+ return text
67
+
68
+ # =============================================
69
+ # DOCUMENT UPLOAD HANDLER
70
+ # =============================================
71
def upload_document(file):
    """Extract text from an uploaded PDF/DOCX, index it, and persist the result.

    Args:
        file: The value delivered by the ``gr.File`` input. Handled forms are
            an object exposing the file path as ``.name``, a plain path
            string, or ``None`` when nothing was uploaded.
            NOTE(review): which form arrives depends on the installed Gradio
            version/configuration -- confirm against the deployment.

    Returns:
        A human-readable status message for the UI.
    """
    if file is None:
        return "No file uploaded."

    # Accept both a bare path string and a wrapper that carries the path in
    # ``.name`` so the handler does not raise AttributeError on either form.
    path = file if isinstance(file, str) else file.name

    ext = os.path.splitext(path)[-1].lower()
    if ext == ".pdf":
        text = extract_text_from_pdf(path)
    elif ext == ".docx":
        text = extract_text_from_docx(path)
    else:
        return "Unsupported file type"

    # Do not index (or persist) a document that yielded no usable text;
    # it would only add an entry that can never be a meaningful match.
    if not text or not text.strip():
        return "No text could be extracted from the document."

    embedding = get_embeddings(text)
    index.add(embedding)
    document_texts.append(text)

    # Save updated index and texts so they are written out after every upload.
    with open(index_path, "wb") as f:
        pickle.dump(index, f)
    with open(document_texts_path, "wb") as f:
        pickle.dump(document_texts, f)

    return "Document uploaded and indexed successfully!"
91
+
92
+ # =============================================
93
+ # SEMANTIC SEARCH HANDLER
94
+ # =============================================
95
def search_documents(query):
    """Return a markdown snippet of the indexed document closest to ``query``.

    Args:
        query: Free-text search string from the UI (may be empty or ``None``).

    Returns:
        A markdown string holding up to the first 1000 characters of the best
        matching document, or a short status message when the query is empty,
        nothing is indexed, or no valid match is returned.
    """
    if not query or not query.strip():
        return "Please enter a search query."
    if not document_texts:
        return "No documents indexed yet."

    max_chars = 1000

    query_vector = get_embeddings(query)
    _, indices = index.search(query_vector, k=1)
    best_match_idx = int(indices[0][0])

    # NOTE(review): ``index`` is presumably a FAISS index, whose search pads
    # missing results with -1 -- confirm. Without this check a negative index
    # would silently pick the *last* document via Python negative indexing,
    # and an out-of-sync index would raise IndexError.
    if not 0 <= best_match_idx < len(document_texts):
        return "No matching document found."

    match_text = document_texts[best_match_idx]
    # Only show the ellipsis when the text was actually cut off.
    suffix = "..." if len(match_text) > max_chars else ""
    return f"**Best Match:**\n\n{match_text[:max_chars]}{suffix}"
104
+
105
+ # =============================================
106
+ # GRADIO INTERFACE
107
+ # =============================================
108
# Upload tab: a single file picker restricted to the supported extensions.
document_input = gr.File(file_types=[".pdf", ".docx"])
upload_interface = gr.Interface(
    title="Upload PDF/DOCX",
    description="Upload a PDF or Word document to be indexed for semantic search.",
    fn=upload_document,
    inputs=document_input,
    outputs="text",
)

# Search tab: free-text query in, markdown-rendered best match out.
query_input = gr.Textbox(placeholder="Enter your question or search query here...")
search_interface = gr.Interface(
    title="Semantic Search",
    description="Search for content in uploaded documents using natural language.",
    fn=search_documents,
    inputs=query_input,
    outputs="markdown",
)

# Combine both interfaces into one tabbed app; tab order matches list order.
app = gr.TabbedInterface(
    interface_list=[upload_interface, search_interface],
    tab_names=["Upload Document", "Search Document"],
)

# Launch the UI only when this file is executed directly.
if __name__ == "__main__":
    app.launch()