Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -179,101 +179,74 @@ def generate_output(context, query):
|
|
179 |
# context_data, combined_context = search_documents(query)
|
180 |
|
181 |
# document_titles = list({os.path.basename(doc["title"]) for doc in context_data}) # Get only file names
|
182 |
-
|
183 |
# formatted_titles = " " + "\n".join(document_titles)
|
184 |
|
|
|
|
|
185 |
# results = {
|
186 |
# "results": [
|
187 |
# {
|
188 |
# "natural_language_output": generate_output(doc["relevant_text"], query),
|
189 |
-
# "doc_id": doc["doc_id"],
|
190 |
# "chunk_id": doc["chunk_id"],
|
|
|
191 |
# "title": doc["title"],
|
192 |
# "relevant_text": doc["relevant_text"],
|
193 |
# "page_number": doc["page_number"],
|
194 |
# "score": doc["score"],
|
195 |
# }
|
196 |
# for doc in context_data
|
197 |
-
# ]
|
|
|
198 |
# }
|
199 |
|
200 |
# return results, formatted_titles # Return results and formatted document titles
|
201 |
# except Exception as e:
|
202 |
-
# return {"results": []}, f"Error in workflow: {str(e)}"
|
|
|
203 |
|
204 |
def complete_workflow(query):
|
205 |
try:
|
206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
|
208 |
-
|
209 |
-
|
|
|
|
|
|
|
|
|
|
|
210 |
|
211 |
-
|
|
|
|
|
212 |
|
213 |
results = {
|
214 |
"results": [
|
215 |
{
|
216 |
-
"natural_language_output":
|
217 |
"chunk_id": doc["chunk_id"],
|
218 |
-
"document_id": doc["doc_id"],
|
219 |
"title": doc["title"],
|
220 |
"relevant_text": doc["relevant_text"],
|
221 |
"page_number": doc["page_number"],
|
222 |
"score": doc["score"],
|
|
|
223 |
}
|
224 |
for doc in context_data
|
225 |
],
|
226 |
-
"total_results":
|
227 |
}
|
228 |
|
229 |
-
return results, formatted_titles # Return results and formatted document titles
|
230 |
except Exception as e:
|
231 |
return {"results": [], "total_results": 0}, f"Error in workflow: {str(e)}"
|
232 |
|
233 |
-
|
234 |
-
# def complete_workflow(query):
|
235 |
-
# try:
|
236 |
-
# # 🔹 Step 1: Perform Hybrid Search (Vector + BM25)
|
237 |
-
# context_data, combined_context = hybrid_search_documents(query)
|
238 |
-
|
239 |
-
# # 🔹 Step 2: Generate LLM-based Natural Language Output
|
240 |
-
# llm = ChatOpenAI(model="gpt-4", openai_api_key=openai.api_key, temperature=0.7)
|
241 |
-
# prompt_template = """
|
242 |
-
# Use the following context to answer the question as accurately as possible:
|
243 |
-
|
244 |
-
# Context: {context}
|
245 |
-
# Question: {question}
|
246 |
-
|
247 |
-
# Answer:
|
248 |
-
# """
|
249 |
-
# prompt = prompt_template.format(context=combined_context, question=query)
|
250 |
-
# response = llm([HumanMessage(content=prompt)])
|
251 |
-
|
252 |
-
# # 🔹 Step 3: Format Results
|
253 |
-
# document_titles = list({os.path.basename(doc["title"]) for doc in context_data}) # Extract unique file names
|
254 |
-
# formatted_titles = "\n".join(document_titles)
|
255 |
-
|
256 |
-
# results = {
|
257 |
-
# "results": [
|
258 |
-
# {
|
259 |
-
# "natural_language_output": response.content,
|
260 |
-
# "chunk_id": doc["chunk_id"],
|
261 |
-
# "document_id": doc["doc_id"],
|
262 |
-
# "title": doc["title"],
|
263 |
-
# "relevant_text": doc["relevant_text"],
|
264 |
-
# "page_number": doc["page_number"],
|
265 |
-
# "score": doc["score"],
|
266 |
-
# "method": doc["method"], # "vector" or "bm25"
|
267 |
-
# }
|
268 |
-
# for doc in context_data
|
269 |
-
# ],
|
270 |
-
# "total_results": len(context_data), # Return total number of retrieved results
|
271 |
-
# }
|
272 |
-
|
273 |
-
# return results, formatted_titles # Return both results and formatted document titles
|
274 |
-
# except Exception as e:
|
275 |
-
# return {"results": [], "total_results": 0}, f"Error in workflow: {str(e)}"
|
276 |
-
|
277 |
def gradio_app():
|
278 |
with gr.Blocks(css=".result-output {width: 150%; font-size: 16px; padding: 10px;}") as app:
|
279 |
gr.Markdown("### Intelligent Document Search Prototype-v0.1.2 ")
|
|
|
179 |
# context_data, combined_context = search_documents(query)
|
180 |
|
181 |
# document_titles = list({os.path.basename(doc["title"]) for doc in context_data}) # Get only file names
|
|
|
182 |
# formatted_titles = " " + "\n".join(document_titles)
|
183 |
|
184 |
+
# total_results = len(context_data) # Count the total number of results
|
185 |
+
|
186 |
# results = {
|
187 |
# "results": [
|
188 |
# {
|
189 |
# "natural_language_output": generate_output(doc["relevant_text"], query),
|
|
|
190 |
# "chunk_id": doc["chunk_id"],
|
191 |
+
# "document_id": doc["doc_id"], # Assuming doc_id is the UUID
|
192 |
# "title": doc["title"],
|
193 |
# "relevant_text": doc["relevant_text"],
|
194 |
# "page_number": doc["page_number"],
|
195 |
# "score": doc["score"],
|
196 |
# }
|
197 |
# for doc in context_data
|
198 |
+
# ],
|
199 |
+
# "total_results": total_results # Added total_results field
|
200 |
# }
|
201 |
|
202 |
# return results, formatted_titles # Return results and formatted document titles
|
203 |
# except Exception as e:
|
204 |
+
# return {"results": [], "total_results": 0}, f"Error in workflow: {str(e)}"
|
205 |
+
|
206 |
|
207 |
def complete_workflow(query):
    """Run the full retrieve-then-generate pipeline for a user query.

    Parameters
    ----------
    query : str
        The user's natural-language search query.

    Returns
    -------
    tuple[dict, str]
        ``(results, formatted_titles)`` where ``results`` is
        ``{"results": [...], "total_results": int}`` and
        ``formatted_titles`` is a newline-separated string of the unique
        source file names. On any failure an empty payload and an error
        message string are returned instead (the UI relies on this shape).
    """
    try:
        # Step 1: hybrid retrieval (vector + BM25) over the document store.
        context_data, combined_context = hybrid_search_documents(query)

        # Step 2: generate one LLM answer grounded in the combined context.
        llm = ChatOpenAI(model="gpt-4", openai_api_key=openai.api_key, temperature=0.7)
        prompt_template = """
        Use the following context to answer the question as accurately as possible:

        Context: {context}
        Question: {question}

        Answer:
        """
        prompt = prompt_template.format(context=combined_context, question=query)
        response = llm([HumanMessage(content=prompt)])

        # Step 3: format results. Titles are de-duplicated by file name via a
        # set, so their order is unspecified.
        document_titles = list({os.path.basename(doc["title"]) for doc in context_data})
        formatted_titles = "\n".join(document_titles)

        # NOTE(review): every result row carries the SAME generated answer
        # (one LLM call for the whole context) — confirm this is intended
        # rather than per-chunk generation.
        results = {
            "results": [
                {
                    "natural_language_output": response.content,
                    "chunk_id": doc["chunk_id"],
                    "document_id": doc["doc_id"],
                    "title": doc["title"],
                    "relevant_text": doc["relevant_text"],
                    "page_number": doc["page_number"],
                    "score": doc["score"],
                    "method": doc["method"],  # "vector" or "bm25"
                }
                for doc in context_data
            ],
            # Total number of retrieved chunks, for the UI result counter.
            "total_results": len(context_data),
        }

        return results, formatted_titles
    except Exception as e:
        # Fail soft: callers always receive the same (payload, text) shape,
        # with the error surfaced in the text slot instead of raising.
        return {"results": [], "total_results": 0}, f"Error in workflow: {str(e)}"
|
249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
250 |
def gradio_app():
|
251 |
with gr.Blocks(css=".result-output {width: 150%; font-size: 16px; padding: 10px;}") as app:
|
252 |
gr.Markdown("### Intelligent Document Search Prototype-v0.1.2 ")
|