manu committed on
Commit 1b5281e · verified · 1 Parent(s): d88ef28

Update app.py

Files changed (1)
  1. app.py +41 -33
app.py CHANGED
@@ -170,11 +170,11 @@ PDF pages:
 # Local Search (ColPali)
 # =============================
 
-def search(query: str, k: int = 5) -> List[int]:
+def image_search(query: str, k: int = 5) -> List[int]:
     """
     Search within a PDF document for the most relevant pages to answer a query and return the page indexes as a list.
     MCP tool description:
-    - name: mcp_test_search
+    - name: visual_deepsearch_image_search
     - description: Search within a PDF document for the most relevant pages to answer a query.
     - input_schema:
       type: object
@@ -212,7 +212,7 @@ def search_synthetize(query: str, k: int = 5) -> List[int]:
     """
     Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
     MCP tool description:
-    - name: mcp_test_search_synthetize
+    - name: visual_deepsearch_search_synthetize
    - description: Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
    - input_schema:
      type: object
@@ -226,13 +226,15 @@ def search_synthetize(query: str, k: int = 5) -> List[int]:
     Returns:
         ai_response (str): Text answer to the query grounded in content from the PDF, with citations (page numbers).
     """
-    top_k_indices = search(query, k)
+    top_k_indices = image_search(query, k)
     expanded = set(top_k_indices)
     for i in top_k_indices:
         expanded.add(i - 1)
         expanded.add(i + 1)
     expanded = {i for i in expanded if 0 <= i < len(images)}
     expanded = sorted(expanded)
+    expanded = expanded if len(expanded) < 20 else sorted(top_k_indices)
+
 
     # Build gallery results with 1-based page numbering
     results = []
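
The hunk above also introduces a cap on the neighbor expansion: each hit pulls in its adjacent pages, but if the expanded set grows to 20 or more pages the code falls back to the raw top-k hits. A minimal standalone sketch of that behavior (expand_with_neighbors and num_pages are illustrative names, not identifiers from app.py):

# Sketch of the neighbor-expansion + cap logic, assuming 0-based page indices.
def expand_with_neighbors(top_k_indices, num_pages, cap=20):
    expanded = set(top_k_indices)
    for i in top_k_indices:
        expanded.update((i - 1, i + 1))  # attach adjacent pages for context
    expanded = {i for i in expanded if 0 <= i < num_pages}  # clip to the valid page range
    # fall back to the raw hits when expansion would attach too many pages
    return sorted(expanded) if len(expanded) < cap else sorted(top_k_indices)

# expand_with_neighbors([3, 7], num_pages=10) -> [2, 3, 4, 6, 7, 8]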
@@ -268,12 +270,13 @@ def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
 
 SYSTEM1 = (
     """
-You are a PDF research agent with a single tool: mcp_test_search(query: string, k: int).
+You are a PDF research agent with a single tool: visual_deepsearch_image_search(query: string, k: int).
 Act iteratively:
-1) Split the user question into 1–4 focused sub-queries. Subqueries should be asked as natural language questions, not just keywords.
-2) For each sub-query, call mcp_test_search (k=5 by default; increase to up to 10 if you need to go deep).
-3) You will receive the output of mcp_test_search as a list of indices corresponding to page numbers. Stop generating once all the tool calls end. You will be fed the corresponding pages as images in a follow-up message.
-4) Stop early when confident; otherwise run new search calls using the tool to find additional missing information. Use up to 5 rounds of iterations and 20 searches in total. If info is missing, try to continue searching using new keywords and queries.
+1) Split the user question into 1–4 focused sub-queries. You can use the provided page images to help you ask relevant follow-up queries. Subqueries should be asked as natural language questions, not just keywords.
+2) For each sub-query, call visual_deepsearch_image_search (k=5 by default; increase to up to 10 if you need to go deep).
+3) You will receive the output of visual_deepsearch_image_search as a list of indices corresponding to page numbers. Print the page numbers out and stop generating. An external system will take over and convert the indices into images for you.
+4) Analyze the images received to find the information you were looking for. If you are confident that you have all the information needed for a complete response, stop early and provide a final answer. Otherwise run new search calls using the tool to find additional missing information.
+5) Repeat the process for up to 5 rounds of iterations and 20 searches in total. If info is missing, try to continue searching using new keywords and queries.
 
 Workflow:
 • Use ONLY the provided images for grounding and cite as (p.<page>).
@@ -286,10 +289,10 @@ Deliverable:
 
 
 SYSTEM2 = """
-You are a PDF research agent with a single tool: mcp_test_search_synthetize(query: string, k: int).
+You are a PDF research agent with a single tool: visual_deepsearch_search_synthetize(query: string, k: int).
 Act iteratively:
 1) Split the user question into 1–4 focused sub-queries. Subqueries should be asked as natural language questions, not just keywords.
-2) For each sub-query, call mcp_test_search_synthetize (k=5 by default; increase to up to 20 if you need to go deep).
+2) For each sub-query, call visual_deepsearch_search_synthetize (k=5 by default; increase to up to 20 if you need to go deep).
 3) Stop early when confident; otherwise refine and repeat, up to 4 iterations and 20 searches in total. If info is missing, try to continue searching using new keywords and queries.
 
 Grounding & citations:
@@ -325,11 +328,20 @@ def stream_agent(question: str,
     Multi-round streaming:
       • Seed: optional local ColPali search on the user question to attach initial pages.
       • Each round: open a GPT-5 stream with *attached images* (if any).
-      • If the model calls mcp_test_search and returns indices, we end the stream and
+      • If the model calls the tool and returns indices, we end the stream and
         start a NEW API call with previous_response_id + the requested pages attached.
     """
-    visual_reasoning = True if visual_reasoning=="Visual Reasoning" else False
-    allowed_tools = "mcp_test_search" if visual_reasoning else "mcp_test_search_synthetize"
+
+    # Optional seeding: attach some likely pages on round 1
+    try:
+        seed_indices = image_search(question, k=5) if visual_reasoning == "Seeded Visual Reasoning" else []
+    except Exception as e:
+        yield f"❌ Search failed: {e}", "", ""
+        return
+
+    visual_reasoning: bool = "Visual Reasoning" in visual_reasoning
+
+    allowed_tools = "visual_deepsearch_image_search" if visual_reasoning else "visual_deepsearch_search_synthetize"
     SYSTEM= SYSTEM1 if visual_reasoning else SYSTEM2
 
     if not api_key:
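
The docstring above describes the round-chaining pattern: each tool call ends the current stream, and the next round is a fresh Responses API call that chains on previous_response_id with the requested pages attached as images. Roughly, it looks like this (a sketch under assumptions, not the exact app.py code; the gpt-5 model name and placeholder image data are illustrative):

from openai import OpenAI

client = OpenAI()

# Round 1: system prompt + question; store=True persists state server-side.
first = client.responses.create(
    model="gpt-5",  # assumed model name; not shown in this diff
    input=[{"role": "user", "content": [{"type": "input_text", "text": "..."}]}],
    store=True,
)

# Round N+1: chain on the stored response and attach the requested pages as images.
followup = client.responses.create(
    model="gpt-5",
    previous_response_id=first.id,
    input=[{"role": "user", "content": [
        {"type": "input_text", "text": "Continue reasoning with the newly attached pages."},
        {"type": "input_image", "image_url": "data:image/png;base64,..."},
    ]}],
)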
@@ -342,12 +354,6 @@ def stream_agent(question: str,
 
     client = OpenAI(api_key=api_key)
 
-    # Optional seeding: attach some likely pages on round 1
-    try:
-        seed_indices = [] if visual_reasoning is False else search(question, k=5)
-    except Exception as e:
-        yield f"❌ Search failed: {e}", "", ""
-        return
 
     log_lines = ["Log", f"[seed] indices={seed_indices}"]
     prev_response_id: Optional[str] = None
@@ -386,9 +392,10 @@ def stream_agent(question: str,
             parts.append({"type": "input_text", "text": "Continue reasoning with the newly attached pages. Remember you should probably further query the search tool."})
 
         parts += _build_image_parts_from_indices(attached_indices)
-        if attached_indices:
-            pages_str = ", ".join(str(i + 1) for i in sorted(set(attached_indices)))
-            parts.append({"type": "input_text", "text": f"(Attached pages from round {round_idx}: {pages_str}). Ground your answer in these images, or query for new pages."})
+
+        # if attached_indices:
+        #     pages_str = ", ".join(str(i + 1) for i in sorted(set(attached_indices)))
+        #     parts.append({"type": "input_text", "text": f"(Attached pages from round {round_idx}: {pages_str}). Ground your answer in these images, or query for new pages."})
 
         # First call includes system; follow-ups use previous_response_id
         if prev_response_id:
@@ -404,7 +411,7 @@ def stream_agent(question: str,
                 input=req_input,
                 reasoning={"effort": "medium", "summary": "auto"},
                 tools=tools,
-                store=True,  # persist conversation state on server
+                store=True,
             )
             if prev_response_id:
                 req_kwargs["previous_response_id"] = prev_response_id
@@ -493,12 +500,13 @@ def stream_agent(question: str,
                 expanded.add(i - 1)
                 expanded.add(i + 1)
             expanded = {i for i in expanded if 0 <= i < len(images)}
-            pending_indices = sorted(expanded) if len(expanded) < 15 else sorted(base)
+            pending_indices = sorted(expanded) if len(expanded) < 20 else sorted(base)
             round_idx += 1
             continue
 
         # No further tool-driven retrieval → done
         break
+    print("Search Finished")
 
     return
 
@@ -567,14 +575,14 @@ body {background: radial-gradient(1200px 600px at 20% -10%, rgba(124,58,237,.25)
 
 def build_ui():
     theme = gr.themes.Soft()
-    with gr.Blocks(title="ColPali PDF RAG + Follow-up Responses", theme=theme, css=CUSTOM_CSS) as demo:
+    with gr.Blocks(title="ColPali Agentic RAG", theme=theme, css=CUSTOM_CSS) as demo:
         gr.HTML(
             """
             <div class="app-header">
               <div class="icon">📚</div>
               <div>
-                <h1>ColPali PDF Search + Streaming Agent (Follow-up Responses)</h1>
-                <p>Index PDFs with ColQwen2. The agent attaches images in follow-up GPT-5 calls; MCP is search-only.</p>
+                <h1>ColPali PDF Search + GPT-5 Agent</h1>
+                <p>Index PDFs with ColQwen2. The agent uses the search tool through MCP; the tool returns either textual summaries or images by reference, which are attached to the conversation in follow-up GPT-5 calls.</p>
              </div>
            </div>
            """
@@ -627,10 +635,10 @@ def build_ui():
             search_synthetize_button = gr.Button("🔍 Search & Synthetize", variant="primary")
 
         with gr.Column(scale=2):
-            output_docs = gr.Textbox(label="Indices (0-based)", lines=1, placeholder="[0, 1, 2, ...]")
+            output_docs = gr.Textbox(label="Indices", lines=1, placeholder="[0, 1, 2, ...]")
             output_text = gr.Textbox(label="ColQwen+GPT-5 Answer", lines=12, placeholder="...")
 
-    search_button.click(search, inputs=[query_box, k_slider], outputs=[output_docs])
+    search_button.click(image_search, inputs=[query_box, k_slider], outputs=[output_docs])
     search_synthetize_button.click(search_synthetize, inputs=[query_box, k_slider], outputs=[output_text])
 
     # ---- Tab 3: Agent (Streaming)
@@ -670,9 +678,9 @@ def build_ui():
             )
             with gr.Row():
                 visual_reasoning_box = gr.Dropdown(
-                    label="Visual Reasoning",
-                    choices=["Visual Reasoning", "Vision Summary"],
-                    value="Visual Reasoning",
+                    label="Reasoning Mode",
+                    choices=["Visual Reasoning", "Seeded Visual Reasoning", "Visual Summary Reasoning"],
+                    value="Visual Summary Reasoning",
                 )
 
             with gr.Column(scale=3):
 
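A subtlety in the new mode handling: the substring check "Visual Reasoning" in visual_reasoning matches "Visual Reasoning" and "Seeded Visual Reasoning" but not "Visual Summary Reasoning", so only the two visual modes route to the image-search tool and SYSTEM1. A compact sketch of the mapping (resolve_mode is a hypothetical helper, not in app.py):

# Hypothetical helper mirroring the mode logic added in stream_agent.
def resolve_mode(mode: str):
    visual = "Visual Reasoning" in mode         # True only for the two visual modes
    seeded = mode == "Seeded Visual Reasoning"  # only this mode seeds pages on round 1
    tool = "visual_deepsearch_image_search" if visual else "visual_deepsearch_search_synthetize"
    return visual, seeded, tool

# resolve_mode("Visual Summary Reasoning") -> (False, False, "visual_deepsearch_search_synthetize")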