visual-deepsearch

Sleeping

App Files Files Community

manu committed 5 days ago

Commit

1b5281e

verified ·

1 Parent(s): d88ef28

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -33

app.py CHANGED Viewed

@@ -170,11 +170,11 @@ PDF pages:
 # Local Search (ColPali)
 # =============================
-def search(query: str, k: int = 5) -> List[int]:
     """
     Search within a PDF document for the most relevant pages to answer a query and return the page indexes as a list.
     MCP tool description:
-      - name: mcp_test_search
       - description: Search within a PDF document for the most relevant pages to answer a query.
       - input_schema:
           type: object
@@ -212,7 +212,7 @@ def search_synthetize(query: str, k: int = 5) -> List[int]:
     """
     Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
     MCP tool description:
-      - name: mcp_test_search_synthetize
       - description: Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
       - input_schema:
           type: object
@@ -226,13 +226,15 @@ def search_synthetize(query: str, k: int = 5) -> List[int]:
     Returns:
         ai_response (str): Text answer to the query grounded in content from the PDF, with citations (page numbers).
     """
-    top_k_indices = search(query, k)
     expanded = set(top_k_indices)
     for i in top_k_indices:
         expanded.add(i - 1)
         expanded.add(i + 1)
     expanded = {i for i in expanded if 0 <= i < len(images)}
     expanded = sorted(expanded)
     # Build gallery results with 1-based page numbering
     results = []
@@ -268,12 +270,13 @@ def _build_image_parts_from_indices(indices: List[int]) -> List[Dict[str, Any]]:
 SYSTEM1 = (
     """
-You are a PDF research agent with a single tool: mcp_test_search(query: string, k: int).
 Act iteratively:
-  1) Split the user question into 1–4 focused sub-queries. Subqueries should be asked as natural language questions, not just keywords.
-  2) For each sub-query, call mcp_test_search (k=5 by default; increase to up to 10 if you need to go deep).
-  3) You will receive the output of mcp_test_search as a list of indices corresponding to page numbers. Stop generating once all the tool calls end. You will be fed the corresponding pages as images in a follow-up message.
-  4) Stop early when confident; otherwise run new search calls using the tool to find additional missing information. Use up to 5 rounds of iterations and 20 searches in total. If info is missing, try to continue searching using new keywords and queries.
 Workflow:
   • Use ONLY the provided images for grounding and cite as (p.<page>).
@@ -286,10 +289,10 @@ Deliverable:
 SYSTEM2 = """
-You are a PDF research agent with a single tool: mcp_test_search_synthetize(query: string, k: int).
 Act iteratively:
   1) Split the user question into 1–4 focused sub-queries. Subqueries should be asked as natural language questions, not just keywords.
-  2) For each sub-query, call mcp_test_search_synthetize (k=5 by default; increase to up to 20 if you need to go deep).
   3) Stop early when confident; otherwise refine and repeat, up to 4 iterations and 20 searches in total. If info is missing, try to continue searching using new keywords and queries.
 Grounding & citations:
@@ -325,11 +328,20 @@ def stream_agent(question: str,
     Multi-round streaming:
       • Seed: optional local ColPali search on the user question to attach initial pages.
       • Each round: open a GPT-5 stream with *attached images* (if any).
-      • If the model calls mcp_test_search and returns indices, we end the stream and
         start a NEW API call with previous_response_id + the requested pages attached.
     """
-    visual_reasoning = True if visual_reasoning=="Visual Reasoning" else False
-    allowed_tools = "mcp_test_search"  if visual_reasoning else "mcp_test_search_synthetize"
     SYSTEM= SYSTEM1 if visual_reasoning else SYSTEM2
     if not api_key:
@@ -342,12 +354,6 @@ def stream_agent(question: str,
     client = OpenAI(api_key=api_key)
-    # Optional seeding: attach some likely pages on round 1
-    try:
-        seed_indices = [] if visual_reasoning is False else search(question, k=5)
-    except Exception as e:
-        yield f"❌ Search failed: {e}", "", ""
-        return
     log_lines = ["Log", f"[seed] indices={seed_indices}"]
     prev_response_id: Optional[str] = None
@@ -386,9 +392,10 @@ def stream_agent(question: str,
             parts.append({"type": "input_text", "text": "Continue reasoning with the newly attached pages. Remember you should probably further query the search tool."})
         parts += _build_image_parts_from_indices(attached_indices)
-        if attached_indices:
-            pages_str = ", ".join(str(i + 1) for i in sorted(set(attached_indices)))
-            parts.append({"type": "input_text", "text": f"(Attached pages from round {round_idx}: {pages_str}). Ground your answer in these images, or query for new pages."})
         # First call includes system; follow-ups use previous_response_id
         if prev_response_id:
@@ -404,7 +411,7 @@ def stream_agent(question: str,
             input=req_input,
             reasoning={"effort": "medium", "summary": "auto"},
             tools=tools,
-            store=True,  # persist conversation state on server
         )
         if prev_response_id:
             req_kwargs["previous_response_id"] = prev_response_id
@@ -493,12 +500,13 @@ def stream_agent(question: str,
                 expanded.add(i - 1)
                 expanded.add(i + 1)
             expanded = {i for i in expanded if 0 <= i < len(images)}
-            pending_indices = sorted(expanded) if len(expanded) < 15 else sorted(base)
             round_idx += 1
             continue
         # No further tool-driven retrieval → done
         break
     return
@@ -567,14 +575,14 @@ body {background: radial-gradient(1200px 600px at 20% -10%, rgba(124,58,237,.25)
 def build_ui():
     theme = gr.themes.Soft()
-    with gr.Blocks(title="ColPali PDF RAG + Follow-up Responses", theme=theme, css=CUSTOM_CSS) as demo:
         gr.HTML(
             """
             <div class="app-header">
               <div class="icon">📚</div>
               <div>
-                <h1>ColPali PDF Search + Streaming Agent (Follow-up Responses)</h1>
-                <p>Index PDFs with ColQwen2. The agent attaches images in follow-up GPT-5 calls; MCP is search-only.</p>
               </div>
             </div>
             """
@@ -627,10 +635,10 @@ def build_ui():
                     search_synthetize_button = gr.Button("🔍 Search & Synthetize", variant="primary")
                 with gr.Column(scale=2):
-                    output_docs = gr.Textbox(label="Indices (0-based)", lines=1, placeholder="[0, 1, 2, ...]")
                     output_text = gr.Textbox(label="ColQwen+GPT-5 Answer", lines=12, placeholder="...")
-            search_button.click(search, inputs=[query_box, k_slider], outputs=[output_docs])
             search_synthetize_button.click(search_synthetize, inputs=[query_box, k_slider], outputs=[output_text])
         # ---- Tab 3: Agent (Streaming)
@@ -670,9 +678,9 @@ def build_ui():
                             )
                         with gr.Row():
                             visual_reasoning_box = gr.Dropdown(
-                                label="Visual Reasoning",
-                                choices=["Visual Reasoning", "Vision Summary"],
-                                value="Visual Reasoning",
                             )
                 with gr.Column(scale=3):

 # Local Search (ColPali)
 # =============================
+def image_search(query: str, k: int = 5) -> List[int]:
     """
     Search within a PDF document for the most relevant pages to answer a query and return the page indexes as a list.
     MCP tool description:
+      - name: visual_deepsearch_image_search
       - description: Search within a PDF document for the most relevant pages to answer a query.
       - input_schema:
           type: object
     """
     Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
     MCP tool description:
+      - name: visual_deepsearch_search_synthetize
       - description: Search within a PDF document for the most relevant pages to answer a query and synthetizes a short grounded answer using only those pages.
       - input_schema:
           type: object
     Returns:
         ai_response (str): Text answer to the query grounded in content from the PDF, with citations (page numbers).
     """
+    top_k_indices = image_search(query, k)
     expanded = set(top_k_indices)
     for i in top_k_indices:
         expanded.add(i - 1)
         expanded.add(i + 1)
     expanded = {i for i in expanded if 0 <= i < len(images)}
     expanded = sorted(expanded)
+    expanded = expanded if len(expanded) < 20 else sorted(top_k_indices)
     # Build gallery results with 1-based page numbering
     results = []
 SYSTEM1 = (
     """
+You are a PDF research agent with a single tool: visual_deepsearch_image_search(query: string, k: int).
 Act iteratively:
+  1) Split the user question into 1–4 focused sub-queries. You can use the provided page images to help you ask relevant followup queries. Subqueries should be asked as natural language questions, not just keywords.
+  2) For each sub-query, call visual_deepsearch_image_search (k=5 by default; increase to up to 10 if you need to go deep).
+  3) You will receive the output of visual_deepsearch_image_search as a list of indices corresponding to page numbers. Print the page numbers out and stop generating. An external system will take over and convert the indices into images for you.
+  4) Analyze the images received to find information you were looking for. If you are confident that you have all the information needed for a complete response, stop early and provide a final answer. Otherwise run new search calls using the tool to find additional missing information.
+  5) Repeat the process for up to 5 rounds of iterations and 20 searches in total. If info is missing, try to continue searching using new keywords and queries.
 Workflow:
   • Use ONLY the provided images for grounding and cite as (p.<page>).
 SYSTEM2 = """
+You are a PDF research agent with a single tool: visual_deepsearch_search_synthetize(query: string, k: int).
 Act iteratively:
   1) Split the user question into 1–4 focused sub-queries. Subqueries should be asked as natural language questions, not just keywords.
+  2) For each sub-query, call visual_deepsearch_search_synthetize (k=5 by default; increase to up to 20 if you need to go deep).
   3) Stop early when confident; otherwise refine and repeat, up to 4 iterations and 20 searches in total. If info is missing, try to continue searching using new keywords and queries.
 Grounding & citations:
     Multi-round streaming:
       • Seed: optional local ColPali search on the user question to attach initial pages.
       • Each round: open a GPT-5 stream with *attached images* (if any).
+      • If the model calls the tool and returns indices, we end the stream and
         start a NEW API call with previous_response_id + the requested pages attached.
     """
+        # Optional seeding: attach some likely pages on round 1
+    try:
+        seed_indices = search(question, k=5) if visual_reasoning == "Seeded Visual Reasoning" else []
+    except Exception as e:
+        yield f"❌ Search failed: {e}", "", ""
+        return
+    visual_reasoning: bool = True if "Visual Reasoning" in visual_reasoning else False
+    allowed_tools = "visual_deepsearch_image_search"  if visual_reasoning else "visual_deepsearch_search_synthetize"
     SYSTEM= SYSTEM1 if visual_reasoning else SYSTEM2
     if not api_key:
     client = OpenAI(api_key=api_key)
     log_lines = ["Log", f"[seed] indices={seed_indices}"]
     prev_response_id: Optional[str] = None
             parts.append({"type": "input_text", "text": "Continue reasoning with the newly attached pages. Remember you should probably further query the search tool."})
         parts += _build_image_parts_from_indices(attached_indices)
+        # if attached_indices:
+        #    pages_str = ", ".join(str(i + 1) for i in sorted(set(attached_indices)))
+        #    parts.append({"type": "input_text", "text": f"(Attached pages from round {round_idx}: {pages_str}). Ground your answer in these images, or query for new pages."})
         # First call includes system; follow-ups use previous_response_id
         if prev_response_id:
             input=req_input,
             reasoning={"effort": "medium", "summary": "auto"},
             tools=tools,
+            store=True,
         )
         if prev_response_id:
             req_kwargs["previous_response_id"] = prev_response_id
                 expanded.add(i - 1)
                 expanded.add(i + 1)
             expanded = {i for i in expanded if 0 <= i < len(images)}
+            pending_indices = sorted(expanded) if len(expanded) < 20 else sorted(base)
             round_idx += 1
             continue
         # No further tool-driven retrieval → done
         break
+    print("Search Finished")
     return
 def build_ui():
     theme = gr.themes.Soft()
+    with gr.Blocks(title="ColPali Agentic RAG", theme=theme, css=CUSTOM_CSS) as demo:
         gr.HTML(
             """
             <div class="app-header">
               <div class="icon">📚</div>
               <div>
+                <h1>ColPali PDF Search + GPT5 Agent</h1>
+                <p>Index PDFs with ColQwen2. The agent uses the search tool through MCP. The search tool returns either textual summaries or images by reference which are attached to conversation in follow-up GPT-5 calls.</p>
               </div>
             </div>
             """
                     search_synthetize_button = gr.Button("🔍 Search & Synthetize", variant="primary")
                 with gr.Column(scale=2):
+                    output_docs = gr.Textbox(label="Indices", lines=1, placeholder="[0, 1, 2, ...]")
                     output_text = gr.Textbox(label="ColQwen+GPT-5 Answer", lines=12, placeholder="...")
+            search_button.click(image_search, inputs=[query_box, k_slider], outputs=[output_docs])
             search_synthetize_button.click(search_synthetize, inputs=[query_box, k_slider], outputs=[output_text])
         # ---- Tab 3: Agent (Streaming)
                             )
                         with gr.Row():
                             visual_reasoning_box = gr.Dropdown(
+                                label="Reasoning Mode",
+                                choices=["Visual Reasoning", "Seeded Visual Reasoning", "Visual Summary Reasoning"],
+                                value="Visual Summary Reasoning",
                             )
                 with gr.Column(scale=3):