Spaces:

jedick
/

R-help-chat

Running on Zero

App Files Files Community

jedick committed 27 days ago

Commit

84ccc57

1 Parent(s): 1130c52

Display thinking output

Browse files

Files changed (2) hide show

app.py +34 -15
mods/tool_calling_llm.py +33 -7

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ from langgraph.checkpoint.memory import MemorySaver
 from dotenv import load_dotenv
 from main import openai_model, model_id
 from util import get_sources, get_start_end_months
 import requests
 import zipfile
 import shutil
@@ -16,6 +17,8 @@ import torch
 import uuid
 import ast
 import os
 # Setup environment variables
 load_dotenv(dotenv_path=".env", override=True)
@@ -71,7 +74,7 @@ def run_workflow(input, history, compute_mode, thread_id, session_hash):
         graph_instances[compute_mode][session_hash] = graph
         print(f"Set {compute_mode} graph for session {session_hash}")
         # Notify when model finishes loading
-        gr.Success(f"{compute_mode}", duration=4, title=f"Model loaded")
     print(f"Using thread_id: {thread_id}")
@@ -94,6 +97,17 @@ def run_workflow(input, history, compute_mode, thread_id, session_hash):
         if node == "query":
             # Get the message (AIMessage class in LangChain)
             chunk_messages = chunk["messages"]
             # Look for tool calls
             if chunk_messages.tool_calls:
                 # Loop over tool calls
@@ -114,11 +128,6 @@ def run_workflow(input, history, compute_mode, thread_id, session_hash):
                             metadata={"title": f"🔍 Running tool {tool_call['name']}"},
                         )
                     )
-            if chunk_messages.content:
-                # Display response made instead of or in addition to a tool call
-                history.append(
-                    gr.ChatMessage(role="assistant", content=chunk_messages.content)
-                )
             yield history, [], []
         if node == "retrieve_emails":
@@ -165,9 +174,18 @@ def run_workflow(input, history, compute_mode, thread_id, session_hash):
             chunk_messages = chunk["messages"]
             # Chat response without citations
             if chunk_messages.content:
-                history.append(
-                    gr.ChatMessage(role="assistant", content=chunk_messages.content)
-                )
             # None is used for no change to the retrieved emails textbox
             yield history, None, []
@@ -267,7 +285,7 @@ with gr.Blocks(
         render=False,
     )
     data_error = gr.Textbox(
-        value="App is unavailable because data could not be loaded. Try reloading the page, then contact the maintainer if the problem persists.",
         lines=1,
         label="Error downloading or extracting data",
         visible=False,
@@ -343,7 +361,7 @@ with gr.Blocks(
             ## 🇷🤝💬 R-help-chat
             **Chat with the [R-help mailing list archives](https://stat.ethz.ch/pipermail/r-help/).**
-            An LLM turns your question into a search query, including year ranges, and generates an answer from the retrieved emails.
             You can ask follow-up questions with the chat history as context.
             ➡️ To clear the history and start a new chat, press the 🗑️ clear button.
             **_Answers may be incorrect._**
@@ -361,7 +379,8 @@ with gr.Blocks(
         if compute_mode == "local":
             status_text = f"""
             📍 Now in **local** mode, using ZeroGPU hardware<br>
-            ⌛ Response time is about 1 minute<br>
             ✨ [nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) and [{model_id.split("/")[-1]}](https://huggingface.co/{model_id})<br>
             🏠 See the project's [GitHub repository](https://github.com/jedick/R-help-chat)
             """
@@ -379,8 +398,8 @@ with gr.Blocks(
             end = None
         info_text = f"""
             **Database:** {len(sources)} emails from {start} to {end}.
-            **Features:** RAG, today's date, hybrid search (dense+sparse), query analysis,
-            multiple retrievals per turn (remote mode), answer with citations (remote mode), chat memory.
             **Tech:** LangChain + Hugging Face + Gradio; ChromaDB and BM25S-based retrievers.<br>
             """
         return info_text
@@ -410,7 +429,7 @@ with gr.Blocks(
                 example_questions = [
                     # "What is today's date?",
                     "Summarize emails from the last two months",
-                    "How to use plotmath?",
                     "When was has.HLC mentioned?",
                     "Who reported installation problems in 2023-2024?",
                 ]

 from dotenv import load_dotenv
 from main import openai_model, model_id
 from util import get_sources, get_start_end_months
+from mods.tool_calling_llm import extract_think
 import requests
 import zipfile
 import shutil
 import uuid
 import ast
 import os
+import re
 # Setup environment variables
 load_dotenv(dotenv_path=".env", override=True)
         graph_instances[compute_mode][session_hash] = graph
         print(f"Set {compute_mode} graph for session {session_hash}")
         # Notify when model finishes loading
+        gr.Success(f"{compute_mode}", duration=4, title=f"Model loaded!")
     print(f"Using thread_id: {thread_id}")
         if node == "query":
             # Get the message (AIMessage class in LangChain)
             chunk_messages = chunk["messages"]
+            # Display non-tool-call content
+            if chunk_messages.content:
+                content = chunk_messages.content
+                metadata = None
+                # Show thinking content in "metadata" message
+                if content.startswith("<think>"):
+                    content, _ = extract_think(content)
+                    metadata = {"title": f"🧠 Thinking about query"}
+                history.append(
+                    gr.ChatMessage(role="assistant", content=content, metadata=metadata)
+                )
             # Look for tool calls
             if chunk_messages.tool_calls:
                 # Loop over tool calls
                             metadata={"title": f"🔍 Running tool {tool_call['name']}"},
                         )
                     )
             yield history, [], []
         if node == "retrieve_emails":
             chunk_messages = chunk["messages"]
             # Chat response without citations
             if chunk_messages.content:
+                content = chunk_messages.content
+                # Show thinking content in "metadata" message
+                think_text, content = extract_think(content)
+                if think_text:
+                    history.append(
+                        gr.ChatMessage(
+                            role="assistant",
+                            content=think_text,
+                            metadata={"title": f"🧠 Thinking about answer"},
+                        )
+                    )
+                history.append(gr.ChatMessage(role="assistant", content=content))
             # None is used for no change to the retrieved emails textbox
             yield history, None, []
         render=False,
     )
     data_error = gr.Textbox(
+        value="Email database is missing. Try reloading the page, then contact the maintainer if the problem persists.",
         lines=1,
         label="Error downloading or extracting data",
         visible=False,
             ## 🇷🤝💬 R-help-chat
             **Chat with the [R-help mailing list archives](https://stat.ethz.ch/pipermail/r-help/).**
+            An LLM turns your question into a search query, including year ranges and months, and generates an answer from the retrieved emails.
             You can ask follow-up questions with the chat history as context.
             ➡️ To clear the history and start a new chat, press the 🗑️ clear button.
             **_Answers may be incorrect._**
         if compute_mode == "local":
             status_text = f"""
             📍 Now in **local** mode, using ZeroGPU hardware<br>
+            ⌛ Response time is about one minute<br>
+            🧠 Thinking is enabled for query; add **/think** to enable thinking for answer</br>
             ✨ [nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) and [{model_id.split("/")[-1]}](https://huggingface.co/{model_id})<br>
             🏠 See the project's [GitHub repository](https://github.com/jedick/R-help-chat)
             """
             end = None
         info_text = f"""
             **Database:** {len(sources)} emails from {start} to {end}.
+            **Features:** RAG, today's date, hybrid search (dense+sparse), thinking display (local),
+            multiple retrievals per turn (remote), answer with citations (remote), chat memory.
             **Tech:** LangChain + Hugging Face + Gradio; ChromaDB and BM25S-based retrievers.<br>
             """
         return info_text
                 example_questions = [
                     # "What is today's date?",
                     "Summarize emails from the last two months",
+                    "Advice on using plotmath /think",
                     "When was has.HLC mentioned?",
                     "Who reported installation problems in 2023-2024?",
                 ]

mods/tool_calling_llm.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import re
 import json
 import uuid
 from abc import ABC
 from shutil import Error
 from typing import (
@@ -145,6 +146,19 @@ def parse_response(message: BaseMessage) -> str:
     raise ValueError(f"`message` is not an instance of `AIMessage`: {message}")
 class ToolCallingLLM(BaseChatModel, ABC):
     """ToolCallingLLM mixin to enable tool calling features on non tool calling models.
@@ -239,7 +253,7 @@ class ToolCallingLLM(BaseChatModel, ABC):
     """  # noqa: E501
     tool_system_prompt_template: str = DEFAULT_SYSTEM_TEMPLATE
-    # Suffix to add to the system prompt that is not templated (variable names are not interpreted)
     system_message_suffix: str = ""
     override_bind_tools: bool = True
@@ -301,7 +315,7 @@ class ToolCallingLLM(BaseChatModel, ABC):
         system_message = system_message_prompt_template.format(
             tools=json.dumps(functions, indent=2)
         )
-        # Add extra context after the formatted system message
         system_message = SystemMessage(
             system_message.content + self.system_message_suffix
         )
@@ -313,14 +327,22 @@ class ToolCallingLLM(BaseChatModel, ABC):
         chat_generation_content = response_message.content
         if not isinstance(chat_generation_content, str):
             raise ValueError("ToolCallingLLM does not support non-string output.")
         try:
             parsed_chat_result = json.loads(chat_generation_content)
         except json.JSONDecodeError:
             try:
                 parsed_chat_result = parse_json_garbage(chat_generation_content)
             except Exception:
                 return AIMessage(content=chat_generation_content)
         called_tool_name = (
             parsed_chat_result["tool"]
             if "tool" in parsed_chat_result
@@ -349,10 +371,14 @@ class ToolCallingLLM(BaseChatModel, ABC):
             elif "response" in parsed_chat_result:
                 response = parsed_chat_result["response"]
             else:
-                raise ValueError(
-                    f"Failed to parse a response from {self.model} output: "  # type: ignore[attr-defined]
-                    f"{chat_generation_content}"
-                )
             return AIMessage(content=response)
         called_tool_arguments = (
@@ -366,7 +392,7 @@ class ToolCallingLLM(BaseChatModel, ABC):
         )
         response_message_with_functions = AIMessage(
-            content="",
             tool_calls=[
                 ToolCall(
                     name=called_tool_name,

 import re
 import json
 import uuid
+import warnings
 from abc import ABC
 from shutil import Error
 from typing import (
     raise ValueError(f"`message` is not an instance of `AIMessage`: {message}")
def extract_think(content: str) -> tuple[str, str]:
    """Split model output into its ``<think>`` reasoning and the remaining text.

    Added by Cursor 20250726 jmd

    Args:
        content: Raw text generated by the model. Empty or ``None`` input is
            tolerated and treated as "no output".

    Returns:
        A ``(think_text, post_think)`` tuple:

        * ``think_text`` -- stripped text of the first ``<think>...</think>``
          block, or ``""`` when there is no thinking block.
        * ``post_think`` -- the text following ``</think>`` with leading
          whitespace removed, or ``content`` unchanged when it contains no
          thinking block.

        If the output *starts* with ``<think>`` but the closing tag is missing
        (e.g. generation stopped mid-thought), everything after the opening tag
        is returned as ``think_text`` and ``post_think`` is ``""``.
    """
    if not content:
        return "", ""

    # Non-greedy match with DOTALL so the reasoning may span multiple lines
    # and only the first block is consumed.
    think_match = re.search(r"<think>(.*?)</think>", content, re.DOTALL)
    if think_match:
        think_text = think_match.group(1).strip()
        post_think = content[think_match.end() :].lstrip()
        return think_text, post_think

    # Unterminated thinking block: keep the partial reasoning out of the
    # answer text so the raw tag is neither displayed nor parsed as JSON.
    # Only applies when the tag opens the output, so an answer that merely
    # mentions "<think>" is left untouched.
    open_tag = "<think>"
    leading_trimmed = content.lstrip()
    if leading_trimmed.startswith(open_tag):
        return leading_trimmed[len(open_tag) :].strip(), ""

    return "", content
 class ToolCallingLLM(BaseChatModel, ABC):
     """ToolCallingLLM mixin to enable tool calling features on non tool calling models.
     """  # noqa: E501
     tool_system_prompt_template: str = DEFAULT_SYSTEM_TEMPLATE
+    # Suffix to add to the system prompt that is not templated 20250717 jmd
     system_message_suffix: str = ""
     override_bind_tools: bool = True
         system_message = system_message_prompt_template.format(
             tools=json.dumps(functions, indent=2)
         )
+        # Add extra context after the formatted system message 20250717 jmd
         system_message = SystemMessage(
             system_message.content + self.system_message_suffix
         )
         chat_generation_content = response_message.content
         if not isinstance(chat_generation_content, str):
             raise ValueError("ToolCallingLLM does not support non-string output.")
+        # Extract <think>...</think> content and text after </think> for further processing 20250726 jmd
+        think_text, chat_generation_content = extract_think(chat_generation_content)
         try:
             parsed_chat_result = json.loads(chat_generation_content)
         except json.JSONDecodeError:
             try:
                 parsed_chat_result = parse_json_garbage(chat_generation_content)
             except Exception:
+                warnings.warn(f"Failed to parse JSON from {self.model} output")
                 return AIMessage(content=chat_generation_content)
+        print("parsed_chat_result")
+        print(parsed_chat_result)
         called_tool_name = (
             parsed_chat_result["tool"]
             if "tool" in parsed_chat_result
             elif "response" in parsed_chat_result:
                 response = parsed_chat_result["response"]
             else:
+                # raise ValueError(
+                #    f"Failed to parse a response from {self.model} output: "  # type: ignore[attr-defined]
+                #    # Keep this commented for privacy in deployed app 20250727 jmd
+                #    # f"{chat_generation_content}"
+                # )
+                # Change to warning and return the generated content 20250727 jmd
+                warnings.warn(f"Failed to parse a response from {self.model} output")
+                response = chat_generation_content
             return AIMessage(content=response)
         called_tool_arguments = (
         )
         response_message_with_functions = AIMessage(
+            content=f"<think>\n{think_text}\n</think>",
             tool_calls=[
                 ToolCall(
                     name=called_tool_name,