jedick committed · Commit 3575a77 · Parent: ace4242

Enable thinking for answer

Files changed:
- app.py: +9 -8
- graph.py: +17 -13
- main.py: +8 -8
- mods/tool_calling_llm.py: +11 -3
- prompts.py: +2 -2
app.py CHANGED

@@ -88,7 +88,7 @@ def run_workflow(input, history, compute_mode, thread_id, session_hash):
     # Get the chat model and build the graph
     chat_model = GetChatModel(compute_mode)
     graph_builder = BuildGraph(
-        chat_model, compute_mode, search_type,
+        chat_model, compute_mode, search_type, think_answer=True
     )
     # Compile the graph with an in-memory checkpointer
     memory = MemorySaver()

@@ -184,7 +184,7 @@ def run_workflow(input, history, compute_mode, thread_id, session_hash):
             retrieved_emails = "\n\n".join(retrieved_emails)
             yield history, retrieved_emails, []
 
-        if node == "generate":
+        if node == "answer":
             # Append messages (thinking and non-thinking) to history
             chunk_messages = chunk["messages"]
             history = append_content(chunk_messages, history, thinking_about="answer")

@@ -383,8 +383,9 @@ with gr.Blocks(
             status_text = f"""
 Now in **local** mode, using ZeroGPU hardware<br>
 Response time is about one minute<br>
-
-
+🧠 Thinking is enabled for the answer<br>
+  Add **/think** to enable thinking for the query</br>
+🚫 Add **/no_think** to disable all thinking</br>
 ✨ [nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) and [{model_id.split("/")[-1]}](https://huggingface.co/{model_id})<br>
 See the project's [GitHub repository](https://github.com/jedick/R-help-chat)
             """

@@ -412,15 +413,15 @@ with gr.Blocks(
         """Get example questions based on compute mode"""
         questions = [
             # "What is today's date?",
-            "Summarize emails from the last two months",
-            "
+            "Summarize emails from the last two months /no_think",
+            "Show me code examples using plotmath",
             "When was has.HLC mentioned?",
             "Who reported installation problems in 2023-2024?",
         ]
 
         if compute_mode == "remote":
-            # Remove "/
-            questions = [q.replace(" /
+            # Remove "/no_think" from questions in remote mode
+            questions = [q.replace(" /no_think", "") for q in questions]
 
         # cf. https://github.com/gradio-app/gradio/pull/8745 for updating examples
         return gr.Dataset(samples=[[q] for q in questions]) if as_dataset else questions
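This change passes think_answer=True to BuildGraph and keys the streamed output on the renamed "answer" node. Below is a minimal routing sketch in the same spirit, assuming `graph` is the compiled graph returned by BuildGraph and using LangGraph's "updates" stream mode; the loop, the print_update helper, and the question text are illustrative, not the app's exact code:

# Hypothetical sketch: `graph`, the config, and print_update are placeholders.
def print_update(node, update):
    """Print the message(s) produced by one graph node."""
    messages = update.get("messages", [])
    if not isinstance(messages, list):
        messages = [messages]
    for message in messages:
        print(f"[{node}] {getattr(message, 'content', message)}")

config = {"configurable": {"thread_id": "demo"}}
inputs = {"messages": [("user", "Who reported installation problems in 2023-2024?")]}
for chunk in graph.stream(inputs, config, stream_mode="updates"):
    for node, update in chunk.items():
        if node == "retrieve_emails":
            print_update(node, update)  # retrieved email documents
        if node == "answer":
            print_update(node, update)  # thinking content plus the final answer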
graph.py CHANGED

@@ -9,7 +9,7 @@ import os
 
 # Local modules
 from retriever import BuildRetriever
-from prompts import query_prompt,
+from prompts import query_prompt, answer_prompt, generic_tools_template
 from mods.tool_calling_llm import ToolCallingLLM
 
 # For tracing (disabled)

@@ -94,6 +94,7 @@ def BuildGraph(
     search_type,
     top_k=6,
     think_query=False,
+    think_answer=False,
 ):
     """
     Build conversational RAG graph for email retrieval and answering with citations.

@@ -103,7 +104,8 @@ def BuildGraph(
         compute_mode: remote or local (for retriever)
         search_type: dense, sparse, or hybrid (for retriever)
         top_k: number of documents to retrieve
-        think_query: Whether to use thinking mode for query
+        think_query: Whether to use thinking mode for the query
+        think_answer: Whether to use thinking mode for the answer
 
     Based on:
         https://python.langchain.com/docs/how_to/qa_sources

@@ -193,11 +195,11 @@ def BuildGraph(
             chat_model, query_prompt(chat_model, think=think_query)
         ).bind_tools([retrieve_emails])
         # Don't use answer_with_citations tool because responses with are sometimes unparseable
-
+        answer_model = chat_model
     else:
         # For remote model (OpenAI API)
         query_model = chat_model.bind_tools([retrieve_emails])
-
+        answer_model = chat_model.bind_tools([answer_with_citations])
 
     # Initialize the graph object
     graph = StateGraph(MessagesState)

@@ -216,27 +218,29 @@ def BuildGraph(
 
         return {"messages": response}
 
-    def generate(state: MessagesState):
+    def answer(state: MessagesState):
         """Generates an answer with the chat model"""
         if is_local:
             messages = state["messages"]
-            # print_message_summaries(messages, "---
+            # print_message_summaries(messages, "--- answer: before normalization ---")
             messages = normalize_messages(messages)
             # Add the system message here because we're not using tools
-            messages = [
-
+            messages = [
+                SystemMessage(answer_prompt(chat_model, think=think_answer))
+            ] + messages
+            # print_message_summaries(messages, "--- answer: after normalization ---")
         else:
             messages = [
-                SystemMessage(
+                SystemMessage(answer_prompt(chat_model, with_tools=True))
             ] + state["messages"]
-        response =
+        response = answer_model.invoke(messages)
 
         return {"messages": response}
 
     # Define model and tool nodes
     graph.add_node("query", query)
-    graph.add_node("generate", generate)
     graph.add_node("retrieve_emails", ToolNode([retrieve_emails]))
+    graph.add_node("answer", answer)
     graph.add_node("answer_with_citations", ToolNode([answer_with_citations]))
 
     # Route the user's input to the query model

@@ -249,13 +253,13 @@ def BuildGraph(
         {END: END, "tools": "retrieve_emails"},
     )
     graph.add_conditional_edges(
-        "generate",
+        "answer",
         tools_condition,
         {END: END, "tools": "answer_with_citations"},
     )
 
     # Add edge from the retrieval tool to the generating model
-    graph.add_edge("retrieve_emails", "generate")
+    graph.add_edge("retrieve_emails", "answer")
 
     # Done!
     return graph
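After this rewiring, the "answer" node's result is routed by tools_condition: a response carrying an answer_with_citations tool call goes on to the ToolNode, anything else ends the run. A small standalone illustration of that routing follows; the tool-call arguments are invented for the demo and are not the tool's real schema:

# Standalone illustration of tools_condition routing; args are made up.
from langgraph.graph import END
from langgraph.prebuilt import tools_condition
from langchain_core.messages import AIMessage

with_tool_call = AIMessage(
    content="",
    tool_calls=[{
        "name": "answer_with_citations",
        "args": {"answer": "...", "citations": "..."},  # illustrative args only
        "id": "call_1",
    }],
)
plain = AIMessage(content="A plain answer with no tool call.")

print(tools_condition({"messages": [with_tool_call]}))  # "tools": go to the answer_with_citations ToolNode
print(tools_condition({"messages": [plain]}) == END)    # True: the graph ends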
main.py CHANGED

@@ -23,7 +23,7 @@ from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
 from index import ProcessFile
 from retriever import BuildRetriever, db_dir
 from graph import BuildGraph
-from prompts import
+from prompts import answer_prompt
 
 # -----------
 # R-help-chat

@@ -201,7 +201,7 @@ def RunChain(
     chat_model = GetChatModel(compute_mode)
 
     # Get prompt with /no_think for SmolLM3/Qwen
-    system_prompt =
+    system_prompt = answer_prompt(chat_model)
 
     # Create a prompt template
     system_template = ChatPromptTemplate.from_messages([SystemMessage(system_prompt)])

@@ -236,8 +236,8 @@ def RunGraph(
     compute_mode: str = "remote",
     search_type: str = "hybrid",
     top_k: int = 6,
-
-
+    think_query=False,
+    think_answer=False,
     thread_id=None,
 ):
     """Run graph for conversational RAG app

@@ -247,8 +247,8 @@ def RunGraph(
         compute_mode: Compute mode for embedding and chat models (remote or local)
         search_type: Type of search to use. Options: "dense", "sparse", or "hybrid"
         top_k: Number of documents to retrieve
-
-
+        think_query: Whether to use thinking mode for the query
+        think_answer: Whether to use thinking mode for the answer
         thread_id: Thread ID for memory (optional)
 
     Example:

@@ -263,8 +263,8 @@ def RunGraph(
         compute_mode,
         search_type,
         top_k,
-
-
+        think_query,
+        think_answer,
     )
 
     # Compile the graph with an in-memory checkpointer
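For completeness, a hedged sketch of calling RunGraph with the new flags. Only the keyword arguments added in this commit are certain from the diff; the positional question argument and its name are assumptions based on the docstring:

# Assumed call pattern; the first positional argument (the user's question)
# is not shown in this diff, so treat it as hypothetical.
RunGraph(
    "Who reported installation problems in 2023-2024?",
    compute_mode="local",   # thinking flags matter for the local model
    search_type="hybrid",
    top_k=6,
    think_query=False,      # keep the query step fast
    think_answer=True,      # enable thinking for the final answer
)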
mods/tool_calling_llm.py CHANGED

@@ -183,10 +183,18 @@ class ToolCallingLLM(BaseChatModel, ABC):
 
         # Parse output for JSON (support multiple objects separated by commas)
         try:
+            # Works for one or more JSON objects not enclosed in "[]"
             parsed_json_results = json.loads(f"[{post_think}]")
-        except
-
-
+        except:
+            try:
+                # Works for one or more JSON objects already enclosed in "[]"
+                parsed_json_results = json.loads(f"{post_think}")
+            except json.JSONDecodeError:
+                # Return entire response if JSON wasn't parsed (or is missing)
+                return AIMessage(content=response_message.content)
+
+        # print("parsed_json_results")
+        # print(parsed_json_results)
 
         tool_calls = []
         for parsed_json_result in parsed_json_results:
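The new fallback chain tries the bracket-wrapped text first (one or more bare, comma-separated JSON objects), then the text as-is, and finally returns the raw response content. A self-contained sketch of the same idea outside the class; the function name and the tool-call keys are illustrative:

import json

def parse_tool_json(post_think: str):
    """Simplified stand-in for the parsing logic above."""
    try:
        # One or more bare JSON objects, e.g. '{"a": 1}' or '{"a": 1}, {"b": 2}'
        return json.loads(f"[{post_think}]")
    except json.JSONDecodeError:
        try:
            # Second chance: parse the text exactly as given
            return json.loads(post_think)
        except json.JSONDecodeError:
            # No JSON at all; the caller falls back to a plain AIMessage
            return None

print(parse_tool_json('{"name": "retrieve_emails", "args": {"query": "has.HLC"}}'))
print(parse_tool_json('{"name": "a", "args": {}}, {"name": "b", "args": {}}'))
print(parse_tool_json("I could not produce a tool call."))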
prompts.py CHANGED

@@ -46,8 +46,8 @@ def query_prompt(chat_model, think=False):
     return prompt
 
 
-def
-    """Return system prompt for
+def answer_prompt(chat_model, think=False, with_tools=False):
+    """Return system prompt for answer step"""
     prompt = (
         f"Today Date: {date.today()}. "
         "You are a helpful chatbot designed to answer questions about R programming based on the R-help mailing list archives. "