# app.py # Advanced AI R&D Assistant for Hugging Face Spaces # # This app leverages LangGraph, DeepSeek-R1 via text-based function calling, and Agentic RAG. # API keys are securely loaded via environment variables. # # To deploy: # 1. Add your API key to Hugging Face Space secrets with the key DEEP_SEEK_API. # 2. Ensure your requirements.txt includes langchain-community. # 3. Run the app with Streamlit. import os import re import logging import streamlit as st import requests from typing import Sequence from typing_extensions import TypedDict, Annotated # Updated imports for LangChain from langchain.embeddings.openai import OpenAIEmbeddings from langchain.vectorstores import Chroma from langchain.schema import HumanMessage, AIMessage from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.tools.retriever import create_retriever_tool # Imports for LangGraph remain the same from langgraph.graph import END, StateGraph, START from langgraph.prebuilt import ToolNode from langgraph.graph.message import add_messages # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # --- Dummy Data Setup --- research_texts = [ "Research Report: Results of a New AI Model Improving Image Recognition Accuracy to 98%", "Academic Paper Summary: Why Transformers Became the Mainstream Architecture in Natural Language Processing", "Latest Trends in Machine Learning Methods Using Quantum Computing" ] development_texts = [ "Project A: UI Design Completed, API Integration in Progress", "Project B: Testing New Feature X, Bug Fixes Needed", "Product Y: In the Performance Optimization Stage Before Release" ] # --- Preprocessing & Embeddings --- splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10) research_docs = splitter.create_documents(research_texts) development_docs = splitter.create_documents(development_texts) embeddings = OpenAIEmbeddings(model="text-embedding-3-large") research_vectorstore = Chroma.from_documents( documents=research_docs, embedding=embeddings, collection_name="research_collection" ) development_vectorstore = Chroma.from_documents( documents=development_docs, embedding=embeddings, collection_name="development_collection" ) research_retriever = research_vectorstore.as_retriever() development_retriever = development_vectorstore.as_retriever() research_tool = create_retriever_tool( research_retriever, "research_db_tool", "Search information from the research database." ) development_tool = create_retriever_tool( development_retriever, "development_db_tool", "Search information from the development database." ) tools = [research_tool, development_tool] # --- Agent and Workflow Functions --- # Note: We are using only AIMessage and HumanMessage for our message types. class AgentState(TypedDict): messages: Annotated[Sequence[AIMessage | HumanMessage], add_messages] def agent(state: AgentState): logger.info("Agent invoked") messages = state["messages"] user_message = messages[0][1] if isinstance(messages[0], tuple) else messages[0].content prompt = f"""Given this user question: "{user_message}" If it's about research or academic topics, respond EXACTLY in this format: SEARCH_RESEARCH: If it's about development status, respond EXACTLY in this format: SEARCH_DEV: Otherwise, just answer directly. """ headers = { "Accept": "application/json", "Authorization": f"Bearer {os.environ.get('DEEP_SEEK_API')}", "Content-Type": "application/json" } data = { "model": "deepseek-chat", "messages": [{"role": "user", "content": prompt}], "temperature": 0.7, "max_tokens": 1024 } response = requests.post( "https://api.deepseek.com/v1/chat/completions", headers=headers, json=data, verify=False ) if response.status_code == 200: response_text = response.json()['choices'][0]['message']['content'] logger.info(f"DeepSeek response: {response_text}") if "SEARCH_RESEARCH:" in response_text: query = response_text.split("SEARCH_RESEARCH:")[1].strip() results = research_retriever.invoke(query) return {"messages": [AIMessage(content=f'Action: research_db_tool\n{{"query": "{query}"}}\n\nResults: {str(results)}')]} elif "SEARCH_DEV:" in response_text: query = response_text.split("SEARCH_DEV:")[1].strip() results = development_retriever.invoke(query) return {"messages": [AIMessage(content=f'Action: development_db_tool\n{{"query": "{query}"}}\n\nResults: {str(results)}')]} else: return {"messages": [AIMessage(content=response_text)]} else: error_msg = f"DeepSeek API call failed: {response.text}" logger.error(error_msg) raise Exception(error_msg) def simple_grade_documents(state: AgentState): last_message = state["messages"][-1] logger.info(f"Grading message: {last_message.content}") if "Results: [Document" in last_message.content: return "generate" else: return "rewrite" def generate(state: AgentState): logger.info("Generating final answer") messages = state["messages"] question = messages[0].content if not isinstance(messages[0], tuple) else messages[0][1] last_message = messages[-1] docs = "" if "Results: [" in last_message.content: docs = last_message.content[last_message.content.find("Results: ["):] headers = { "Accept": "application/json", "Authorization": f"Bearer {os.environ.get('DEEP_SEEK_API')}", "Content-Type": "application/json" } prompt = f"""Based on these research documents, summarize the latest advancements in AI: Question: {question} Documents: {docs} Focus on extracting and synthesizing the key findings from the research papers. """ data = { "model": "deepseek-chat", "messages": [{"role": "user", "content": prompt}], "temperature": 0.7, "max_tokens": 1024 } response = requests.post( "https://api.deepseek.com/v1/chat/completions", headers=headers, json=data, verify=False ) if response.status_code == 200: response_text = response.json()['choices'][0]['message']['content'] return {"messages": [AIMessage(content=response_text)]} else: error_msg = f"DeepSeek API generate call failed: {response.text}" logger.error(error_msg) raise Exception(error_msg) def rewrite(state: AgentState): logger.info("Rewriting question") original_question = state["messages"][0].content if state["messages"] else "N/A" headers = { "Accept": "application/json", "Authorization": f"Bearer {os.environ.get('DEEP_SEEK_API')}", "Content-Type": "application/json" } data = { "model": "deepseek-chat", "messages": [{"role": "user", "content": f"Rewrite this question to be more specific and clearer: {original_question}"}], "temperature": 0.7, "max_tokens": 1024 } response = requests.post( "https://api.deepseek.com/v1/chat/completions", headers=headers, json=data, verify=False ) if response.status_code == 200: response_text = response.json()['choices'][0]['message']['content'] return {"messages": [AIMessage(content=response_text)]} else: error_msg = f"DeepSeek API rewrite call failed: {response.text}" logger.error(error_msg) raise Exception(error_msg) tools_pattern = re.compile(r"Action: .*") def custom_tools_condition(state: AgentState): last_message = state["messages"][-1] if tools_pattern.match(last_message.content): return "tools" return END # Build the workflow with LangGraph's StateGraph workflow = StateGraph(AgentState) workflow.add_node("agent", agent) retrieve_node = ToolNode(tools) workflow.add_node("retrieve", retrieve_node) workflow.add_node("rewrite", rewrite) workflow.add_node("generate", generate) workflow.add_edge(START, "agent") workflow.add_conditional_edges("agent", custom_tools_condition, {"tools": "retrieve", END: END}) workflow.add_conditional_edges("retrieve", simple_grade_documents) workflow.add_edge("generate", END) workflow.add_edge("rewrite", "agent") app_workflow = workflow.compile() def process_question(user_question, app, config): events = [] for event in app.stream({"messages": [("user", user_question)]}, config): events.append(event) return events # --- Streamlit UI --- def main(): st.set_page_config(page_title="Advanced AI R&D Assistant", layout="wide", initial_sidebar_state="expanded") st.markdown( """ """, unsafe_allow_html=True ) # Sidebar: Display available data with st.sidebar: st.header("📚 Available Data") st.subheader("Research Database") for text in research_texts: st.markdown(f'
{text}
', unsafe_allow_html=True) st.subheader("Development Database") for text in development_texts: st.markdown(f'
{text}
', unsafe_allow_html=True) st.title("🤖 Advanced AI R&D Assistant") st.markdown("---") query = st.text_area("Enter your question:", height=100, placeholder="e.g., What is the latest advancement in AI research?") col1, col2 = st.columns([1, 2]) with col1: if st.button("🔍 Get Answer", use_container_width=True): if query: with st.spinner('Processing your question...'): events = process_question(query, app_workflow, {"configurable": {"thread_id": "1"}}) for event in events: if 'agent' in event: with st.expander("🔄 Processing Step", expanded=True): content = event['agent']['messages'][0].content if "Results:" in content: st.markdown("### 📑 Retrieved Documents:") docs = content[content.find("Results:"):] st.info(docs) elif 'generate' in event: st.markdown("### ✨ Final Answer:") st.success(event['generate']['messages'][0].content) else: st.warning("⚠️ Please enter a question first!") with col2: st.markdown( """ ### 🎯 How to Use 1. Type your question in the text box. 2. Click "Get Answer" to process. 3. View retrieved documents and the final answer. ### 💡 Example Questions - What are the latest advancements in AI research? - What is the status of Project A? - What are the current trends in machine learning? """ ) if __name__ == "__main__": main()