# ------------------------------
# Enhanced NeuroResearch AI System with Domain Adaptability,
# Refinement Counter, Dynamic Difficulty Gradient, and Meta-Refinement Inspired by LADDER
# ------------------------------
import logging
import os
import re
import hashlib
import json
import time
import sys
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Any, Optional, Sequence

import chromadb
import requests
import streamlit as st

# LangChain and LangGraph imports
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langgraph.graph import END, StateGraph
from langgraph.prebuilt import ToolNode
from langgraph.graph.message import add_messages
from typing_extensions import TypedDict, Annotated
from langchain.tools.retriever import create_retriever_tool

# Increase Python's recursion limit at the very start (if needed)
sys.setrecursionlimit(1000)

# ------------------------------
# Logging Configuration
# ------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)

# ------------------------------
# Shared constants and helpers
# ------------------------------
# Domain used whenever the caller does not supply one.
DEFAULT_DOMAIN = "Biomedical Research"

# Number of refinement passes after which the workflow stops refining.
MAX_REFINEMENTS = 3

# Whole-word matchers: a plain substring test for "VALID" also matches "INVALID".
_VALID_RE = re.compile(r"\bVALID\b")
_INVALID_RE = re.compile(r"\bINVALID\b")


def _extract_content(response: Dict[str, Any]) -> str:
    """
    Returns the text of the first choice in a chat-completions style response.

    Yields an empty string when `choices` is missing or empty, or when the
    message or its content is missing or None, instead of raising.
    """
    choices = response.get("choices") or [{}]
    message = choices[0].get("message") or {}
    return message.get("content") or ""


def _is_valid_verdict(text: str) -> bool:
    """
    Returns True only when `text` contains the whole word VALID and does not
    contain the whole word INVALID.
    """
    return bool(_VALID_RE.search(text)) and not _INVALID_RE.search(text)


# ------------------------------
# State Schema Definition
# ------------------------------
class AgentState(TypedDict):
    """
    State carried between workflow nodes.
    """
    # Accumulated through the add_messages reducer.
    messages: Annotated[Sequence[AIMessage | HumanMessage | ToolMessage], add_messages]
    # No reducer is declared: whatever dict a node returns here replaces the
    # previous one, so nodes must copy forward every key they want to keep.
    context: Dict[str, Any]
    metadata: Dict[str, Any]


# ------------------------------
# Configuration
# ------------------------------
class ResearchConfig:
    """
    Static configuration: API key, Chroma location, chunking parameters,
    the sidebar document map and the prompt templates.
    """
    DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
    CHROMA_PATH = "chroma_db"
    CHUNK_SIZE = 512
    CHUNK_OVERLAP = 64
    MAX_CONCURRENT_REQUESTS = 5
    EMBEDDING_DIMENSIONS = 1536
    # Full document title -> short label shown in the sidebar.
    DOCUMENT_MAP = {
        "Research Report: Results of a New AI Model Improving Image Recognition Accuracy to 98%":
            "CV-Transformer Hybrid Architecture",
        "Academic Paper Summary: Why Transformers Became the Mainstream Architecture in Natural Language Processing":
            "Transformer Architecture Analysis",
        "Latest Trends in Machine Learning Methods Using Quantum Computing":
            "Quantum ML Frontiers"
    }
    # Base analysis template remains unchanged.
    ANALYSIS_TEMPLATE = (
        "Analyze these technical documents with scientific rigor:\n{context}\n\n"
        "Respond with:\n"
        "1. Key Technical Contributions (bullet points)\n"
        "2. Novel Methodologies\n"
        "3. Empirical Results (with metrics)\n"
        "4. Potential Applications\n"
        "5. Limitations & Future Directions\n\n"
        "Format: Markdown with LaTeX mathematical notation where applicable"
    )
    # Domain-specific prompt additions for customizable analysis
    DOMAIN_PROMPTS = {
        "Biomedical Research": "Consider clinical trial design, patient outcomes, and recent biomedical breakthroughs.",
        "Legal Research": "Emphasize legal precedents, case law, and nuanced statutory interpretations.",
        "Environmental and Energy Studies": "Highlight renewable energy technologies, efficiency metrics, and policy implications.",
        "Competitive Programming and Theoretical Computer Science": "Focus on algorithmic complexity, innovative proofs, and computational techniques.",
        "Social Sciences": "Concentrate on economic trends, sociological data, and correlations impacting public policy."
    }


# Without an API key no request can be authenticated, so stop the app early.
if not ResearchConfig.DEEPSEEK_API_KEY:
    st.error(
        """**Research Portal Configuration Required**
1. Obtain DeepSeek API key: [platform.deepseek.com](https://platform.deepseek.com/)
2. Configure secret: `DEEPSEEK_API_KEY` in Space settings
3. Rebuild deployment"""
    )
    st.stop()


# ------------------------------
# Quantum Document Processing
# ------------------------------
class QuantumDocumentManager:
    """
    Manages creation of Chroma collections from raw document texts.
    """

    def __init__(self) -> None:
        try:
            self.client = chromadb.PersistentClient(path=ResearchConfig.CHROMA_PATH)
            logger.info("Initialized PersistentClient for Chroma.")
        except Exception:
            logger.exception("Error initializing PersistentClient; falling back to in-memory client.")
            self.client = chromadb.Client()  # Fallback to in-memory client
        self.embeddings = OpenAIEmbeddings(
            model="text-embedding-3-large",
            dimensions=ResearchConfig.EMBEDDING_DIMENSIONS
        )

    def create_collection(self, documents: List[str], collection_name: str) -> Chroma:
        """
        Splits documents into chunks and stores them as a Chroma collection.

        Args:
            documents: Raw document texts.
            collection_name: Name of the target Chroma collection.

        Returns:
            The Chroma vector store holding the chunks.

        Raises:
            Exception: Whatever the splitter raises is logged and re-raised.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=ResearchConfig.CHUNK_SIZE,
            chunk_overlap=ResearchConfig.CHUNK_OVERLAP,
            separators=["\n\n", "\n", "|||"]
        )
        try:
            docs = splitter.create_documents(documents)
        except Exception:
            logger.exception("Error during document splitting.")
            raise

        # IDs are derived from content only, so identical chunks collapse to
        # one entry; keeping the first occurrence avoids sending duplicate IDs
        # in a single batch.
        unique_docs: Dict[str, Any] = {}
        for doc in docs:
            unique_docs.setdefault(self._document_id(doc.page_content), doc)
        logger.info(f"Created {len(unique_docs)} document chunks for collection '{collection_name}'.")

        return Chroma.from_documents(
            documents=list(unique_docs.values()),
            embedding=self.embeddings,
            client=self.client,
            collection_name=collection_name,
            ids=list(unique_docs)
        )

    def _document_id(self, content: str) -> str:
        """
        Generates a deterministic document ID from the SHA256 of the content.

        The ID deliberately contains no timestamp: this module is re-executed
        on every Streamlit rerun, and time-based IDs would store the same
        chunk again under a new ID each time.
        NOTE(review): entries written earlier with timestamped IDs are not
        removed by this change - clear the persisted collection once if needed.
        """
        return hashlib.sha256(content.encode()).hexdigest()[:16]


# Initialize document collections
qdm = QuantumDocumentManager()
research_docs = qdm.create_collection([
    "Research Report: Results of a New AI Model Improving Image Recognition Accuracy to 98%",
    "Academic Paper Summary: Why Transformers Became the Mainstream Architecture in Natural Language Processing",
    "Latest Trends in Machine Learning Methods Using Quantum Computing"
], "research")

development_docs = qdm.create_collection([
    "Project A: UI Design Completed, API Integration in Progress",
    "Project B: Testing New Feature X, Bug Fixes Needed",
    "Product Y: In the Performance Optimization Stage Before Release"
], "development")


# ------------------------------
# Advanced Retrieval System
# ------------------------------
class ResearchRetriever:
    """
    Provides retrieval methods for different domains.
    """

    def __init__(self) -> None:
        try:
            self.research_retriever = research_docs.as_retriever(
                search_type="mmr",
                search_kwargs={'k': 4, 'fetch_k': 20, 'lambda_mult': 0.85}
            )
            self.development_retriever = development_docs.as_retriever(
                search_type="similarity",
                search_kwargs={'k': 3}
            )
            logger.info("Initialized retrievers for research and development domains.")
        except Exception:
            logger.exception("Error initializing retrievers.")
            raise

    def retrieve(self, query: str, domain: str) -> List[Any]:
        """
        Retrieves documents based on the query and domain.

        For now, domain differentiation is minimal; however, you can extend this
        method to use domain-specific collections.

        Args:
            query: Text to search for.
            domain: Selected research domain; currently used only in the
                error log message.

        Returns:
            The retrieved documents, or an empty list if retrieval fails.
        """
        try:
            # For demonstration, we use the "research" collection for all domains.
            return self.research_retriever.invoke(query)
        except Exception:
            logger.exception(f"Retrieval error for domain '{domain}'.")
            return []


retriever = ResearchRetriever()


# ------------------------------
# Cognitive Processing Unit
# ------------------------------
class CognitiveProcessor:
    """
    Executes API requests to the DeepSeek backend using triple redundancy
    and consolidates results via a consensus mechanism.
    """

    def __init__(self) -> None:
        self.executor = ThreadPoolExecutor(max_workers=ResearchConfig.MAX_CONCURRENT_REQUESTS)
        # Sent with every request in the X-Research-Session header.
        self.session_id = hashlib.sha256(datetime.now().isoformat().encode()).hexdigest()[:12]

    def process_query(self, prompt: str) -> Dict:
        """
        Processes a query by sending multiple API requests in parallel.

        Args:
            prompt: Prompt text sent identically in each request.

        Returns:
            The response chosen by `_consensus_check`, or a dict with an
            "error" key when no request succeeded.
        """
        # Triple redundancy for reliability
        futures = [
            self.executor.submit(self._execute_api_request, prompt)
            for _ in range(3)
        ]
        results = []
        for future in as_completed(futures):
            try:
                results.append(future.result())
            except Exception as e:
                logger.exception("Error during API request execution.")
                st.error(f"Processing Error: {str(e)}")
        return self._consensus_check(results)

    def _execute_api_request(self, prompt: str) -> Dict:
        """
        Executes a single API request to the DeepSeek endpoint.

        Returns:
            The decoded JSON response, or {"error": <message>} when the
            request raises a `requests` exception.
        """
        headers = {
            "Authorization": f"Bearer {ResearchConfig.DEEPSEEK_API_KEY}",
            "Content-Type": "application/json",
            "X-Research-Session": self.session_id
        }
        payload = {
            "model": "deepseek-chat",
            "messages": [{
                "role": "user",
                "content": f"Respond as Senior AI Researcher:\n{prompt}"
            }],
            "temperature": 0.7,
            "max_tokens": 1500,
            "top_p": 0.9
        }
        try:
            response = requests.post(
                "https://api.deepseek.com/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=45
            )
            response.raise_for_status()
            logger.info("DeepSeek API request successful.")
            return response.json()
        except requests.exceptions.RequestException as e:
            logger.exception("DeepSeek API request failed.")
            return {"error": str(e)}

    def _consensus_check(self, results: List[Dict]) -> Dict:
        """
        Consolidates multiple API responses, selecting the one with the most content.

        Returns:
            The error-free response with the longest message text, or
            {"error": "All API requests failed"} when none is error-free.
        """
        valid_results = [r for r in results if "error" not in r]
        if not valid_results:
            logger.error("All API requests failed.")
            return {"error": "All API requests failed"}
        return max(valid_results, key=lambda r: len(_extract_content(r)))


# ------------------------------
# Research Workflow Engine
# ------------------------------
class ResearchWorkflow:
    """
    Defines the multi-step research workflow using a state graph.
    """

    def __init__(self) -> None:
        self.processor = CognitiveProcessor()
        self.workflow = StateGraph(AgentState)
        self._build_workflow()
        self.app = self.workflow.compile()

    def _build_workflow(self) -> None:
        """
        Registers the nodes and edges of the graph.
        """
        # Define nodes
        self.workflow.add_node("ingest", self.ingest_query)
        self.workflow.add_node("retrieve", self.retrieve_documents)
        self.workflow.add_node("analyze", self.analyze_content)
        self.workflow.add_node("validate", self.validate_output)
        self.workflow.add_node("refine", self.refine_results)
        # Set entry point and edges
        self.workflow.set_entry_point("ingest")
        self.workflow.add_edge("ingest", "retrieve")
        self.workflow.add_edge("retrieve", "analyze")
        # "error" ends the run: an error state carries no refinement counter,
        # so sending it to "refine" would loop until the recursion limit.
        self.workflow.add_conditional_edges(
            "analyze",
            self._quality_check,
            {"valid": "validate", "invalid": "refine", "error": END}
        )
        self.workflow.add_edge("validate", END)
        # NOTE(review): "refine" feeds back into "retrieve", so the next
        # "analyze" pass produces a fresh analysis rather than validating the
        # refined text - confirm this is the intended design.
        self.workflow.add_edge("refine", "retrieve")

    def ingest_query(self, state: AgentState) -> Dict:
        """
        Ingests the research query and initializes the refinement counter,
        refinement history, and domain.

        The domain is read from the incoming context (where the interface
        places it) and defaults to "Biomedical Research".
        """
        try:
            query = state["messages"][-1].content
            incoming_context = state.get("context") or {}
            domain = incoming_context.get("domain") or DEFAULT_DOMAIN
            # Initialize context with raw query, selected domain, refinement counter,
            # and empty refinement history
            new_context = {
                "raw_query": query,
                "domain": domain,
                "refine_count": 0,
                "refinement_history": []
            }
            logger.info(f"Query ingested. Domain: {domain}")
            return {
                "messages": [AIMessage(content="Query ingested successfully")],
                "context": new_context,
                "metadata": {"timestamp": datetime.now().isoformat()}
            }
        except Exception as e:
            logger.exception("Error during query ingestion.")
            return self._error_state(f"Ingestion Error: {str(e)}")

    def retrieve_documents(self, state: AgentState) -> Dict:
        """
        Retrieves research documents based on the query.

        The previous context is copied forward (including `raw_query`) so that
        a later pass through this node after a refinement still finds the query.
        """
        try:
            context = state["context"]
            query = context["raw_query"]
            domain = context.get("domain", DEFAULT_DOMAIN)
            docs = retriever.retrieve(query, domain)
            logger.info(f"Retrieved {len(docs)} documents for query.")
            return {
                "messages": [AIMessage(content=f"Retrieved {len(docs)} documents")],
                "context": {
                    **context,
                    "documents": docs,
                    "retrieval_time": time.time(),
                    "refine_count": context.get("refine_count", 0),
                    "refinement_history": context.get("refinement_history", []),
                    "domain": domain
                }
            }
        except Exception as e:
            logger.exception("Error during document retrieval.")
            return self._error_state(f"Retrieval Error: {str(e)}")

    def analyze_content(self, state: AgentState) -> Dict:
        """
        Analyzes the retrieved documents using the DeepSeek API.
        Augments the prompt with domain-specific instructions.
        """
        try:
            context = state["context"]
            docs = context.get("documents", [])
            docs_text = "\n\n".join(d.page_content for d in docs)
            domain = context.get("domain", DEFAULT_DOMAIN)
            domain_prompt = ResearchConfig.DOMAIN_PROMPTS.get(domain, "")
            # Combine domain-specific instructions with the base analysis template
            full_prompt = f"{domain_prompt}\n\n" + ResearchConfig.ANALYSIS_TEMPLATE.format(context=docs_text)
            response = self.processor.process_query(full_prompt)
            if "error" in response:
                logger.error("DeepSeek response error during analysis.")
                return self._error_state(response["error"])
            logger.info("Content analysis completed.")
            return {
                "messages": [AIMessage(content=_extract_content(response))],
                "context": {
                    **context,
                    "analysis": response,
                    "refine_count": context.get("refine_count", 0),
                    "refinement_history": context.get("refinement_history", []),
                    "domain": domain
                }
            }
        except Exception as e:
            logger.exception("Error during content analysis.")
            return self._error_state(f"Analysis Error: {str(e)}")

    def validate_output(self, state: AgentState) -> Dict:
        """
        Validates the technical analysis report.

        Returns a message made of the analysis followed by a "Validation:"
        line holding the validator's reply.
        """
        try:
            analysis = state["messages"][-1].content
            validation_prompt = (
                f"Validate research analysis:\n{analysis}\n\n"
                "Check for:\n1. Technical accuracy\n2. Citation support\n3. Logical consistency\n4. Methodological soundness\n\n"
                "Respond with 'VALID' or 'INVALID'"
            )
            response = self.processor.process_query(validation_prompt)
            logger.info("Output validation completed.")
            return {
                "messages": [AIMessage(content=analysis + f"\n\nValidation: {_extract_content(response)}")]
            }
        except Exception as e:
            logger.exception("Error during output validation.")
            return self._error_state(f"Validation Error: {str(e)}")

    def refine_results(self, state: AgentState) -> Dict:
        """
        Refines the analysis report if validation fails.

        Implements an innovative meta-refinement mechanism inspired by LADDER.
        Tracks refinement history, uses a dynamic difficulty gradient, and if the
        refinement count exceeds a threshold, summarizes the history into a final output.
        """
        try:
            context = state["context"]
            refine_count = context.get("refine_count", 0) + 1
            current_analysis = state["messages"][-1].content
            # Build new objects instead of mutating the incoming state in place.
            refinement_history = [*context.get("refinement_history", []), current_analysis]
            new_context = {
                **context,
                "refine_count": refine_count,
                "refinement_history": refinement_history
            }
            # Compute a "difficulty level" (from 3 to 0) based on refinement count
            difficulty_level = max(0, MAX_REFINEMENTS - refine_count)
            logger.info(f"Refinement iteration: {refine_count}, Difficulty level: {difficulty_level}")

            # If refinement count exceeds threshold, perform meta-refinement by summarizing the history
            if refine_count >= MAX_REFINEMENTS:
                meta_prompt = (
                    "You are given the following series of refinement outputs:\n" +
                    "\n---\n".join(refinement_history) +
                    "\n\nSummarize the above into a final, concise, and high-quality technical analysis report. Do not introduce new ideas; just synthesize the improvements."
                )
                meta_response = self.processor.process_query(meta_prompt)
                logger.info("Meta-refinement completed.")
                return {
                    "messages": [AIMessage(content=_extract_content(meta_response))],
                    "context": new_context
                }

            # Standard refinement with a dynamic difficulty prompt
            refinement_prompt = (
                f"Refine this analysis (current difficulty level: {difficulty_level}):\n{current_analysis}\n\n"
                "Improve the following aspects:\n1. Technical precision\n2. Empirical grounding\n3. Theoretical coherence\n\n"
                "Use a structured difficulty gradient approach (similar to LADDER) to produce a simpler yet more accurate variant."
            )
            response = self.processor.process_query(refinement_prompt)
            logger.info("Refinement completed.")
            return {
                "messages": [AIMessage(content=_extract_content(response))],
                "context": new_context
            }
        except Exception as e:
            logger.exception("Error during refinement.")
            return self._error_state(f"Refinement Error: {str(e)}")

    def _quality_check(self, state: AgentState) -> str:
        """
        Checks whether the analysis report is valid.

        Returns:
            "error" when the context carries the error flag, "valid" when the
            refinement limit is reached or the latest message contains the
            whole word VALID (and not INVALID), otherwise "invalid".
        """
        context = state["context"]
        if context.get("error"):
            logger.error("Workflow is in an error state; ending the run.")
            return "error"
        if context.get("refine_count", 0) >= MAX_REFINEMENTS:
            logger.warning("Refinement limit reached. Forcing valid outcome to prevent infinite recursion.")
            return "valid"
        content = state["messages"][-1].content
        quality = "valid" if _is_valid_verdict(content) else "invalid"
        logger.info(f"Quality check returned: {quality}")
        return quality

    def _error_state(self, message: str) -> Dict:
        """
        Returns a standardized error state.
        """
        logger.error(message)
        return {
            "messages": [AIMessage(content=f"❌ {message}")],
            "context": {"error": True},
            "metadata": {"status": "error"}
        }


# ------------------------------
# Research Interface (Streamlit UI)
# ------------------------------
class ResearchInterface:
    """
    Provides the Streamlit-based interface for executing the research workflow.
    """

    def __init__(self) -> None:
        self.workflow = ResearchWorkflow()
        self._initialize_interface()

    def _initialize_interface(self) -> None:
        """
        Configures the page, then builds the sidebar and the main area.
        """
        st.set_page_config(
            page_title="NeuroResearch AI",
            layout="wide",
            initial_sidebar_state="expanded"
        )
        self._inject_styles()
        self._build_sidebar()
        self._build_main_interface()

    def _inject_styles(self) -> None:
        """
        Injects custom markup into the page.
        """
        # NOTE(review): the payload is blank in this source; presumably a
        # <style> block was stripped at some point - confirm.
        st.markdown(
            """
            """,
            unsafe_allow_html=True
        )

    def _build_sidebar(self) -> None:
        """
        Lists the configured documents and two static metrics in the sidebar.
        """
        with st.sidebar:
            st.title("🔍 Research Database")
            st.subheader("Technical Papers")
            for title, short in ResearchConfig.DOCUMENT_MAP.items():
                with st.expander(short):
                    st.markdown(f"```\n{title}\n```")
            st.subheader("Analysis Metrics")
            st.metric("Vector Collections", 2)
            st.metric("Embedding Dimensions", ResearchConfig.EMBEDDING_DIMENSIONS)

    def _build_main_interface(self) -> None:
        """
        Renders the query box, the domain selector and the run button.
        """
        st.title("🧠 NeuroResearch AI")
        query = st.text_area(
            "Research Query:",
            height=200,
            placeholder="Enter technical research question..."
        )
        # Domain selection for research use cases
        domain = st.selectbox(
            "Select Research Domain:",
            options=[
                "Biomedical Research",
                "Legal Research",
                "Environmental and Energy Studies",
                "Competitive Programming and Theoretical Computer Science",
                "Social Sciences"
            ],
            index=0
        )
        if st.button("Execute Analysis", type="primary"):
            self._execute_analysis(query, domain)

    def _execute_analysis(self, query: str, domain: str) -> None:
        """
        Streams the workflow for `query` in `domain` and renders each event.

        An empty or whitespace-only query is rejected before the workflow starts.
        """
        if not query or not query.strip():
            st.warning("Please enter a research query before executing the analysis.")
            return
        try:
            failed = False
            with st.spinner("Initializing Quantum Analysis..."):
                # Pass domain into the context by adding it to the initial state
                results = self.workflow.app.stream({
                    "messages": [HumanMessage(content=query)],
                    "context": {"domain": domain},
                    "metadata": {}
                }, {"recursion_limit": 100})
                for event in results:
                    self._render_event(event)
                    # A node signals failure by returning context {"error": True}.
                    if any(
                        isinstance(update, dict) and (update.get("context") or {}).get("error")
                        for update in event.values()
                    ):
                        failed = True
            if failed:
                st.error("Analysis stopped because a workflow step failed. See the application logs for details.")
            else:
                st.success("✅ Analysis Completed Successfully")
        except Exception as e:
            logger.exception("Workflow execution failed.")
            st.error(
                f"""**Analysis Failed**
{str(e)}

Potential issues:
- Complex query structure
- Document correlation failure
- Temporal processing constraints"""
            )

    def _render_event(self, event: Dict) -> None:
        """
        Renders one streamed workflow event, keyed by the node that produced it.

        Events from nodes other than ingest, retrieve, analyze and validate
        are not rendered.
        """
        if 'ingest' in event:
            with st.container():
                st.success("✅ Query Ingested")
        elif 'retrieve' in event:
            with st.container():
                docs = event['retrieve']['context'].get('documents', [])
                st.info(f"📚 Retrieved {len(docs)} documents")
                with st.expander("View Retrieved Documents", expanded=False):
                    for idx, doc in enumerate(docs, start=1):
                        st.markdown(f"**Document {idx}**")
                        st.code(doc.page_content, language='text')
        elif 'analyze' in event:
            with st.container():
                content = event['analyze']['messages'][0].content
                with st.expander("Technical Analysis Report", expanded=True):
                    st.markdown(content)
        elif 'validate' in event:
            with st.container():
                content = event['validate']['messages'][0].content
                # Split on the LAST marker: the report may itself contain the
                # text "Validation:". Without any marker (e.g. an error
                # message) the whole content is treated as the verdict.
                report, _, verdict = content.rpartition("Validation:")
                if _is_valid_verdict(verdict):
                    st.success("✅ Validation Passed")
                    with st.expander("View Validated Analysis", expanded=True):
                        st.markdown(report)
                else:
                    st.warning("⚠️ Validation Issues Detected")
                    with st.expander("View Validation Details", expanded=True):
                        st.markdown(content)


if __name__ == "__main__":
    ResearchInterface()