# Source: Hugging Face Space "mgbam/NeuroResearch_AI" (status at capture: Sleeping)
# File size: 22,048 bytes

# ------------------------------
# UniversalResearch AI System with Refinement Counter and Increased Recursion Limit
# ------------------------------
import logging
import os
import re
import hashlib
import json
import time
import sys
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Any, Optional, Sequence

import chromadb
import requests
import streamlit as st

# LangChain and LangGraph imports
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langgraph.graph import END, StateGraph
from langgraph.prebuilt import ToolNode
from langgraph.graph.message import add_messages
from typing_extensions import TypedDict, Annotated
from langchain.tools.retriever import create_retriever_tool

# Raise the Python interpreter's call-stack depth limit at import time.
# NOTE(review): this is separate from the graph step budget, which is passed
# as `recursion_limit` when the workflow is streamed in `_execute_analysis`.
sys.setrecursionlimit(10000)

# ------------------------------
# Logging Configuration
# ------------------------------
# Configure the root logger: INFO and above, "<time> [<LEVEL>] <message>".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
# Module-level logger used by the classes defined below.
logger = logging.getLogger(__name__)

# ------------------------------
# State Schema Definition
# ------------------------------
class AgentState(TypedDict):
    """
    State dictionary handed to, and updated by, every node of the workflow graph.
    """
    # Conversation history; annotated with the `add_messages` reducer.
    messages: Annotated[Sequence[AIMessage | HumanMessage | ToolMessage], add_messages]
    # Working data written by the nodes (keys used in this file include
    # "raw_query", "documents", "analysis", "refine_count" and "error").
    # No reducer is annotated for this field.
    # NOTE(review): presumably a node's returned "context" replaces the stored
    # value instead of being merged - confirm against LangGraph's semantics.
    context: Dict[str, Any]
    # Run metadata (keys used in this file: "timestamp", "status"); also
    # declared without a reducer.
    metadata: Dict[str, Any]

# ------------------------------
# Configuration
# ------------------------------
class ResearchConfig:
    """
    Domain-neutral settings for the UniversalResearch AI System.

    Every value is a plain class attribute and is read elsewhere in this file
    as ``ResearchConfig.<NAME>``.
    """
    # DeepSeek credential, read from the environment when the class body runs
    # (None when the variable is not set).
    DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")

    # Directory given to the persistent Chroma client.
    CHROMA_PATH = "chroma_db"

    # Text-splitter settings: chunk length and overlap between chunks.
    CHUNK_SIZE, CHUNK_OVERLAP = 512, 64

    # Size of the thread pool used for parallel API requests.
    MAX_CONCURRENT_REQUESTS = 5

    # Dimensionality requested from the embedding model.
    EMBEDDING_DIMENSIONS = 1536

    # Optional catalogue of featured topics: document title -> short label.
    DOCUMENT_MAP = {
        "Sample Research Document 1": "Topic A Overview",
        "Sample Research Document 2": "Topic B Analysis",
        "Sample Research Document 3": "Topic C Innovations",
    }

    # Prompt skeleton for the analysis step; "{context}" is filled in with the
    # retrieved document text via str.format().
    ANALYSIS_TEMPLATE = "\n".join([
        "Analyze the following research documents with scientific rigor:",
        "{context}",
        "",
        "Provide your analysis with the following structure:",
        "1. Key Contributions (bullet points)",
        "2. Novel Methodologies",
        "3. Empirical Results (with metrics)",
        "4. Potential Applications",
        "5. Limitations & Future Directions",
        "",
        "Format your response in Markdown with LaTeX mathematical notation where applicable.",
    ])

# Refuse to continue without a DeepSeek credential: show setup instructions
# in the Streamlit page and halt the script run.
if not ResearchConfig.DEEPSEEK_API_KEY:
    st.error(
        """**Research Portal Configuration Required**  
1. Obtain your DeepSeek API key from [platform.deepseek.com](https://platform.deepseek.com/)  
2. Set the secret: `DEEPSEEK_API_KEY` in your deployment settings  
3. Rebuild your deployment."""
    )
    # Nothing below this line runs when the key is missing.
    st.stop()

# ------------------------------
# Universal Document Processing
# ------------------------------
class UniversalDocumentManager:
    """
    Manages the creation of document collections for any research domain.

    Documents are split into chunks and stored in a Chroma collection using
    OpenAI embeddings. Chunk IDs are derived from chunk content only, so
    re-running the script against the same persistent store writes to the same
    IDs instead of accumulating a new copy of every chunk on each run.
    """
    def __init__(self) -> None:
        """
        Open the Chroma client and configure the embedding model.

        Falls back to an in-memory client when the persistent client cannot be
        created; the failure is logged rather than raised.
        """
        try:
            self.client = chromadb.PersistentClient(path=ResearchConfig.CHROMA_PATH)
            logger.info("Initialized PersistentClient for Chroma.")
        except Exception as e:
            logger.error(f"Error initializing PersistentClient: {e}")
            self.client = chromadb.Client()  # Fallback to in-memory client

        self.embeddings = OpenAIEmbeddings(
            model="text-embedding-3-large",
            dimensions=ResearchConfig.EMBEDDING_DIMENSIONS
        )

    def create_collection(self, documents: List[str], collection_name: str) -> Chroma:
        """
        Split *documents* into chunks and store them in a Chroma collection.

        Args:
            documents: Raw document texts.
            collection_name: Name of the Chroma collection to write to.

        Returns:
            The Chroma vector store wrapping the collection.

        Raises:
            Exception: Whatever the text splitter raises, after logging it.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=ResearchConfig.CHUNK_SIZE,
            chunk_overlap=ResearchConfig.CHUNK_OVERLAP,
            separators=["\n\n", "\n", "|||"]
        )
        try:
            docs = splitter.create_documents(documents)
            logger.info(f"Created {len(docs)} document chunks for collection '{collection_name}'.")
        except Exception as e:
            logger.error(f"Error splitting documents: {e}")
            raise

        # Identical chunks hash to the same ID; keep only the first occurrence
        # so a single batch never carries duplicate IDs.
        unique_docs: Dict[str, Any] = {}
        for doc in docs:
            unique_docs.setdefault(self._document_id(doc.page_content), doc)

        return Chroma.from_documents(
            documents=list(unique_docs.values()),
            embedding=self.embeddings,
            client=self.client,
            collection_name=collection_name,
            ids=list(unique_docs.keys())
        )

    def _document_id(self, content: str) -> str:
        """
        Return a deterministic ID for a chunk: the first 16 hex characters of
        the SHA-256 digest of its content.

        No timestamp is mixed in, so the same text always maps to the same ID.
        """
        return hashlib.sha256(content.encode()).hexdigest()[:16]

# Initialize document collections for multiple research domains.
# These statements run at import time: the manager is constructed and each
# create_collection() call splits the listed strings and stores them in the
# named collection.
udm = UniversalDocumentManager()
# Example collections – these can be updated with any research domain documents.
# "research" collection: queried by ResearchRetriever.research_retriever.
research_docs = udm.create_collection([
    "Research Report: Novel AI Techniques in Renewable Energy",
    "Academic Paper: Advances in Quantum Computing for Data Analysis",
    "Survey: Emerging Trends in Biomedical Research"
], "research")

# "development" collection: queried by ResearchRetriever.development_retriever.
development_docs = udm.create_collection([
    "Project Update: New Algorithms in Software Engineering",
    "Development Report: Innovations in User Interface Design",
    "Case Study: Agile Methodologies in Large-Scale Software Projects"
], "development")

# ------------------------------
# Advanced Retrieval System
# ------------------------------
class ResearchRetriever:
    """
    Query front-end over the module-level ``research_docs`` and
    ``development_docs`` collections.

    Each domain gets its own retriever: MMR search for research, plain
    similarity search for development.
    """
    def __init__(self) -> None:
        """
        Build one retriever per domain; any failure is logged and re-raised.
        """
        mmr_settings = {'k': 4, 'fetch_k': 20, 'lambda_mult': 0.85}
        similarity_settings = {'k': 3}
        try:
            self.research_retriever = research_docs.as_retriever(
                search_type="mmr",
                search_kwargs=mmr_settings,
            )
            self.development_retriever = development_docs.as_retriever(
                search_type="similarity",
                search_kwargs=similarity_settings,
            )
            logger.info("Initialized retrievers for research and development domains.")
        except Exception as e:
            logger.error(f"Error initializing retrievers: {e}")
            raise e

    def retrieve(self, query: str, domain: str) -> List[Any]:
        """
        Run *query* against the retriever for *domain*.

        Unknown domains are logged and served by the research retriever.
        Any exception during retrieval is logged and yields an empty list.
        """
        try:
            if domain == "development":
                chosen = self.development_retriever
            else:
                if domain != "research":
                    logger.warning(f"Domain '{domain}' not recognized. Defaulting to research.")
                chosen = self.research_retriever
            return chosen.invoke(query)
        except Exception as e:
            logger.error(f"Retrieval error for domain '{domain}': {e}")
            return []

# Module-level retriever instance, built at import time and used by
# ResearchWorkflow.retrieve_documents.
retriever = ResearchRetriever()

# ------------------------------
# Cognitive Processing Unit
# ------------------------------
class CognitiveProcessor:
    """
    Executes API requests to the DeepSeek backend using redundant parallel requests.

    Each query is sent three times concurrently; the response whose first
    choice carries the longest message content is returned.
    """
    def __init__(self) -> None:
        """Create the worker pool and a short per-instance session identifier."""
        self.executor = ThreadPoolExecutor(max_workers=ResearchConfig.MAX_CONCURRENT_REQUESTS)
        self.session_id = hashlib.sha256(datetime.now().isoformat().encode()).hexdigest()[:12]

    def process_query(self, prompt: str) -> Dict:
        """
        Send *prompt* to the API three times in parallel and consolidate.

        Returns:
            The selected response dict, or ``{"error": ...}`` when no request
            produced a usable response.
        """
        # Triple redundancy for improved reliability
        futures = [
            self.executor.submit(self._execute_api_request, prompt)
            for _ in range(3)
        ]

        results = []
        for future in as_completed(futures):
            try:
                results.append(future.result())
            except Exception as e:
                # A failed worker must not abort the others; report and move on.
                logger.error(f"Error in API request: {e}")
                st.error(f"Processing Error: {str(e)}")

        return self._consensus_check(results)

    def _execute_api_request(self, prompt: str) -> Dict:
        """
        Execute a single chat-completion request against the DeepSeek endpoint.

        Returns:
            The decoded JSON body on success, or ``{"error": "<message>"}``
            when the request fails or the body is not valid JSON.
        """
        headers = {
            "Authorization": f"Bearer {ResearchConfig.DEEPSEEK_API_KEY}",
            "Content-Type": "application/json",
            "X-Research-Session": self.session_id
        }
        payload = {
            "model": "deepseek-chat",
            "messages": [{
                "role": "user",
                "content": f"Respond as a Senior Researcher:\n{prompt}"
            }],
            "temperature": 0.7,
            "max_tokens": 1500,
            "top_p": 0.9
        }
        try:
            response = requests.post(
                "https://api.deepseek.com/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=45
            )
            response.raise_for_status()
            logger.info("DeepSeek API request successful.")
            return response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that cannot be decoded as JSON.
            logger.error(f"DeepSeek API request failed: {e}")
            return {"error": str(e)}

    @staticmethod
    def _content_length(response: Dict) -> int:
        """
        Return the length of the first choice's message content, or 0 when
        the response has no choices, no message, or non-text content.
        """
        choices = response.get("choices") or [{}]
        first = choices[0] if isinstance(choices[0], dict) else {}
        message = first.get("message") or {}
        content = message.get("content")
        return len(content) if isinstance(content, str) else 0

    def _consensus_check(self, results: List[Dict]) -> Dict:
        """
        Consolidate multiple API responses by selecting the one with the most content.

        Responses carrying an "error" key (and non-dict results) are ignored.
        Malformed responses, such as an empty ``choices`` list or ``None``
        content, count as zero-length instead of raising.

        Returns:
            The chosen response, or ``{"error": "All API requests failed"}``.
        """
        valid_results = [
            r for r in results
            if isinstance(r, dict) and "error" not in r
        ]
        if not valid_results:
            logger.error("All API requests failed.")
            return {"error": "All API requests failed"}
        return max(valid_results, key=self._content_length)

# ------------------------------
# Research Workflow Engine
# ------------------------------
class ResearchWorkflow:
    """
    Defines a multi-step research workflow using a state graph.

    Flow: ingest -> retrieve -> analyze -> (quality gate) -> validate -> END,
    with a refine -> retrieve loop when the gate reports "invalid".

    Every node returns the *accumulated* context (existing keys plus its own
    additions). Keys such as "raw_query" and "refine_count" must survive the
    refine loop; if they were dropped, retrieval would fail on the second pass
    and the refinement counter would restart on every iteration.
    """
    # Number of refinement passes after which the quality gate is forced open.
    MAX_REFINEMENTS = 3

    def __init__(self) -> None:
        """Create the processor, assemble the graph and compile it."""
        self.processor = CognitiveProcessor()
        self.workflow = StateGraph(AgentState)
        self._build_workflow()
        self.app = self.workflow.compile()

    def _build_workflow(self) -> None:
        """Register the nodes and wire the transitions between them."""
        # Define workflow nodes
        self.workflow.add_node("ingest", self.ingest_query)
        self.workflow.add_node("retrieve", self.retrieve_documents)
        self.workflow.add_node("analyze", self.analyze_content)
        self.workflow.add_node("validate", self.validate_output)
        self.workflow.add_node("refine", self.refine_results)
        # Set entry point and define transitions
        self.workflow.set_entry_point("ingest")
        self.workflow.add_edge("ingest", "retrieve")
        self.workflow.add_edge("retrieve", "analyze")
        self.workflow.add_conditional_edges(
            "analyze",
            self._quality_check,
            {"valid": "validate", "invalid": "refine"}
        )
        self.workflow.add_edge("validate", END)
        self.workflow.add_edge("refine", "retrieve")

    @staticmethod
    def _response_text(response: Dict) -> str:
        """
        Return the first choice's message content from an API response, or an
        empty string when choices, message or content are missing or empty.
        """
        choices = response.get("choices") or [{}]
        first = choices[0] if isinstance(choices[0], dict) else {}
        message = first.get("message") or {}
        content = message.get("content")
        return content if isinstance(content, str) else ""

    @staticmethod
    def _merged_context(state: Dict[str, Any], **updates: Any) -> Dict[str, Any]:
        """
        Return a copy of the state's context with *updates* applied.

        The stale "error" flag from an earlier failed step is cleared, and
        "refine_count" is always present. The input state is not mutated.
        """
        merged = dict(state.get("context") or {})
        merged.pop("error", None)
        merged.update(updates)
        merged.setdefault("refine_count", 0)
        return merged

    def ingest_query(self, state: AgentState) -> Dict:
        """
        Ingests the research query and initializes the refinement counter.

        Reads the content of the last message as the raw query.
        """
        try:
            query = state["messages"][-1].content
            new_context = {"raw_query": query, "refine_count": 0}
            logger.info("Query ingested.")
            return {
                "messages": [AIMessage(content="Query ingested successfully")],
                "context": new_context,
                "metadata": {"timestamp": datetime.now().isoformat()}
            }
        except Exception as e:
            return self._error_state(f"Ingestion Error: {str(e)}", state)

    def retrieve_documents(self, state: AgentState) -> Dict:
        """
        Retrieves research documents for the query stored in the context.

        Adds "documents" and "retrieval_time" to the accumulated context.
        """
        try:
            query = state["context"]["raw_query"]
            docs = retriever.retrieve(query, "research")
            logger.info(f"Retrieved {len(docs)} documents for query.")
            return {
                "messages": [AIMessage(content=f"Retrieved {len(docs)} documents")],
                "context": self._merged_context(
                    state, documents=docs, retrieval_time=time.time()
                )
            }
        except Exception as e:
            return self._error_state(f"Retrieval Error: {str(e)}", state)

    def analyze_content(self, state: AgentState) -> Dict:
        """
        Analyzes the retrieved research documents using the DeepSeek API.

        Adds the raw API response to the context under "analysis" and emits
        the analysis text as a message.
        """
        try:
            docs = state["context"].get("documents", [])
            docs_text = "\n\n".join([d.page_content for d in docs])
            prompt = ResearchConfig.ANALYSIS_TEMPLATE.format(context=docs_text)
            response = self.processor.process_query(prompt)
            if "error" in response:
                return self._error_state(response["error"], state)
            logger.info("Content analysis completed.")
            return {
                "messages": [AIMessage(content=self._response_text(response))],
                "context": self._merged_context(state, analysis=response)
            }
        except Exception as e:
            return self._error_state(f"Analysis Error: {str(e)}", state)

    def validate_output(self, state: AgentState) -> Dict:
        """
        Asks the model to validate the latest analysis and appends its
        verdict to the analysis text as "Validation: <verdict>".
        """
        try:
            analysis = state["messages"][-1].content
            validation_prompt = (
                f"Validate the following research analysis:\n{analysis}\n\n"
                "Check for:\n1. Technical accuracy\n2. Adequate citation support\n3. Logical consistency\n4. Methodological soundness\n\n"
                "Respond with 'VALID' or 'INVALID'."
            )
            response = self.processor.process_query(validation_prompt)
            logger.info("Output validation completed.")
            return {
                "messages": [AIMessage(content=analysis + f"\n\nValidation: {self._response_text(response)}")]
            }
        except Exception as e:
            return self._error_state(f"Validation Error: {str(e)}", state)

    def refine_results(self, state: AgentState) -> Dict:
        """
        Refines the latest analysis and increments the refinement counter.

        The counter is incremented on a copy of the context and is kept even
        when the API call fails, so the loop is always bounded.
        """
        current_count = (state.get("context") or {}).get("refine_count", 0)
        context = self._merged_context(state, refine_count=current_count + 1)
        try:
            logger.info(f"Refinement iteration: {context['refine_count']}")
            refinement_prompt = (
                f"Refine this analysis:\n{state['messages'][-1].content}\n\n"
                "Improve by enhancing technical precision, empirical grounding, and theoretical coherence."
            )
            response = self.processor.process_query(refinement_prompt)
            if "error" in response:
                return self._error_state(response["error"], {"context": context})
            logger.info("Refinement completed.")
            return {
                "messages": [AIMessage(content=self._response_text(response))],
                "context": context
            }
        except Exception as e:
            return self._error_state(f"Refinement Error: {str(e)}", {"context": context})

    def _quality_check(self, state: AgentState) -> str:
        """
        Routing function for the edge leaving "analyze".

        Returns "valid" once the refinement counter reaches MAX_REFINEMENTS;
        otherwise returns "valid" only if the last message contains the
        standalone word VALID ("INVALID" does not count), else "invalid".

        NOTE(review): the analysis prompt never asks the model to emit VALID,
        so this gate will usually report "invalid" until the counter trips -
        confirm whether that is the intended behaviour.
        """
        refine_count = (state.get("context") or {}).get("refine_count", 0)
        if refine_count >= self.MAX_REFINEMENTS:
            logger.warning("Refinement limit reached. Forcing valid outcome to prevent infinite recursion.")
            return "valid"
        content = str(state["messages"][-1].content)
        # Word boundaries keep "INVALID" from matching as "VALID".
        quality = "valid" if re.search(r"\bVALID\b", content) else "invalid"
        logger.info(f"Quality check returned: {quality}")
        return quality

    def _error_state(self, message: str, state: Optional[Dict[str, Any]] = None) -> Dict:
        """
        Returns a standardized error state.

        Args:
            message: Human-readable error description.
            state: Optional current state; when given, its context is carried
                over (with "error" set) so later nodes keep the query and the
                refinement counter.
        """
        logger.error(message)
        context = dict((state or {}).get("context") or {})
        context["error"] = True
        return {
            "messages": [AIMessage(content=f"❌ {message}")],
            "context": context,
            "metadata": {"status": "error"}
        }

# ------------------------------
# Research Interface (Streamlit UI)
# ------------------------------
class ResearchInterface:
    """
    Provides a Streamlit-based interface for executing the UniversalResearch AI workflow.

    Constructing the class builds the workflow and renders the whole page
    (page config, styles, sidebar, main form).
    """
    def __init__(self) -> None:
        """Build the workflow, then render the page."""
        self.workflow = ResearchWorkflow()
        self._initialize_interface()

    def _initialize_interface(self) -> None:
        """Configure the page and render styles, sidebar and main area."""
        st.set_page_config(
            page_title="UniversalResearch AI",
            layout="wide",
            initial_sidebar_state="expanded"
        )
        self._inject_styles()
        self._build_sidebar()
        self._build_main_interface()

    def _inject_styles(self) -> None:
        """Inject the dark-theme CSS used by the page."""
        st.markdown(
            """
            <style>
            :root {
                --primary: #2ecc71;
                --secondary: #3498db;
                --background: #0a0a0a;
                --text: #ecf0f1;
            }
            .stApp {
                background: var(--background);
                color: var(--text);
                font-family: 'Roboto', sans-serif;
            }
            .stTextArea textarea {
                background: #1a1a1a !important;
                color: var(--text) !important;
                border: 2px solid var(--secondary);
                border-radius: 8px;
                padding: 1rem;
            }
            .stButton>button {
                background: linear-gradient(135deg, var(--primary), var(--secondary));
                border: none;
                border-radius: 8px;
                padding: 1rem 2rem;
                transition: all 0.3s;
            }
            .stButton>button:hover {
                transform: translateY(-2px);
                box-shadow: 0 4px 12px rgba(46, 204, 113, 0.3);
            }
            .stExpander {
                background: #1a1a1a;
                border: 1px solid #2a2a2a;
                border-radius: 8px;
                margin: 1rem 0;
            }
            </style>
            """,
            unsafe_allow_html=True
        )

    def _build_sidebar(self) -> None:
        """Render the featured topics and the static metrics in the sidebar."""
        with st.sidebar:
            st.title("🔍 Research Database")
            st.subheader("Featured Research Topics")
            # Display featured research topics from the DOCUMENT_MAP.
            for title, short in ResearchConfig.DOCUMENT_MAP.items():
                with st.expander(short):
                    st.markdown(f"```\n{title}\n```")
            st.subheader("Analysis Metrics")
            st.metric("Vector Collections", 2)
            st.metric("Embedding Dimensions", ResearchConfig.EMBEDDING_DIMENSIONS)

    def _build_main_interface(self) -> None:
        """Render the query box and run the analysis when the button is pressed."""
        st.title("🧠 UniversalResearch AI")
        query = st.text_area(
            "Research Query:",
            height=200,
            placeholder="Enter a research question or topic from any domain..."
        )
        if st.button("Execute Analysis", type="primary"):
            self._execute_analysis(query)

    def _execute_analysis(self, query: str) -> None:
        """
        Stream the workflow for *query* and render each event as it arrives.

        A blank query is rejected up front instead of spending API calls on
        it. Any exception raised by the workflow is logged and shown as an
        error panel.
        """
        if not query or not query.strip():
            st.warning("Please enter a research query before running the analysis.")
            return
        try:
            with st.spinner("Initializing Universal Analysis..."):
                # Invoke the workflow with an increased recursion limit configuration.
                results = self.workflow.app.stream({
                    "messages": [HumanMessage(content=query)],
                    "context": {},
                    "metadata": {}
                }, {"recursion_limit": 100})
                for event in results:
                    self._render_event(event)
                st.success("✅ Analysis Completed Successfully")
        except Exception as e:
            logger.error(f"Workflow execution failed: {e}")
            st.error(
                f"""**Analysis Failed**  
{str(e)}  
Potential issues:
- Complex query structure
- Document correlation failure
- Temporal processing constraints"""
            )

    @staticmethod
    def _validation_passed(content: str) -> bool:
        """
        Decide whether a validate-node message reports a passing verdict.

        Only the text after the last "Validation:" marker is inspected (the
        whole content when the marker is absent). The verdict passes when it
        contains the standalone word VALID and does not contain INVALID; a
        plain substring test would accept "INVALID" as well.
        """
        verdict = content.rpartition("Validation:")[2]
        if re.search(r"\bINVALID\b", verdict):
            return False
        return re.search(r"\bVALID\b", verdict) is not None

    def _render_event(self, event: Dict) -> None:
        """
        Render one streamed workflow event, keyed by the node that produced it.

        Events from nodes other than ingest, retrieve, analyze and validate
        are not rendered.
        """
        if 'ingest' in event:
            with st.container():
                st.success("✅ Query Ingested")
        elif 'retrieve' in event:
            with st.container():
                docs = event['retrieve']['context'].get('documents', [])
                st.info(f"📚 Retrieved {len(docs)} documents")
                with st.expander("View Retrieved Documents", expanded=False):
                    for idx, doc in enumerate(docs, start=1):
                        st.markdown(f"**Document {idx}**")
                        st.code(doc.page_content, language='text')
        elif 'analyze' in event:
            with st.container():
                content = event['analyze']['messages'][0].content
                with st.expander("Research Analysis Report", expanded=True):
                    st.markdown(content)
        elif 'validate' in event:
            with st.container():
                content = event['validate']['messages'][0].content
                if self._validation_passed(content):
                    st.success("✅ Validation Passed")
                    with st.expander("View Validated Analysis", expanded=True):
                        st.markdown(content.split("Validation:")[0])
                else:
                    st.warning("⚠️ Validation Issues Detected")
                    with st.expander("View Validation Details", expanded=True):
                        st.markdown(content)

# Script entry point: constructing the interface builds the workflow and
# renders the Streamlit page.
if __name__ == "__main__":
    ResearchInterface()