Spaces:

mgbam
/

NeuroResearch_AI

Sleeping

App Files Files Community

NeuroResearch_AI / app.py

mgbam

Update app.py

d4c248d verified 4 months ago

raw

history blame

21.2 kB

	import logging
	import os
	import re
	import hashlib
	import json
	import time
	import sys
	from datetime import datetime
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from typing import List, Dict, Any, Optional, Sequence
	import chromadb
	import requests
	import streamlit as st

	# LangChain and LangGraph imports
	from langchain_openai import OpenAIEmbeddings
	from langchain_community.vectorstores import Chroma
	from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langgraph.graph import END, StateGraph
	from langgraph.prebuilt import ToolNode
	from langgraph.graph.message import add_messages
	from typing_extensions import TypedDict, Annotated
	from langchain.tools.retriever import create_retriever_tool

	# Raise the interpreter's recursion ceiling before anything else runs.
	sys.setrecursionlimit(10000)

	# ------------------------------
	# Logging Configuration
	# ------------------------------
	# Root logging: INFO and above, formatted as "<time> [<LEVEL>] <message>".
	logging.basicConfig(format="%(asctime)s [%(levelname)s] %(message)s", level=logging.INFO)

	# Module-level logger used throughout this file.
	logger = logging.getLogger(__name__)

	# ------------------------------
	# State Schema Definition
	# ------------------------------
	class AgentState(TypedDict):
	    """State dictionary passed between the nodes of the research workflow graph."""

	    # Conversation history. `add_messages` is attached as the reducer for
	    # this key via `Annotated`.
	    messages: Annotated[Sequence[AIMessage | HumanMessage | ToolMessage], add_messages]
	    # Free-form working data shared between nodes (no reducer attached).
	    context: Dict[str, Any]
	    # Free-form run metadata (no reducer attached).
	    metadata: Dict[str, Any]

	# ------------------------------
	# Configuration
	# ------------------------------
	class ResearchConfig:
	    """Static, class-level settings shared by the rest of the application."""

	    # API credential read from the environment; None when the variable is unset.
	    DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")

	    # Vector-store location and text-chunking parameters.
	    CHROMA_PATH = "chroma_db"
	    CHUNK_SIZE, CHUNK_OVERLAP = 512, 64

	    # Worker-pool size and embedding vector width.
	    MAX_CONCURRENT_REQUESTS = 5
	    EMBEDDING_DIMENSIONS = 1536

	    # Full document title -> short label.
	    DOCUMENT_MAP = {
	        "Research Report: Results of a New AI Model Improving Image Recognition Accuracy to 98%":
	            "CV-Transformer Hybrid Architecture",
	        "Academic Paper Summary: Why Transformers Became the Mainstream Architecture in Natural Language Processing":
	            "Transformer Architecture Analysis",
	        "Latest Trends in Machine Learning Methods Using Quantum Computing":
	            "Quantum ML Frontiers",
	    }

	    # Prompt template; `{context}` is filled in with str.format().
	    ANALYSIS_TEMPLATE = (
	        "Analyze these technical documents with scientific rigor:\n{context}\n\n"
	        "Respond with:\n"
	        "1. Key Technical Contributions (bullet points)\n"
	        "2. Novel Methodologies\n"
	        "3. Empirical Results (with metrics)\n"
	        "4. Potential Applications\n"
	        "5. Limitations & Future Directions\n\n"
	        "Format: Markdown with LaTeX mathematical notation where applicable"
	    )

	# Startup guard: when no DeepSeek API key is configured, show setup
	# instructions in the Streamlit page and stop the script via st.stop().
	if not ResearchConfig.DEEPSEEK_API_KEY:
	st.error(
	"""Research Portal Configuration Required
	1. Obtain DeepSeek API key: [platform.deepseek.com](https://platform.deepseek.com/)
	2. Configure secret: `DEEPSEEK_API_KEY` in Space settings
	3. Rebuild deployment"""
	)
	st.stop()

	# ------------------------------
	# Quantum Document Processing
	# ------------------------------
	class QuantumDocumentManager:
	    """
	    Manages creation of Chroma collections from raw document texts.

	    Chunk IDs are derived only from chunk content plus an occurrence counter,
	    so rebuilding a collection from the same texts yields the same IDs on
	    every run instead of minting new, timestamped ones each time.
	    """

	    def __init__(self) -> None:
	        """Create the Chroma client (persistent, else in-memory) and the embedder."""
	        try:
	            self.client = chromadb.PersistentClient(path=ResearchConfig.CHROMA_PATH)
	            logger.info("Initialized PersistentClient for Chroma.")
	        except Exception as e:
	            # Deliberate best-effort: keep the app usable without persistence.
	            logger.error(f"Error initializing PersistentClient: {e}")
	            self.client = chromadb.Client()  # Fallback to in-memory client
	        self.embeddings = OpenAIEmbeddings(
	            model="text-embedding-3-large",
	            dimensions=ResearchConfig.EMBEDDING_DIMENSIONS
	        )

	    def create_collection(self, documents: List[str], collection_name: str) -> "Chroma":
	        """
	        Splits documents into chunks and stores them as a Chroma collection.

	        Args:
	            documents: Raw document texts to split and index.
	            collection_name: Name of the target Chroma collection.

	        Returns:
	            The Chroma vector store built from the chunks.

	        Raises:
	            Exception: Whatever the splitter raises is logged and re-raised.
	        """
	        splitter = RecursiveCharacterTextSplitter(
	            chunk_size=ResearchConfig.CHUNK_SIZE,
	            chunk_overlap=ResearchConfig.CHUNK_OVERLAP,
	            separators=["\n\n", "\n", "|||"]
	        )
	        try:
	            docs = splitter.create_documents(documents)
	        except Exception as e:
	            logger.error(f"Error splitting documents: {e}")
	            raise
	        logger.info(f"Created {len(docs)} document chunks for collection '{collection_name}'.")

	        # Number repeated chunk texts so identical chunks within one batch
	        # still receive distinct, reproducible IDs.
	        seen: Dict[str, int] = {}
	        ids = []
	        for doc in docs:
	            occurrence = seen.get(doc.page_content, 0)
	            seen[doc.page_content] = occurrence + 1
	            ids.append(self._document_id(doc.page_content, occurrence))

	        return Chroma.from_documents(
	            documents=docs,
	            embedding=self.embeddings,
	            client=self.client,
	            collection_name=collection_name,
	            ids=ids
	        )

	    def _document_id(self, content: str, occurrence: int = 0) -> str:
	        """
	        Generates a deterministic chunk ID from the SHA256 of the content.

	        Args:
	            content: Chunk text to hash.
	            occurrence: Zero-based count of earlier identical chunks in the
	                same batch; keeps duplicates from colliding.

	        Returns:
	            "<first 16 hex chars of the SHA256>-<occurrence>".
	        """
	        digest = hashlib.sha256(content.encode()).hexdigest()[:16]
	        return f"{digest}-{occurrence}"

	# Build the two module-level vector collections from their seed texts.
	qdm = QuantumDocumentManager()

	# Research-domain seed documents.
	research_docs = qdm.create_collection(
	    documents=[
	        "Research Report: Results of a New AI Model Improving Image Recognition Accuracy to 98%",
	        "Academic Paper Summary: Why Transformers Became the Mainstream Architecture in Natural Language Processing",
	        "Latest Trends in Machine Learning Methods Using Quantum Computing",
	    ],
	    collection_name="research",
	)

	# Development-domain seed documents.
	development_docs = qdm.create_collection(
	    documents=[
	        "Project A: UI Design Completed, API Integration in Progress",
	        "Project B: Testing New Feature X, Bug Fixes Needed",
	        "Product Y: In the Performance Optimization Stage Before Release",
	    ],
	    collection_name="development",
	)

	# ------------------------------
	# Advanced Retrieval System
	# ------------------------------
	class ResearchRetriever:
	    """
	    Wraps one retriever per domain ("research", "development") and routes
	    queries to the matching one.
	    """

	    def __init__(self) -> None:
	        """Build both domain retrievers; log and re-raise on any failure."""
	        mmr_settings = {'k': 4, 'fetch_k': 20, 'lambda_mult': 0.85}
	        similarity_settings = {'k': 3}
	        try:
	            self.research_retriever = research_docs.as_retriever(
	                search_type="mmr", search_kwargs=mmr_settings
	            )
	            self.development_retriever = development_docs.as_retriever(
	                search_type="similarity", search_kwargs=similarity_settings
	            )
	            logger.info("Initialized retrievers for research and development domains.")
	        except Exception as e:
	            logger.error(f"Error initializing retrievers: {e}")
	            raise

	    def retrieve(self, query: str, domain: str) -> List[Any]:
	        """
	        Run `query` against the retriever registered for `domain`.

	        Returns an empty list when the domain is unknown (logged as a
	        warning) or when the lookup raises (logged as an error).
	        """
	        attribute_for_domain = (
	            ("research", "research_retriever"),
	            ("development", "development_retriever"),
	        )
	        try:
	            for known_domain, attribute in attribute_for_domain:
	                if domain == known_domain:
	                    # Attribute is resolved only for the matching domain.
	                    return getattr(self, attribute).invoke(query)
	            logger.warning(f"Domain '{domain}' not recognized.")
	            return []
	        except Exception as e:
	            logger.error(f"Retrieval error for domain '{domain}': {e}")
	            return []

	# Module-level retriever instance, built once when this file is executed.
	retriever = ResearchRetriever()

	# ------------------------------
	# Cognitive Processing Unit
	# ------------------------------
	class CognitiveProcessor:
	    """
	    Executes API requests to the DeepSeek backend using triple redundancy
	    and consolidates results via a consensus mechanism.
	    """

	    def __init__(self) -> None:
	        """Create the worker pool and a 12-hex-char session identifier."""
	        self.executor = ThreadPoolExecutor(max_workers=ResearchConfig.MAX_CONCURRENT_REQUESTS)
	        self.session_id = hashlib.sha256(datetime.now().isoformat().encode()).hexdigest()[:12]

	    def process_query(self, prompt: str) -> Dict:
	        """
	        Processes a query by sending multiple API requests in parallel.

	        Args:
	            prompt: Prompt text sent with every request.

	        Returns:
	            The response chosen by `_consensus_check`, or an
	            `{"error": ...}` dict when no request succeeded.
	        """
	        # Triple redundancy for reliability
	        futures = [
	            self.executor.submit(self._execute_api_request, prompt)
	            for _ in range(3)
	        ]

	        results = []
	        for future in as_completed(futures):
	            try:
	                results.append(future.result())
	            except Exception as e:
	                logger.error(f"Error in API request: {e}")
	                st.error(f"Processing Error: {str(e)}")

	        return self._consensus_check(results)

	    def _execute_api_request(self, prompt: str) -> Dict:
	        """
	        Executes a single API request to the DeepSeek endpoint.

	        Returns:
	            The decoded JSON response, or `{"error": <message>}` when the
	            request raises a `requests` exception.
	        """
	        headers = {
	            "Authorization": f"Bearer {ResearchConfig.DEEPSEEK_API_KEY}",
	            "Content-Type": "application/json",
	            "X-Research-Session": self.session_id
	        }
	        payload = {
	            "model": "deepseek-chat",
	            "messages": [{
	                "role": "user",
	                "content": f"Respond as Senior AI Researcher:\n{prompt}"
	            }],
	            "temperature": 0.7,
	            "max_tokens": 1500,
	            "top_p": 0.9
	        }
	        try:
	            response = requests.post(
	                "https://api.deepseek.com/v1/chat/completions",
	                headers=headers,
	                json=payload,
	                timeout=45
	            )
	            response.raise_for_status()
	            logger.info("DeepSeek API request successful.")
	            return response.json()
	        except requests.exceptions.RequestException as e:
	            logger.error(f"DeepSeek API request failed: {e}")
	            return {"error": str(e)}

	    @staticmethod
	    def _content_length(result: Dict) -> int:
	        """Length of the first choice's message text; 0 when absent or not a string."""
	        # `or` fallbacks cover both a missing key and an empty/None value,
	        # e.g. a response whose "choices" list is empty.
	        choices = result.get('choices') or [{}]
	        message = choices[0].get('message') or {}
	        content = message.get('content')
	        return len(content) if isinstance(content, str) else 0

	    def _consensus_check(self, results: List[Dict]) -> Dict:
	        """
	        Consolidates multiple API responses, selecting the one with the most content.

	        Responses that are not dicts or that carry an "error" key are
	        ignored. On a tie the earliest response wins.

	        Returns:
	            The chosen response, or `{"error": "All API requests failed"}`
	            when nothing usable remains.
	        """
	        valid_results = [
	            r for r in results
	            if isinstance(r, dict) and "error" not in r
	        ]
	        if not valid_results:
	            logger.error("All API requests failed.")
	            return {"error": "All API requests failed"}
	        return max(valid_results, key=self._content_length)

	# ------------------------------
	# Research Workflow Engine
	# ------------------------------
	class ResearchWorkflow:
	    """
	    Defines the multi-step research workflow using a state graph.

	    Flow: ingest -> retrieve -> analyze, then a quality check routes to
	    validate (then END), to refine (which loops back to retrieve), or
	    straight to END when the analysis step reported an error.

	    Every node that updates `context` returns the existing context merged
	    with its new keys, so values such as `raw_query` and `refine_count`
	    remain available to later nodes and across refinement loops.
	    """

	    # Number of refine passes after which the analysis is accepted as-is.
	    MAX_REFINEMENTS = 3

	    def __init__(self) -> None:
	        """Create the processor, assemble the graph and compile it."""
	        self.processor = CognitiveProcessor()
	        self.workflow = StateGraph(AgentState)
	        self._build_workflow()
	        self.app = self.workflow.compile()

	    def _build_workflow(self) -> None:
	        """Register the nodes and wire the edges of the graph."""
	        # Define nodes
	        self.workflow.add_node("ingest", self.ingest_query)
	        self.workflow.add_node("retrieve", self.retrieve_documents)
	        self.workflow.add_node("analyze", self.analyze_content)
	        self.workflow.add_node("validate", self.validate_output)
	        self.workflow.add_node("refine", self.refine_results)
	        # Set entry point and edges
	        self.workflow.set_entry_point("ingest")
	        self.workflow.add_edge("ingest", "retrieve")
	        self.workflow.add_edge("retrieve", "analyze")
	        self.workflow.add_conditional_edges(
	            "analyze",
	            self._quality_check,
	            # "error" ends the run instead of feeding a failure message
	            # into the refine loop.
	            {"valid": "validate", "invalid": "refine", "error": END}
	        )
	        self.workflow.add_edge("validate", END)
	        self.workflow.add_edge("refine", "retrieve")

	    @staticmethod
	    def _extract_content(response: Dict) -> str:
	        """Text of the first choice in an API response; "" when it is missing."""
	        # `or` fallbacks cover missing keys as well as empty/None values.
	        choices = response.get('choices') or [{}]
	        message = choices[0].get('message') or {}
	        return message.get('content') or ''

	    def ingest_query(self, state: "AgentState") -> Dict:
	        """
	        Ingests the research query and initializes the refinement counter.
	        """
	        try:
	            query = state["messages"][-1].content
	            # Initialize context with raw query and refinement counter
	            new_context = {"raw_query": query, "refine_count": 0}
	            logger.info("Query ingested.")
	            return {
	                "messages": [AIMessage(content="Query ingested successfully")],
	                "context": new_context,
	                "metadata": {"timestamp": datetime.now().isoformat()}
	            }
	        except Exception as e:
	            return self._error_state(f"Ingestion Error: {str(e)}")

	    def retrieve_documents(self, state: "AgentState") -> Dict:
	        """
	        Retrieves research documents based on the query.

	        Adds `documents` and `retrieval_time` to the existing context.
	        """
	        try:
	            context = state["context"]
	            query = context["raw_query"]
	            docs = retriever.retrieve(query, "research")
	            logger.info(f"Retrieved {len(docs)} documents for query.")
	            return {
	                "messages": [AIMessage(content=f"Retrieved {len(docs)} documents")],
	                "context": {**context, "documents": docs, "retrieval_time": time.time()}
	            }
	        except Exception as e:
	            return self._error_state(f"Retrieval Error: {str(e)}")

	    def analyze_content(self, state: "AgentState") -> Dict:
	        """
	        Analyzes the retrieved documents using the DeepSeek API.

	        Adds the raw API response to the existing context under `analysis`;
	        returns an error state when the processor reports an error.
	        """
	        try:
	            context = state["context"]
	            docs = context.get("documents", [])
	            docs_text = "\n\n".join([d.page_content for d in docs])
	            prompt = ResearchConfig.ANALYSIS_TEMPLATE.format(context=docs_text)
	            response = self.processor.process_query(prompt)
	            if "error" in response:
	                return self._error_state(response["error"])
	            logger.info("Content analysis completed.")
	            return {
	                "messages": [AIMessage(content=self._extract_content(response))],
	                "context": {**context, "analysis": response}
	            }
	        except Exception as e:
	            return self._error_state(f"Analysis Error: {str(e)}")

	    def validate_output(self, state: "AgentState") -> Dict:
	        """
	        Validates the technical analysis report.

	        Returns a message holding the analysis followed by
	        "Validation: <model reply>".
	        """
	        analysis = state["messages"][-1].content
	        validation_prompt = (
	            f"Validate research analysis:\n{analysis}\n\n"
	            "Check for:\n1. Technical accuracy\n2. Citation support\n3. Logical consistency\n4. Methodological soundness\n\n"
	            "Respond with 'VALID' or 'INVALID'"
	        )
	        response = self.processor.process_query(validation_prompt)
	        logger.info("Output validation completed.")
	        return {
	            "messages": [AIMessage(content=analysis + f"\n\nValidation: {self._extract_content(response)}")]
	        }

	    def refine_results(self, state: "AgentState") -> Dict:
	        """
	        Refines the analysis report if validation fails.
	        Increments the refinement counter to limit infinite loops.

	        The incoming state is left untouched; the incremented counter is
	        returned in a new context dict.
	        """
	        context = state["context"]
	        refine_count = context.get("refine_count", 0) + 1
	        logger.info(f"Refinement iteration: {refine_count}")
	        refinement_prompt = (
	            f"Refine this analysis:\n{state['messages'][-1].content}\n\n"
	            "Improve:\n1. Technical precision\n2. Empirical grounding\n3. Theoretical coherence"
	        )
	        response = self.processor.process_query(refinement_prompt)
	        logger.info("Refinement completed.")
	        return {
	            "messages": [AIMessage(content=self._extract_content(response))],
	            "context": {**context, "refine_count": refine_count}
	        }

	    def _quality_check(self, state: "AgentState") -> str:
	        """
	        Checks whether the analysis report is valid.
	        Forces a valid state if the refinement count exceeds a threshold.

	        Returns:
	            "error" when the context carries an error flag, otherwise
	            "valid" or "invalid".
	        """
	        context = state["context"]
	        if context.get("error"):
	            logger.warning("Analysis reported an error. Ending workflow.")
	            return "error"
	        if context.get("refine_count", 0) >= self.MAX_REFINEMENTS:
	            logger.warning("Refinement limit reached. Forcing valid outcome to prevent infinite recursion.")
	            return "valid"
	        content = state["messages"][-1].content
	        # "INVALID" contains "VALID", so it has to be excluded explicitly.
	        is_valid = "VALID" in content and "INVALID" not in content
	        quality = "valid" if is_valid else "invalid"
	        logger.info(f"Quality check returned: {quality}")
	        return quality

	    def _error_state(self, message: str) -> Dict:
	        """
	        Returns a standardized error state.
	        """
	        logger.error(message)
	        return {
	            "messages": [AIMessage(content=f"❌ {message}")],
	            "context": {"error": True},
	            "metadata": {"status": "error"}
	        }

	# ------------------------------
	# Research Interface (Streamlit UI)
	# ------------------------------
	class ResearchInterface:
	    """
	    Provides the Streamlit-based interface for executing the research workflow.
	    """

	    def __init__(self) -> None:
	        """Build the workflow, then render the page."""
	        self.workflow = ResearchWorkflow()
	        self._initialize_interface()

	    def _initialize_interface(self) -> None:
	        """Configure the page and render styles, sidebar and main area."""
	        st.set_page_config(
	            page_title="NeuroResearch AI",
	            layout="wide",
	            initial_sidebar_state="expanded"
	        )
	        self._inject_styles()
	        self._build_sidebar()
	        self._build_main_interface()

	    def _inject_styles(self) -> None:
	        """Inject the page's CSS through an HTML-enabled markdown element."""
	        # The stylesheet lines are kept exactly as they appear in the file so
	        # the emitted markup is unchanged.
	        st.markdown(
	            """
	<style>
	:root {
	--primary: #2ecc71;
	--secondary: #3498db;
	--background: #0a0a0a;
	--text: #ecf0f1;
	}
	.stApp {
	background: var(--background);
	color: var(--text);
	font-family: 'Roboto', sans-serif;
	}
	.stTextArea textarea {
	background: #1a1a1a !important;
	color: var(--text) !important;
	border: 2px solid var(--secondary);
	border-radius: 8px;
	padding: 1rem;
	}
	.stButton>button {
	background: linear-gradient(135deg, var(--primary), var(--secondary));
	border: none;
	border-radius: 8px;
	padding: 1rem 2rem;
	transition: all 0.3s;
	}
	.stButton>button:hover {
	transform: translateY(-2px);
	box-shadow: 0 4px 12px rgba(46, 204, 113, 0.3);
	}
	.stExpander {
	background: #1a1a1a;
	border: 1px solid #2a2a2a;
	border-radius: 8px;
	margin: 1rem 0;
	}
	</style>
	""",
	            unsafe_allow_html=True
	        )

	    def _build_sidebar(self) -> None:
	        """Render the document list and the two summary metrics in the sidebar."""
	        with st.sidebar:
	            st.title("🔍 Research Database")
	            st.subheader("Technical Papers")
	            for title, short in ResearchConfig.DOCUMENT_MAP.items():
	                with st.expander(short):
	                    st.markdown(f"```\n{title}\n```")
	            st.subheader("Analysis Metrics")
	            st.metric("Vector Collections", 2)
	            st.metric("Embedding Dimensions", ResearchConfig.EMBEDDING_DIMENSIONS)

	    def _build_main_interface(self) -> None:
	        """Render the query box and run the analysis when the button is pressed."""
	        st.title("🧠 NeuroResearch AI")
	        query = st.text_area(
	            "Research Query:",
	            height=200,
	            placeholder="Enter technical research question..."
	        )
	        if st.button("Execute Analysis", type="primary"):
	            self._execute_analysis(query)

	    def _execute_analysis(self, query: str) -> None:
	        """
	        Stream the workflow for `query` and render each event as it arrives.

	        A blank query is rejected with a warning instead of starting the
	        workflow. Any exception raised while streaming is logged and shown
	        as an error message.
	        """
	        if not query or not query.strip():
	            st.warning("Please enter a research query before running the analysis.")
	            return
	        try:
	            with st.spinner("Initializing Quantum Analysis..."):
	                # Pass a recursion limit configuration into the graph invocation
	                results = self.workflow.app.stream({
	                    "messages": [HumanMessage(content=query)],
	                    "context": {},
	                    "metadata": {}
	                }, {"recursion_limit": 100})
	                for event in results:
	                    self._render_event(event)
	                st.success("✅ Analysis Completed Successfully")
	        except Exception as e:
	            logger.error(f"Workflow execution failed: {e}")
	            st.error(
	                f"""Analysis Failed
	{str(e)}
	Potential issues:
	- Complex query structure
	- Document correlation failure
	- Temporal processing constraints"""
	            )

	    @staticmethod
	    def _split_validation(content: str) -> tuple:
	        """
	        Split a validated report into (report_text, passed).

	        The verdict is the text after the LAST "Validation:" marker, so the
	        same words appearing inside the report body do not affect it. The
	        verdict passes only when it contains "VALID" and not "INVALID".
	        When no marker is present, the whole content is treated as both
	        report and verdict.
	        """
	        report, marker, verdict = content.rpartition("Validation:")
	        if not marker:
	            report, verdict = content, content
	        passed = "VALID" in verdict and "INVALID" not in verdict
	        return report, passed

	    def _render_event(self, event: Dict) -> None:
	        """Render one streamed workflow event, keyed by the node that produced it."""
	        if 'ingest' in event:
	            with st.container():
	                st.success("✅ Query Ingested")
	        elif 'retrieve' in event:
	            with st.container():
	                docs = event['retrieve']['context'].get('documents', [])
	                st.info(f"📚 Retrieved {len(docs)} documents")
	                with st.expander("View Retrieved Documents", expanded=False):
	                    for idx, doc in enumerate(docs, start=1):
	                        st.markdown(f"Document {idx}")
	                        st.code(doc.page_content, language='text')
	        elif 'analyze' in event:
	            with st.container():
	                content = event['analyze']['messages'][0].content
	                with st.expander("Technical Analysis Report", expanded=True):
	                    st.markdown(content)
	        elif 'validate' in event:
	            with st.container():
	                content = event['validate']['messages'][0].content
	                report, passed = self._split_validation(content)
	                if passed:
	                    st.success("✅ Validation Passed")
	                    with st.expander("View Validated Analysis", expanded=True):
	                        st.markdown(report)
	                else:
	                    st.warning("⚠️ Validation Issues Detected")
	                    with st.expander("View Validation Details", expanded=True):
	                        st.markdown(content)

	# Script entry point: constructing ResearchInterface builds the workflow
	# and renders the whole page from its __init__.
	if __name__ == "__main__":
	ResearchInterface()