# ------------------------------
# UniversalResearch AI System + LADDER (Tufa Labs)
# ------------------------------
import logging
import os
import hashlib
import time
import sys
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Any, Sequence
import chromadb
import requests
import streamlit as st
# LangChain and LangGraph imports
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langgraph.graph import END, StateGraph
from langgraph.graph.message import add_messages
from typing_extensions import TypedDict, Annotated
# Raise Python's recursion limit to accommodate deeply recursive workflows
sys.setrecursionlimit(10000)
# ------------------------------
# Logging Configuration
# ------------------------------
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)
# ------------------------------
# State Schema Definition
# ------------------------------
class AgentState(TypedDict):
"""
Stores the messages and context for each step in the workflow.
'messages' contain the conversation so far,
'context' can hold domain-specific data, 'metadata' for additional info.
"""
messages: Annotated[Sequence[AIMessage | HumanMessage | ToolMessage], add_messages]
context: Dict[str, Any]
metadata: Dict[str, Any]
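# Illustrative node update (a sketch, not executed): because 'messages' is
# annotated with the add_messages reducer, messages returned by a node are
# appended to the running conversation, while plain dict fields such as
# 'context' and 'metadata' are overwritten by each node's return value.
#
#   {
#       "messages": [AIMessage(content="Retrieved 3 documents")],  # appended
#       "context": {"raw_query": "...", "refine_count": 0},        # replaced
#   }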
# ------------------------------
# Configuration
# ------------------------------
class ResearchConfig:
"""
Universal configuration for the research system, referencing Tufa Labs' LADDER approach.
Make sure to set DEEPSEEK_API_KEY in your environment or HF Space secrets
to enable the external LLM calls.
"""
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
CHROMA_PATH = "chroma_db"
CHUNK_SIZE = 512
CHUNK_OVERLAP = 64
MAX_CONCURRENT_REQUESTS = 5
EMBEDDING_DIMENSIONS = 1536
# Example map for featured research docs
DOCUMENT_MAP = {
"Sample Research Document 1": "Topic A Overview",
"Sample Research Document 2": "Topic B Analysis",
"Sample Research Document 3": "Topic C Innovations"
}
# Template referencing a general approach for analyzing research documents
ANALYSIS_TEMPLATE = (
"Analyze the following research documents with scientific rigor:\n{context}\n\n"
"Using the LADDER approach from Tufa Labs, the model should:\n"
"1. Break down the problem into simpler subproblems.\n"
"2. Iteratively refine the solution.\n"
"3. Provide thorough analysis, including:\n"
" a. Key Contributions\n"
" b. Novel Methodologies\n"
" c. Empirical Results (with metrics)\n"
" d. Potential Applications\n"
" e. Limitations & Future Directions\n\n"
"Format your response in Markdown with LaTeX mathematical notation where applicable."
)
# Early check for missing API key
if not ResearchConfig.DEEPSEEK_API_KEY:
st.error(
"""**Research Portal Configuration Required**
1. Obtain your DeepSeek API key from [platform.deepseek.com](https://platform.deepseek.com/)
2. Set the secret: `DEEPSEEK_API_KEY` in your Space settings
3. Rebuild your deployment."""
)
st.stop()
# ------------------------------
# Universal Document Processing
# ------------------------------
class UniversalDocumentManager:
"""
Manages creation of document collections for any research domain.
Uses OpenAI embeddings for vector-based semantic search.
"""
def __init__(self) -> None:
try:
self.client = chromadb.PersistentClient(path=ResearchConfig.CHROMA_PATH)
logger.info("Initialized PersistentClient for Chroma.")
except Exception as e:
logger.error(f"Error initializing PersistentClient: {e}")
self.client = chromadb.Client() # Fallback to in-memory client
self.embeddings = OpenAIEmbeddings(
model="text-embedding-3-large",
dimensions=ResearchConfig.EMBEDDING_DIMENSIONS
)
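        # Note: text-embedding-3-large natively returns 3072-dim vectors; the
        # 'dimensions' parameter asks OpenAI to shorten them (here to 1536).
        # OpenAIEmbeddings also requires OPENAI_API_KEY in the environment,
        # separate from the DeepSeek key used for chat completions.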
def create_collection(self, documents: List[str], collection_name: str) -> Chroma:
"""
Splits documents into manageable chunks and stores them in a Chroma collection.
"""
splitter = RecursiveCharacterTextSplitter(
chunk_size=ResearchConfig.CHUNK_SIZE,
chunk_overlap=ResearchConfig.CHUNK_OVERLAP,
separators=["\n\n", "\n", "|||"]
)
try:
docs = splitter.create_documents(documents)
logger.info(f"Created {len(docs)} document chunks for collection '{collection_name}'.")
except Exception as e:
logger.error(f"Error splitting documents: {e}")
raise e
return Chroma.from_documents(
documents=docs,
embedding=self.embeddings,
client=self.client,
collection_name=collection_name,
ids=[self._document_id(doc.page_content) for doc in docs]
)
    def _document_id(self, content: str) -> str:
        """
        Generates a document ID from the SHA256 hash of the content plus a
        Unix timestamp. Note: identical chunks embedded in the same second
        will collide on the same ID.
        """
        return f"{hashlib.sha256(content.encode()).hexdigest()[:16]}-{int(time.time())}"
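    # Example (timestamp hypothetical):
    #   _document_id("abc") -> "ba7816bf8f01cfea-1712345678"
    #   (first 16 hex chars of SHA256("abc") + Unix time)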
# Create example collections (can be replaced with domain-specific docs)
udm = UniversalDocumentManager()
research_docs = udm.create_collection([
"Research Report: Novel AI Techniques in Renewable Energy",
"Academic Paper: Advances in Quantum Computing for Data Analysis",
"Survey: Emerging Trends in Biomedical Research"
], "research")
development_docs = udm.create_collection([
"Project Update: New Algorithms in Software Engineering",
"Development Report: Innovations in User Interface Design",
"Case Study: Agile Methodologies in Large-Scale Software Projects"
], "development")
# ------------------------------
# Advanced Retrieval System
# ------------------------------
class ResearchRetriever:
"""
Provides retrieval methods for multiple domains (research, development, etc.).
Uses MMR (Maximal Marginal Relevance) or similarity-based retrieval from Chroma.
"""
def __init__(self) -> None:
try:
self.research_retriever = research_docs.as_retriever(
search_type="mmr",
search_kwargs={'k': 4, 'fetch_k': 20, 'lambda_mult': 0.85}
)
self.development_retriever = development_docs.as_retriever(
search_type="similarity",
search_kwargs={'k': 3}
)
logger.info("Initialized retrievers for research and development domains.")
except Exception as e:
logger.error(f"Error initializing retrievers: {e}")
raise e
def retrieve(self, query: str, domain: str) -> List[Any]:
"""
Retrieves documents for a given query and domain.
Defaults to 'research' if domain is unrecognized.
"""
try:
if domain == "research":
return self.research_retriever.invoke(query)
elif domain == "development":
return self.development_retriever.invoke(query)
else:
logger.warning(f"Domain '{domain}' not recognized. Defaulting to research.")
return self.research_retriever.invoke(query)
except Exception as e:
logger.error(f"Retrieval error for domain '{domain}': {e}")
return []
retriever = ResearchRetriever()
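# Example usage (returns a list of LangChain Document objects):
#   docs = retriever.retrieve("advances in quantum computing", "research")
#   texts = [d.page_content for d in docs]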
# ------------------------------
# Cognitive Processing Unit
# ------------------------------
class CognitiveProcessor:
"""
Sends parallel requests to the DeepSeek API to reduce failures.
Implements a consensus mechanism to pick the most comprehensive response.
"""
def __init__(self) -> None:
self.executor = ThreadPoolExecutor(max_workers=ResearchConfig.MAX_CONCURRENT_REQUESTS)
self.session_id = hashlib.sha256(datetime.now().isoformat().encode()).hexdigest()[:12]
def process_query(self, prompt: str) -> Dict:
"""
Processes a query by sending multiple parallel requests (triple redundancy).
"""
futures = []
for _ in range(3):
futures.append(self.executor.submit(self._execute_api_request, prompt))
results = []
for future in as_completed(futures):
try:
results.append(future.result())
except Exception as e:
logger.error(f"Error in API request: {e}")
st.error(f"Processing Error: {str(e)}")
return self._consensus_check(results)
def _execute_api_request(self, prompt: str) -> Dict:
"""
Executes a single request to the DeepSeek endpoint.
"""
headers = {
"Authorization": f"Bearer {ResearchConfig.DEEPSEEK_API_KEY}",
"Content-Type": "application/json",
"X-Research-Session": self.session_id
}
payload = {
"model": "deepseek-chat",
"messages": [{
"role": "user",
"content": f"Respond as a Senior Researcher:\n{prompt}"
}],
"temperature": 0.7,
"max_tokens": 1500,
"top_p": 0.9
}
try:
response = requests.post(
"https://api.deepseek.com/v1/chat/completions",
headers=headers,
json=payload,
timeout=45
)
response.raise_for_status()
logger.info("DeepSeek API request successful.")
return response.json()
except requests.exceptions.RequestException as e:
logger.error(f"DeepSeek API request failed: {e}")
return {"error": str(e)}
def _consensus_check(self, results: List[Dict]) -> Dict:
"""
Chooses the best response by comparing the length of the message content.
"""
valid_results = [r for r in results if "error" not in r]
if not valid_results:
logger.error("All API requests failed.")
return {"error": "All API requests failed"}
return max(valid_results, key=lambda x: len(x.get('choices', [{}])[0].get('message', {}).get('content', '')))
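    # Selection sketch: given three candidates whose choices[0].message.content
    # runs to, say, 800, 1200, and 950 characters, the 1200-character response
    # is returned as the "consensus" answer.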
# ------------------------------
# Research Workflow Engine (LADDER Integration)
# ------------------------------
class ResearchWorkflow:
"""
Defines a multi-step workflow using LangGraph with Tufa Labs' LADDER approach:
1. Ingest Query
2. Retrieve Documents
3. Analyze Content
4. Validate Output
    5. Refine (Recursive Self-Learning + TTRL, Test-Time Reinforcement Learning)
The 'refine_results' node applies LADDER’s idea of iteratively
breaking down problems and re-solving them with no external data.
"""
def __init__(self) -> None:
self.processor = CognitiveProcessor()
self.workflow = StateGraph(AgentState)
self._build_workflow()
self.app = self.workflow.compile()
def _build_workflow(self) -> None:
# Define workflow nodes
self.workflow.add_node("ingest", self.ingest_query)
self.workflow.add_node("retrieve", self.retrieve_documents)
self.workflow.add_node("analyze", self.analyze_content)
self.workflow.add_node("validate", self.validate_output)
self.workflow.add_node("refine", self.refine_results)
# Entry point and transitions
self.workflow.set_entry_point("ingest")
self.workflow.add_edge("ingest", "retrieve")
self.workflow.add_edge("retrieve", "analyze")
self.workflow.add_conditional_edges(
"analyze",
self._quality_check,
{"valid": "validate", "invalid": "refine"}
)
self.workflow.add_edge("validate", END)
self.workflow.add_edge("refine", "retrieve")
def ingest_query(self, state: AgentState) -> Dict:
"""
Ingests the research query and initializes the LADDER-based refinement counter.
"""
try:
query = state["messages"][-1].content
new_context = {"raw_query": query, "refine_count": 0}
logger.info("Query ingested.")
return {
"messages": [AIMessage(content="Query ingested successfully")],
"context": new_context,
"metadata": {"timestamp": datetime.now().isoformat()}
}
except Exception as e:
return self._error_state(f"Ingestion Error: {str(e)}")
    def retrieve_documents(self, state: AgentState) -> Dict:
        """
        Retrieves relevant documents for the query. This node currently
        queries the 'research' collection; the underlying retriever also
        supports other domains (e.g. 'development').
        """
try:
query = state["context"]["raw_query"]
docs = retriever.retrieve(query, "research")
logger.info(f"Retrieved {len(docs)} documents for query.")
            return {
                "messages": [AIMessage(content=f"Retrieved {len(docs)} documents")],
                "context": {
                    # Carry raw_query forward: 'context' is replaced (not merged)
                    # at each node, and the refine loop re-enters this node.
                    "raw_query": query,
                    "documents": docs,
                    "retrieval_time": time.time(),
                    "refine_count": state["context"].get("refine_count", 0)
                }
            }
except Exception as e:
return self._error_state(f"Retrieval Error: {str(e)}")
def analyze_content(self, state: AgentState) -> Dict:
"""
Analyzes the retrieved documents using Tufa Labs' LADDER principles:
- Break down the documents,
- Provide structured analysis,
- Return a refined solution.
"""
try:
docs = state["context"].get("documents", [])
docs_text = "\n\n".join([d.page_content for d in docs])
prompt = ResearchConfig.ANALYSIS_TEMPLATE.format(context=docs_text)
response = self.processor.process_query(prompt)
if "error" in response:
return self._error_state(response["error"])
logger.info("Content analysis completed.")
return {
"messages": [
AIMessage(
content=response.get('choices', [{}])[0].get('message', {}).get('content', '')
)
],
"context": {
"analysis": response,
"refine_count": state["context"].get("refine_count", 0)
}
}
except Exception as e:
return self._error_state(f"Analysis Error: {str(e)}")
def validate_output(self, state: AgentState) -> Dict:
"""
Validates the analysis. If invalid, the system can refine the solution
(potentially multiple times) using LADDER’s iterative approach.
"""
analysis = state["messages"][-1].content
validation_prompt = (
f"Validate this analysis:\n{analysis}\n\n"
"Check for:\n1. Technical accuracy\n2. Citation support\n3. Logical consistency\n4. Methodological soundness\n\n"
"Respond with 'VALID' or 'INVALID'."
)
response = self.processor.process_query(validation_prompt)
logger.info("Output validation completed.")
return {
"messages": [
AIMessage(
content=analysis + f"\n\nValidation: {response.get('choices', [{}])[0].get('message', {}).get('content', '')}"
)
]
}
def refine_results(self, state: AgentState) -> Dict:
"""
Applies Tufa Labs' LADDER principle:
- Recursively break down the problem,
- Re-solve with no external data,
- Potentially leverage TTRL for dynamic updates.
This method increments a refinement counter to avoid infinite recursion.
"""
current_count = state["context"].get("refine_count", 0)
state["context"]["refine_count"] = current_count + 1
logger.info(f"LADDER refinement iteration: {state['context']['refine_count']}")
refinement_prompt = (
f"Refine this analysis using LADDER's self-improvement approach:\n"
f"{state['messages'][-1].content}\n\n"
"Focus on breaking down complex points further and re-solving them.\n"
"Enhance technical precision, empirical grounding, and theoretical coherence."
)
response = self.processor.process_query(refinement_prompt)
logger.info("Refinement completed.")
return {
"messages": [
AIMessage(
content=response.get('choices', [{}])[0].get('message', {}).get('content', '')
)
],
"context": state["context"]
}
    def _quality_check(self, state: AgentState) -> str:
        """
        Determines whether the analysis is 'valid' or 'invalid'.
        Once refine_count reaches 3, forcibly accept the result to prevent infinite loops.
        """
        refine_count = state["context"].get("refine_count", 0)
        if refine_count >= 3:
            logger.warning("Refinement limit reached. Forcing valid outcome to prevent infinite recursion.")
            return "valid"
        content = state["messages"][-1].content
        # Check INVALID first: "INVALID" contains "VALID" as a substring, so a
        # bare membership test would misclassify INVALID verdicts as valid.
        if "INVALID" in content:
            return "invalid"
        return "valid" if "VALID" in content else "invalid"
def _error_state(self, message: str) -> Dict:
"""
Returns an error state if any node fails.
"""
logger.error(message)
return {
"messages": [AIMessage(content=f"❌ {message}")],
"context": {"error": True},
"metadata": {"status": "error"}
}
# ------------------------------
# Research Interface (Streamlit UI)
# ------------------------------
class ResearchInterface:
"""
Provides a Streamlit-based interface for the UniversalResearch AI with LADDER.
The system is domain-agnostic, handling math, code generation, theorem proving, etc.
"""
def __init__(self) -> None:
self.workflow = ResearchWorkflow()
self._initialize_interface()
def _initialize_interface(self) -> None:
st.set_page_config(
page_title="UniversalResearch AI (LADDER)",
layout="wide",
initial_sidebar_state="expanded"
)
self._inject_styles()
self._build_sidebar()
self._build_main_interface()
def _inject_styles(self) -> None:
st.markdown(
"""
<style>
:root {
--primary: #2ecc71;
--secondary: #3498db;
--background: #0a0a0a;
--text: #ecf0f1;
}
.stApp {
background: var(--background);
color: var(--text);
font-family: 'Roboto', sans-serif;
}
.stTextArea textarea {
background: #1a1a1a !important;
color: var(--text) !important;
border: 2px solid var(--secondary);
border-radius: 8px;
padding: 1rem;
}
.stButton>button {
background: linear-gradient(135deg, var(--primary), var(--secondary));
border: none;
border-radius: 8px;
padding: 1rem 2rem;
transition: all 0.3s;
}
.stButton>button:hover {
transform: translateY(-2px);
box-shadow: 0 4px 12px rgba(46, 204, 113, 0.3);
}
.stExpander {
background: #1a1a1a;
border: 1px solid #2a2a2a;
border-radius: 8px;
margin: 1rem 0;
}
</style>
""",
unsafe_allow_html=True
)
def _build_sidebar(self) -> None:
with st.sidebar:
st.title("πŸ” Research Database (LADDER)")
st.subheader("Featured Research Topics")
for title, short in ResearchConfig.DOCUMENT_MAP.items():
with st.expander(short):
st.markdown(f"```\n{title}\n```")
st.subheader("Analysis Metrics")
st.metric("Vector Collections", 2)
st.metric("Embedding Dimensions", ResearchConfig.EMBEDDING_DIMENSIONS)
def _build_main_interface(self) -> None:
st.title("🧠 UniversalResearch AI with Tufa Labs’ LADDER")
        st.write(
            "Leverage Tufa Labs' LADDER approach for recursive self-improvement: "
            "queries are decomposed, analyzed, validated, and iteratively refined."
        )
query = st.text_area(
"Research Query:",
height=200,
placeholder="Enter a research question, from math to code generation..."
)
if st.button("Execute Analysis", type="primary"):
self._execute_analysis(query)
def _execute_analysis(self, query: str) -> None:
try:
with st.spinner("Initializing LADDER-based Analysis..."):
# The recursion_limit config ensures we can handle multiple refine iterations
results = self.workflow.app.stream({
"messages": [HumanMessage(content=query)],
"context": {},
"metadata": {}
}, {"recursion_limit": 100})
for event in results:
self._render_event(event)
st.success("βœ… Analysis Completed Successfully")
except Exception as e:
logger.error(f"Workflow execution failed: {e}")
st.error(
f"""**Analysis Failed**
{str(e)}
Potential issues:
- Complex query structure
- Document correlation failure
- Temporal processing constraints"""
)
def _render_event(self, event: Dict) -> None:
"""
Renders each event in the Streamlit UI.
"""
if 'ingest' in event:
with st.container():
st.success("βœ… Query Ingested")
elif 'retrieve' in event:
with st.container():
docs = event['retrieve']['context'].get('documents', [])
st.info(f"πŸ“š Retrieved {len(docs)} documents")
with st.expander("View Retrieved Documents", expanded=False):
for idx, doc in enumerate(docs, start=1):
st.markdown(f"**Document {idx}**")
st.code(doc.page_content, language='text')
elif 'analyze' in event:
with st.container():
content = event['analyze']['messages'][0].content
with st.expander("Research Analysis Report", expanded=True):
st.markdown(content)
elif 'validate' in event:
with st.container():
content = event['validate']['messages'][0].content
if "VALID" in content:
st.success("βœ… Validation Passed")
with st.expander("View Validated Analysis", expanded=True):
# Remove "Validation: ..." for a cleaner final result
st.markdown(content.split("Validation:")[0])
else:
st.warning("⚠️ Validation Issues Detected")
with st.expander("View Validation Details", expanded=True):
st.markdown(content)
if __name__ == "__main__":
ResearchInterface()