Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,3 @@
|
|
1 |
-
# ------------------------------
|
2 |
-
# UniversalResearch AI with LADDER (OpenAI Integration)
|
3 |
-
# ------------------------------
|
4 |
import logging
|
5 |
import os
|
6 |
import re
|
@@ -11,12 +8,11 @@ import sys
|
|
11 |
from datetime import datetime
|
12 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
13 |
from typing import List, Dict, Any, Optional, Sequence
|
14 |
-
|
15 |
import chromadb
|
16 |
import requests
|
17 |
import streamlit as st
|
18 |
|
19 |
-
# LangChain
|
20 |
from langchain_openai import OpenAIEmbeddings
|
21 |
from langchain_community.vectorstores import Chroma
|
22 |
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
|
@@ -27,7 +23,7 @@ from langgraph.graph.message import add_messages
|
|
27 |
from typing_extensions import TypedDict, Annotated
|
28 |
from langchain.tools.retriever import create_retriever_tool
|
29 |
|
30 |
-
# Increase Python's recursion limit at the start (if needed)
|
31 |
sys.setrecursionlimit(10000)
|
32 |
|
33 |
# ------------------------------
|
@@ -43,12 +39,6 @@ logger = logging.getLogger(__name__)
|
|
43 |
# State Schema Definition
|
44 |
# ------------------------------
|
45 |
class AgentState(TypedDict):
|
46 |
-
"""
|
47 |
-
Stores the messages and context for each step in the workflow.
|
48 |
-
'messages': conversation so far
|
49 |
-
'context': domain-specific data (docs, counters)
|
50 |
-
'metadata': any additional info (timestamps, status)
|
51 |
-
"""
|
52 |
messages: Annotated[Sequence[AIMessage | HumanMessage | ToolMessage], add_messages]
|
53 |
context: Dict[str, Any]
|
54 |
metadata: Dict[str, Any]
|
@@ -57,56 +47,46 @@ class AgentState(TypedDict):
|
|
57 |
# Configuration
|
58 |
# ------------------------------
|
59 |
class ResearchConfig:
|
60 |
-
""
|
61 |
-
Universal config for the advanced AI system with Tufa Labs' LADDER approach,
|
62 |
-
using OpenAI for both embeddings and completions.
|
63 |
-
|
64 |
-
Make sure to set OPENAI_API_KEY in your environment or HF Space secrets.
|
65 |
-
"""
|
66 |
-
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") # Must match your HF secret name
|
67 |
CHROMA_PATH = "chroma_db"
|
68 |
CHUNK_SIZE = 512
|
69 |
CHUNK_OVERLAP = 64
|
70 |
MAX_CONCURRENT_REQUESTS = 5
|
71 |
EMBEDDING_DIMENSIONS = 1536
|
72 |
-
|
73 |
-
# Example map for featured documents
|
74 |
DOCUMENT_MAP = {
|
75 |
-
"
|
76 |
-
|
77 |
-
"
|
|
|
|
|
|
|
78 |
}
|
79 |
-
|
80 |
-
# Analysis template referencing LADDER's approach
|
81 |
ANALYSIS_TEMPLATE = (
|
82 |
-
"Analyze
|
83 |
-
"
|
84 |
-
"1.
|
85 |
-
"2.
|
86 |
-
"3.
|
87 |
-
"
|
88 |
-
"
|
89 |
-
"
|
90 |
-
" d. Potential Applications\n"
|
91 |
-
" e. Limitations & Future Directions\n\n"
|
92 |
-
"Format your response in Markdown with LaTeX where applicable."
|
93 |
)
|
94 |
|
95 |
-
|
96 |
-
if not ResearchConfig.OPENAI_API_KEY:
|
97 |
st.error(
|
98 |
-
"""**
|
99 |
-
|
|
|
|
|
100 |
)
|
101 |
st.stop()
|
102 |
|
103 |
# ------------------------------
|
104 |
-
#
|
105 |
# ------------------------------
|
106 |
-
class
|
107 |
"""
|
108 |
-
Manages creation of
|
109 |
-
using OpenAI embeddings for semantic search.
|
110 |
"""
|
111 |
def __init__(self) -> None:
|
112 |
try:
|
@@ -115,8 +95,6 @@ class UniversalDocumentManager:
|
|
115 |
except Exception as e:
|
116 |
logger.error(f"Error initializing PersistentClient: {e}")
|
117 |
self.client = chromadb.Client() # Fallback to in-memory client
|
118 |
-
|
119 |
-
# Configure embeddings from openai
|
120 |
self.embeddings = OpenAIEmbeddings(
|
121 |
model="text-embedding-3-large",
|
122 |
dimensions=ResearchConfig.EMBEDDING_DIMENSIONS
|
@@ -124,7 +102,7 @@ class UniversalDocumentManager:
|
|
124 |
|
125 |
def create_collection(self, documents: List[str], collection_name: str) -> Chroma:
|
126 |
"""
|
127 |
-
Splits documents into chunks and stores them
|
128 |
"""
|
129 |
splitter = RecursiveCharacterTextSplitter(
|
130 |
chunk_size=ResearchConfig.CHUNK_SIZE,
|
@@ -133,7 +111,7 @@ class UniversalDocumentManager:
|
|
133 |
)
|
134 |
try:
|
135 |
docs = splitter.create_documents(documents)
|
136 |
-
logger.info(f"Created {len(docs)}
|
137 |
except Exception as e:
|
138 |
logger.error(f"Error splitting documents: {e}")
|
139 |
raise e
|
@@ -148,22 +126,22 @@ class UniversalDocumentManager:
|
|
148 |
|
149 |
def _document_id(self, content: str) -> str:
|
150 |
"""
|
151 |
-
Generates a unique ID using SHA256
|
152 |
"""
|
153 |
return f"{hashlib.sha256(content.encode()).hexdigest()[:16]}-{int(time.time())}"
|
154 |
|
155 |
-
#
|
156 |
-
|
157 |
-
research_docs =
|
158 |
-
"Research Report:
|
159 |
-
"Academic Paper:
|
160 |
-
"
|
161 |
], "research")
|
162 |
|
163 |
-
development_docs =
|
164 |
-
"Project
|
165 |
-
"
|
166 |
-
"
|
167 |
], "development")
|
168 |
|
169 |
# ------------------------------
|
@@ -171,8 +149,7 @@ development_docs = udm.create_collection([
|
|
171 |
# ------------------------------
|
172 |
class ResearchRetriever:
|
173 |
"""
|
174 |
-
Provides retrieval methods for
|
175 |
-
Uses MMR or similarity-based retrieval from Chroma.
|
176 |
"""
|
177 |
def __init__(self) -> None:
|
178 |
try:
|
@@ -191,8 +168,7 @@ class ResearchRetriever:
|
|
191 |
|
192 |
def retrieve(self, query: str, domain: str) -> List[Any]:
|
193 |
"""
|
194 |
-
Retrieves documents
|
195 |
-
Defaults to 'research' if domain is unrecognized.
|
196 |
"""
|
197 |
try:
|
198 |
if domain == "research":
|
@@ -200,8 +176,8 @@ class ResearchRetriever:
|
|
200 |
elif domain == "development":
|
201 |
return self.development_retriever.invoke(query)
|
202 |
else:
|
203 |
-
logger.warning(f"Domain '{domain}' not recognized.
|
204 |
-
return
|
205 |
except Exception as e:
|
206 |
logger.error(f"Retrieval error for domain '{domain}': {e}")
|
207 |
return []
|
@@ -213,8 +189,8 @@ retriever = ResearchRetriever()
|
|
213 |
# ------------------------------
|
214 |
class CognitiveProcessor:
|
215 |
"""
|
216 |
-
Executes requests to the
|
217 |
-
|
218 |
"""
|
219 |
def __init__(self) -> None:
|
220 |
self.executor = ThreadPoolExecutor(max_workers=ResearchConfig.MAX_CONCURRENT_REQUESTS)
|
@@ -222,10 +198,10 @@ class CognitiveProcessor:
|
|
222 |
|
223 |
def process_query(self, prompt: str) -> Dict:
|
224 |
"""
|
225 |
-
|
226 |
"""
|
227 |
futures = []
|
228 |
-
for _ in range(3):
|
229 |
futures.append(self.executor.submit(self._execute_api_request, prompt))
|
230 |
|
231 |
results = []
|
@@ -240,63 +216,53 @@ class CognitiveProcessor:
|
|
240 |
|
241 |
def _execute_api_request(self, prompt: str) -> Dict:
|
242 |
"""
|
243 |
-
Executes a single request to
|
244 |
"""
|
245 |
-
# Use your OPENAI_API_KEY
|
246 |
headers = {
|
247 |
-
"Authorization": f"Bearer {ResearchConfig.
|
248 |
-
"Content-Type": "application/json"
|
|
|
249 |
}
|
250 |
payload = {
|
251 |
-
"model": "
|
252 |
-
"messages": [
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
}
|
257 |
-
],
|
258 |
"temperature": 0.7,
|
259 |
"max_tokens": 1500,
|
260 |
"top_p": 0.9
|
261 |
}
|
262 |
try:
|
263 |
response = requests.post(
|
264 |
-
"https://api.
|
265 |
headers=headers,
|
266 |
json=payload,
|
267 |
timeout=45
|
268 |
)
|
269 |
response.raise_for_status()
|
270 |
-
logger.info("
|
271 |
return response.json()
|
272 |
except requests.exceptions.RequestException as e:
|
273 |
-
logger.error(f"
|
274 |
return {"error": str(e)}
|
275 |
|
276 |
def _consensus_check(self, results: List[Dict]) -> Dict:
|
277 |
"""
|
278 |
-
|
279 |
"""
|
280 |
-
|
281 |
-
if not
|
282 |
logger.error("All API requests failed.")
|
283 |
return {"error": "All API requests failed"}
|
284 |
-
return max(
|
285 |
|
286 |
# ------------------------------
|
287 |
-
# Research Workflow Engine
|
288 |
# ------------------------------
|
289 |
class ResearchWorkflow:
|
290 |
"""
|
291 |
-
Defines
|
292 |
-
1. Ingest Query
|
293 |
-
2. Retrieve Documents
|
294 |
-
3. Analyze Content
|
295 |
-
4. Validate Output
|
296 |
-
5. Refine (Recursive Self-Learning)
|
297 |
-
|
298 |
-
The refine step uses iterative subproblem breakdown,
|
299 |
-
potentially combined with test-time reinforcement.
|
300 |
"""
|
301 |
def __init__(self) -> None:
|
302 |
self.processor = CognitiveProcessor()
|
@@ -305,14 +271,13 @@ class ResearchWorkflow:
|
|
305 |
self.app = self.workflow.compile()
|
306 |
|
307 |
def _build_workflow(self) -> None:
|
308 |
-
#
|
309 |
self.workflow.add_node("ingest", self.ingest_query)
|
310 |
self.workflow.add_node("retrieve", self.retrieve_documents)
|
311 |
self.workflow.add_node("analyze", self.analyze_content)
|
312 |
self.workflow.add_node("validate", self.validate_output)
|
313 |
self.workflow.add_node("refine", self.refine_results)
|
314 |
-
|
315 |
-
# Graph edges
|
316 |
self.workflow.set_entry_point("ingest")
|
317 |
self.workflow.add_edge("ingest", "retrieve")
|
318 |
self.workflow.add_edge("retrieve", "analyze")
|
@@ -326,10 +291,11 @@ class ResearchWorkflow:
|
|
326 |
|
327 |
def ingest_query(self, state: AgentState) -> Dict:
|
328 |
"""
|
329 |
-
|
330 |
"""
|
331 |
try:
|
332 |
query = state["messages"][-1].content
|
|
|
333 |
new_context = {"raw_query": query, "refine_count": 0}
|
334 |
logger.info("Query ingested.")
|
335 |
return {
|
@@ -338,11 +304,11 @@ class ResearchWorkflow:
|
|
338 |
"metadata": {"timestamp": datetime.now().isoformat()}
|
339 |
}
|
340 |
except Exception as e:
|
341 |
-
return self._error_state(f"Ingestion Error: {e}")
|
342 |
|
343 |
def retrieve_documents(self, state: AgentState) -> Dict:
|
344 |
"""
|
345 |
-
Retrieves
|
346 |
"""
|
347 |
try:
|
348 |
query = state["context"]["raw_query"]
|
@@ -350,19 +316,14 @@ class ResearchWorkflow:
|
|
350 |
logger.info(f"Retrieved {len(docs)} documents for query.")
|
351 |
return {
|
352 |
"messages": [AIMessage(content=f"Retrieved {len(docs)} documents")],
|
353 |
-
"context": {
|
354 |
-
"documents": docs,
|
355 |
-
"retrieval_time": time.time(),
|
356 |
-
"refine_count": state["context"].get("refine_count", 0)
|
357 |
-
}
|
358 |
}
|
359 |
except Exception as e:
|
360 |
-
return self._error_state(f"Retrieval Error: {e}")
|
361 |
|
362 |
def analyze_content(self, state: AgentState) -> Dict:
|
363 |
"""
|
364 |
-
|
365 |
-
returning a structured research analysis.
|
366 |
"""
|
367 |
try:
|
368 |
docs = state["context"].get("documents", [])
|
@@ -371,82 +332,66 @@ class ResearchWorkflow:
|
|
371 |
response = self.processor.process_query(prompt)
|
372 |
if "error" in response:
|
373 |
return self._error_state(response["error"])
|
374 |
-
logger.info("
|
375 |
return {
|
376 |
-
"messages": [
|
377 |
-
|
378 |
-
],
|
379 |
-
"context": {
|
380 |
-
"analysis": response,
|
381 |
-
"refine_count": state["context"].get("refine_count", 0)
|
382 |
-
}
|
383 |
}
|
384 |
except Exception as e:
|
385 |
-
return self._error_state(f"Analysis Error: {e}")
|
386 |
|
387 |
def validate_output(self, state: AgentState) -> Dict:
|
388 |
"""
|
389 |
-
Validates the analysis.
|
390 |
-
using Tufa Labs’ LADDER approach.
|
391 |
"""
|
392 |
analysis = state["messages"][-1].content
|
393 |
validation_prompt = (
|
394 |
-
f"Validate
|
395 |
"Check for:\n1. Technical accuracy\n2. Citation support\n3. Logical consistency\n4. Methodological soundness\n\n"
|
396 |
-
"Respond with 'VALID' or 'INVALID'
|
397 |
)
|
398 |
response = self.processor.process_query(validation_prompt)
|
399 |
-
logger.info("
|
400 |
return {
|
401 |
-
"messages": [
|
402 |
-
AIMessage(
|
403 |
-
content=analysis + f"\n\nValidation: {response.get('choices', [{}])[0].get('message', {}).get('content', '')}"
|
404 |
-
)
|
405 |
-
]
|
406 |
}
|
407 |
|
408 |
def refine_results(self, state: AgentState) -> Dict:
|
409 |
"""
|
410 |
-
|
411 |
-
|
412 |
"""
|
413 |
current_count = state["context"].get("refine_count", 0)
|
414 |
state["context"]["refine_count"] = current_count + 1
|
415 |
-
logger.info(f"
|
416 |
-
|
417 |
refinement_prompt = (
|
418 |
-
"Refine this analysis
|
419 |
-
|
420 |
-
"Break down complex points further, re-solve them, and enhance:\n"
|
421 |
-
"- Technical precision\n- Empirical grounding\n- Theoretical coherence"
|
422 |
)
|
423 |
response = self.processor.process_query(refinement_prompt)
|
424 |
logger.info("Refinement completed.")
|
425 |
return {
|
426 |
-
"messages": [
|
427 |
-
AIMessage(
|
428 |
-
content=response.get('choices', [{}])[0].get('message', {}).get('content', '')
|
429 |
-
)
|
430 |
-
],
|
431 |
"context": state["context"]
|
432 |
}
|
433 |
|
434 |
def _quality_check(self, state: AgentState) -> str:
|
435 |
"""
|
436 |
-
Checks
|
437 |
-
|
438 |
"""
|
439 |
refine_count = state["context"].get("refine_count", 0)
|
440 |
if refine_count >= 3:
|
441 |
-
logger.warning("Refinement limit reached. Forcing valid outcome.")
|
442 |
return "valid"
|
443 |
-
|
444 |
content = state["messages"][-1].content
|
445 |
-
|
|
|
|
|
446 |
|
447 |
def _error_state(self, message: str) -> Dict:
|
448 |
"""
|
449 |
-
Returns
|
450 |
"""
|
451 |
logger.error(message)
|
452 |
return {
|
@@ -456,12 +401,11 @@ class ResearchWorkflow:
|
|
456 |
}
|
457 |
|
458 |
# ------------------------------
|
459 |
-
# Streamlit UI
|
460 |
# ------------------------------
|
461 |
class ResearchInterface:
|
462 |
"""
|
463 |
-
Provides
|
464 |
-
with Tufa Labs' LADDER approach, using OpenAI for both embeddings & completions.
|
465 |
"""
|
466 |
def __init__(self) -> None:
|
467 |
self.workflow = ResearchWorkflow()
|
@@ -469,7 +413,7 @@ class ResearchInterface:
|
|
469 |
|
470 |
def _initialize_interface(self) -> None:
|
471 |
st.set_page_config(
|
472 |
-
page_title="
|
473 |
layout="wide",
|
474 |
initial_sidebar_state="expanded"
|
475 |
)
|
@@ -523,8 +467,8 @@ class ResearchInterface:
|
|
523 |
|
524 |
def _build_sidebar(self) -> None:
|
525 |
with st.sidebar:
|
526 |
-
st.title("🔍 Research Database
|
527 |
-
st.subheader("
|
528 |
for title, short in ResearchConfig.DOCUMENT_MAP.items():
|
529 |
with st.expander(short):
|
530 |
st.markdown(f"```\n{title}\n```")
|
@@ -533,22 +477,19 @@ class ResearchInterface:
|
|
533 |
st.metric("Embedding Dimensions", ResearchConfig.EMBEDDING_DIMENSIONS)
|
534 |
|
535 |
def _build_main_interface(self) -> None:
|
536 |
-
st.title("🧠
|
537 |
-
st.write(
|
538 |
-
"This system uses OpenAI for embeddings & completions"
|
539 |
-
)
|
540 |
query = st.text_area(
|
541 |
"Research Query:",
|
542 |
height=200,
|
543 |
-
placeholder="Enter
|
544 |
)
|
545 |
if st.button("Execute Analysis", type="primary"):
|
546 |
self._execute_analysis(query)
|
547 |
|
548 |
def _execute_analysis(self, query: str) -> None:
|
549 |
try:
|
550 |
-
with st.spinner("Initializing
|
551 |
-
#
|
552 |
results = self.workflow.app.stream({
|
553 |
"messages": [HumanMessage(content=query)],
|
554 |
"context": {},
|
@@ -565,14 +506,10 @@ class ResearchInterface:
|
|
565 |
Potential issues:
|
566 |
- Complex query structure
|
567 |
- Document correlation failure
|
568 |
-
- Rate limits or invalid API key
|
569 |
- Temporal processing constraints"""
|
570 |
)
|
571 |
|
572 |
def _render_event(self, event: Dict) -> None:
|
573 |
-
"""
|
574 |
-
Renders each event in the Streamlit UI, from ingestion to validation/refinement.
|
575 |
-
"""
|
576 |
if 'ingest' in event:
|
577 |
with st.container():
|
578 |
st.success("✅ Query Ingested")
|
@@ -587,7 +524,7 @@ Potential issues:
|
|
587 |
elif 'analyze' in event:
|
588 |
with st.container():
|
589 |
content = event['analyze']['messages'][0].content
|
590 |
-
with st.expander("
|
591 |
st.markdown(content)
|
592 |
elif 'validate' in event:
|
593 |
with st.container():
|
@@ -595,7 +532,6 @@ Potential issues:
|
|
595 |
if "VALID" in content:
|
596 |
st.success("✅ Validation Passed")
|
597 |
with st.expander("View Validated Analysis", expanded=True):
|
598 |
-
# Hide "Validation: ..." from final output
|
599 |
st.markdown(content.split("Validation:")[0])
|
600 |
else:
|
601 |
st.warning("⚠️ Validation Issues Detected")
|
|
|
|
|
|
|
|
|
1 |
import logging
|
2 |
import os
|
3 |
import re
|
|
|
8 |
from datetime import datetime
|
9 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
10 |
from typing import List, Dict, Any, Optional, Sequence
|
|
|
11 |
import chromadb
|
12 |
import requests
|
13 |
import streamlit as st
|
14 |
|
15 |
+
# LangChain and LangGraph imports
|
16 |
from langchain_openai import OpenAIEmbeddings
|
17 |
from langchain_community.vectorstores import Chroma
|
18 |
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
|
|
|
23 |
from typing_extensions import TypedDict, Annotated
|
24 |
from langchain.tools.retriever import create_retriever_tool
|
25 |
|
26 |
+
# Increase Python's recursion limit at the very start (if needed)
|
27 |
sys.setrecursionlimit(10000)
|
28 |
|
29 |
# ------------------------------
|
|
|
39 |
# State Schema Definition
|
40 |
# ------------------------------
|
41 |
class AgentState(TypedDict):
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
messages: Annotated[Sequence[AIMessage | HumanMessage | ToolMessage], add_messages]
|
43 |
context: Dict[str, Any]
|
44 |
metadata: Dict[str, Any]
|
|
|
47 |
# Configuration
|
48 |
# ------------------------------
|
49 |
class ResearchConfig:
|
50 |
+
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
CHROMA_PATH = "chroma_db"
|
52 |
CHUNK_SIZE = 512
|
53 |
CHUNK_OVERLAP = 64
|
54 |
MAX_CONCURRENT_REQUESTS = 5
|
55 |
EMBEDDING_DIMENSIONS = 1536
|
|
|
|
|
56 |
DOCUMENT_MAP = {
|
57 |
+
"Research Report: Results of a New AI Model Improving Image Recognition Accuracy to 98%":
|
58 |
+
"CV-Transformer Hybrid Architecture",
|
59 |
+
"Academic Paper Summary: Why Transformers Became the Mainstream Architecture in Natural Language Processing":
|
60 |
+
"Transformer Architecture Analysis",
|
61 |
+
"Latest Trends in Machine Learning Methods Using Quantum Computing":
|
62 |
+
"Quantum ML Frontiers"
|
63 |
}
|
|
|
|
|
64 |
ANALYSIS_TEMPLATE = (
|
65 |
+
"Analyze these technical documents with scientific rigor:\n{context}\n\n"
|
66 |
+
"Respond with:\n"
|
67 |
+
"1. Key Technical Contributions (bullet points)\n"
|
68 |
+
"2. Novel Methodologies\n"
|
69 |
+
"3. Empirical Results (with metrics)\n"
|
70 |
+
"4. Potential Applications\n"
|
71 |
+
"5. Limitations & Future Directions\n\n"
|
72 |
+
"Format: Markdown with LaTeX mathematical notation where applicable"
|
|
|
|
|
|
|
73 |
)
|
74 |
|
75 |
+
if not ResearchConfig.DEEPSEEK_API_KEY:
|
|
|
76 |
st.error(
|
77 |
+
"""**Research Portal Configuration Required**
|
78 |
+
1. Obtain DeepSeek API key: [platform.deepseek.com](https://platform.deepseek.com/)
|
79 |
+
2. Configure secret: `DEEPSEEK_API_KEY` in Space settings
|
80 |
+
3. Rebuild deployment"""
|
81 |
)
|
82 |
st.stop()
|
83 |
|
84 |
# ------------------------------
|
85 |
+
# Quantum Document Processing
|
86 |
# ------------------------------
|
87 |
+
class QuantumDocumentManager:
|
88 |
"""
|
89 |
+
Manages creation of Chroma collections from raw document texts.
|
|
|
90 |
"""
|
91 |
def __init__(self) -> None:
|
92 |
try:
|
|
|
95 |
except Exception as e:
|
96 |
logger.error(f"Error initializing PersistentClient: {e}")
|
97 |
self.client = chromadb.Client() # Fallback to in-memory client
|
|
|
|
|
98 |
self.embeddings = OpenAIEmbeddings(
|
99 |
model="text-embedding-3-large",
|
100 |
dimensions=ResearchConfig.EMBEDDING_DIMENSIONS
|
|
|
102 |
|
103 |
def create_collection(self, documents: List[str], collection_name: str) -> Chroma:
|
104 |
"""
|
105 |
+
Splits documents into chunks and stores them as a Chroma collection.
|
106 |
"""
|
107 |
splitter = RecursiveCharacterTextSplitter(
|
108 |
chunk_size=ResearchConfig.CHUNK_SIZE,
|
|
|
111 |
)
|
112 |
try:
|
113 |
docs = splitter.create_documents(documents)
|
114 |
+
logger.info(f"Created {len(docs)} document chunks for collection '{collection_name}'.")
|
115 |
except Exception as e:
|
116 |
logger.error(f"Error splitting documents: {e}")
|
117 |
raise e
|
|
|
126 |
|
127 |
def _document_id(self, content: str) -> str:
|
128 |
"""
|
129 |
+
Generates a unique document ID using SHA256 and the current timestamp.
|
130 |
"""
|
131 |
return f"{hashlib.sha256(content.encode()).hexdigest()[:16]}-{int(time.time())}"
|
132 |
|
133 |
+
# Initialize document collections
|
134 |
+
qdm = QuantumDocumentManager()
|
135 |
+
research_docs = qdm.create_collection([
|
136 |
+
"Research Report: Results of a New AI Model Improving Image Recognition Accuracy to 98%",
|
137 |
+
"Academic Paper Summary: Why Transformers Became the Mainstream Architecture in Natural Language Processing",
|
138 |
+
"Latest Trends in Machine Learning Methods Using Quantum Computing"
|
139 |
], "research")
|
140 |
|
141 |
+
development_docs = qdm.create_collection([
|
142 |
+
"Project A: UI Design Completed, API Integration in Progress",
|
143 |
+
"Project B: Testing New Feature X, Bug Fixes Needed",
|
144 |
+
"Product Y: In the Performance Optimization Stage Before Release"
|
145 |
], "development")
|
146 |
|
147 |
# ------------------------------
|
|
|
149 |
# ------------------------------
|
150 |
class ResearchRetriever:
|
151 |
"""
|
152 |
+
Provides retrieval methods for different domains.
|
|
|
153 |
"""
|
154 |
def __init__(self) -> None:
|
155 |
try:
|
|
|
168 |
|
169 |
def retrieve(self, query: str, domain: str) -> List[Any]:
|
170 |
"""
|
171 |
+
Retrieves documents based on the query and domain.
|
|
|
172 |
"""
|
173 |
try:
|
174 |
if domain == "research":
|
|
|
176 |
elif domain == "development":
|
177 |
return self.development_retriever.invoke(query)
|
178 |
else:
|
179 |
+
logger.warning(f"Domain '{domain}' not recognized.")
|
180 |
+
return []
|
181 |
except Exception as e:
|
182 |
logger.error(f"Retrieval error for domain '{domain}': {e}")
|
183 |
return []
|
|
|
189 |
# ------------------------------
|
190 |
class CognitiveProcessor:
|
191 |
"""
|
192 |
+
Executes API requests to the DeepSeek backend using triple redundancy
|
193 |
+
and consolidates results via a consensus mechanism.
|
194 |
"""
|
195 |
def __init__(self) -> None:
|
196 |
self.executor = ThreadPoolExecutor(max_workers=ResearchConfig.MAX_CONCURRENT_REQUESTS)
|
|
|
198 |
|
199 |
def process_query(self, prompt: str) -> Dict:
|
200 |
"""
|
201 |
+
Processes a query by sending multiple API requests in parallel.
|
202 |
"""
|
203 |
futures = []
|
204 |
+
for _ in range(3): # Triple redundancy for reliability
|
205 |
futures.append(self.executor.submit(self._execute_api_request, prompt))
|
206 |
|
207 |
results = []
|
|
|
216 |
|
217 |
def _execute_api_request(self, prompt: str) -> Dict:
|
218 |
"""
|
219 |
+
Executes a single API request to the DeepSeek endpoint.
|
220 |
"""
|
|
|
221 |
headers = {
|
222 |
+
"Authorization": f"Bearer {ResearchConfig.DEEPSEEK_API_KEY}",
|
223 |
+
"Content-Type": "application/json",
|
224 |
+
"X-Research-Session": self.session_id
|
225 |
}
|
226 |
payload = {
|
227 |
+
"model": "deepseek-chat",
|
228 |
+
"messages": [{
|
229 |
+
"role": "user",
|
230 |
+
"content": f"Respond as Senior AI Researcher:\n{prompt}"
|
231 |
+
}],
|
|
|
|
|
232 |
"temperature": 0.7,
|
233 |
"max_tokens": 1500,
|
234 |
"top_p": 0.9
|
235 |
}
|
236 |
try:
|
237 |
response = requests.post(
|
238 |
+
"https://api.deepseek.com/v1/chat/completions",
|
239 |
headers=headers,
|
240 |
json=payload,
|
241 |
timeout=45
|
242 |
)
|
243 |
response.raise_for_status()
|
244 |
+
logger.info("DeepSeek API request successful.")
|
245 |
return response.json()
|
246 |
except requests.exceptions.RequestException as e:
|
247 |
+
logger.error(f"DeepSeek API request failed: {e}")
|
248 |
return {"error": str(e)}
|
249 |
|
250 |
def _consensus_check(self, results: List[Dict]) -> Dict:
|
251 |
"""
|
252 |
+
Consolidates multiple API responses, selecting the one with the most content.
|
253 |
"""
|
254 |
+
valid_results = [r for r in results if "error" not in r]
|
255 |
+
if not valid_results:
|
256 |
logger.error("All API requests failed.")
|
257 |
return {"error": "All API requests failed"}
|
258 |
+
return max(valid_results, key=lambda x: len(x.get('choices', [{}])[0].get('message', {}).get('content', '')))
|
259 |
|
260 |
# ------------------------------
|
261 |
+
# Research Workflow Engine
|
262 |
# ------------------------------
|
263 |
class ResearchWorkflow:
|
264 |
"""
|
265 |
+
Defines the multi-step research workflow using a state graph.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
266 |
"""
|
267 |
def __init__(self) -> None:
|
268 |
self.processor = CognitiveProcessor()
|
|
|
271 |
self.app = self.workflow.compile()
|
272 |
|
273 |
def _build_workflow(self) -> None:
|
274 |
+
# Define nodes
|
275 |
self.workflow.add_node("ingest", self.ingest_query)
|
276 |
self.workflow.add_node("retrieve", self.retrieve_documents)
|
277 |
self.workflow.add_node("analyze", self.analyze_content)
|
278 |
self.workflow.add_node("validate", self.validate_output)
|
279 |
self.workflow.add_node("refine", self.refine_results)
|
280 |
+
# Set entry point and edges
|
|
|
281 |
self.workflow.set_entry_point("ingest")
|
282 |
self.workflow.add_edge("ingest", "retrieve")
|
283 |
self.workflow.add_edge("retrieve", "analyze")
|
|
|
291 |
|
292 |
def ingest_query(self, state: AgentState) -> Dict:
|
293 |
"""
|
294 |
+
Ingests the research query and initializes the refinement counter.
|
295 |
"""
|
296 |
try:
|
297 |
query = state["messages"][-1].content
|
298 |
+
# Initialize context with raw query and refinement counter
|
299 |
new_context = {"raw_query": query, "refine_count": 0}
|
300 |
logger.info("Query ingested.")
|
301 |
return {
|
|
|
304 |
"metadata": {"timestamp": datetime.now().isoformat()}
|
305 |
}
|
306 |
except Exception as e:
|
307 |
+
return self._error_state(f"Ingestion Error: {str(e)}")
|
308 |
|
309 |
def retrieve_documents(self, state: AgentState) -> Dict:
|
310 |
"""
|
311 |
+
Retrieves research documents based on the query.
|
312 |
"""
|
313 |
try:
|
314 |
query = state["context"]["raw_query"]
|
|
|
316 |
logger.info(f"Retrieved {len(docs)} documents for query.")
|
317 |
return {
|
318 |
"messages": [AIMessage(content=f"Retrieved {len(docs)} documents")],
|
319 |
+
"context": {"documents": docs, "retrieval_time": time.time(), "refine_count": state["context"].get("refine_count", 0)}
|
|
|
|
|
|
|
|
|
320 |
}
|
321 |
except Exception as e:
|
322 |
+
return self._error_state(f"Retrieval Error: {str(e)}")
|
323 |
|
324 |
def analyze_content(self, state: AgentState) -> Dict:
|
325 |
"""
|
326 |
+
Analyzes the retrieved documents using the DeepSeek API.
|
|
|
327 |
"""
|
328 |
try:
|
329 |
docs = state["context"].get("documents", [])
|
|
|
332 |
response = self.processor.process_query(prompt)
|
333 |
if "error" in response:
|
334 |
return self._error_state(response["error"])
|
335 |
+
logger.info("Content analysis completed.")
|
336 |
return {
|
337 |
+
"messages": [AIMessage(content=response.get('choices', [{}])[0].get('message', {}).get('content', ''))],
|
338 |
+
"context": {"analysis": response, "refine_count": state["context"].get("refine_count", 0)}
|
|
|
|
|
|
|
|
|
|
|
339 |
}
|
340 |
except Exception as e:
|
341 |
+
return self._error_state(f"Analysis Error: {str(e)}")
|
342 |
|
343 |
def validate_output(self, state: AgentState) -> Dict:
|
344 |
"""
|
345 |
+
Validates the technical analysis report.
|
|
|
346 |
"""
|
347 |
analysis = state["messages"][-1].content
|
348 |
validation_prompt = (
|
349 |
+
f"Validate research analysis:\n{analysis}\n\n"
|
350 |
"Check for:\n1. Technical accuracy\n2. Citation support\n3. Logical consistency\n4. Methodological soundness\n\n"
|
351 |
+
"Respond with 'VALID' or 'INVALID'"
|
352 |
)
|
353 |
response = self.processor.process_query(validation_prompt)
|
354 |
+
logger.info("Output validation completed.")
|
355 |
return {
|
356 |
+
"messages": [AIMessage(content=analysis + f"\n\nValidation: {response.get('choices', [{}])[0].get('message', {}).get('content', '')}")]
|
|
|
|
|
|
|
|
|
357 |
}
|
358 |
|
359 |
def refine_results(self, state: AgentState) -> Dict:
|
360 |
"""
|
361 |
+
Refines the analysis report if validation fails.
|
362 |
+
Increments the refinement counter to limit infinite loops.
|
363 |
"""
|
364 |
current_count = state["context"].get("refine_count", 0)
|
365 |
state["context"]["refine_count"] = current_count + 1
|
366 |
+
logger.info(f"Refinement iteration: {state['context']['refine_count']}")
|
|
|
367 |
refinement_prompt = (
|
368 |
+
f"Refine this analysis:\n{state['messages'][-1].content}\n\n"
|
369 |
+
"Improve:\n1. Technical precision\n2. Empirical grounding\n3. Theoretical coherence"
|
|
|
|
|
370 |
)
|
371 |
response = self.processor.process_query(refinement_prompt)
|
372 |
logger.info("Refinement completed.")
|
373 |
return {
|
374 |
+
"messages": [AIMessage(content=response.get('choices', [{}])[0].get('message', {}).get('content', ''))],
|
|
|
|
|
|
|
|
|
375 |
"context": state["context"]
|
376 |
}
|
377 |
|
378 |
def _quality_check(self, state: AgentState) -> str:
|
379 |
"""
|
380 |
+
Checks whether the analysis report is valid.
|
381 |
+
Forces a valid state if the refinement count exceeds a threshold.
|
382 |
"""
|
383 |
refine_count = state["context"].get("refine_count", 0)
|
384 |
if refine_count >= 3:
|
385 |
+
logger.warning("Refinement limit reached. Forcing valid outcome to prevent infinite recursion.")
|
386 |
return "valid"
|
|
|
387 |
content = state["messages"][-1].content
|
388 |
+
quality = "valid" if "VALID" in content else "invalid"
|
389 |
+
logger.info(f"Quality check returned: {quality}")
|
390 |
+
return quality
|
391 |
|
392 |
def _error_state(self, message: str) -> Dict:
|
393 |
"""
|
394 |
+
Returns a standardized error state.
|
395 |
"""
|
396 |
logger.error(message)
|
397 |
return {
|
|
|
401 |
}
|
402 |
|
403 |
# ------------------------------
|
404 |
+
# Research Interface (Streamlit UI)
|
405 |
# ------------------------------
|
406 |
class ResearchInterface:
|
407 |
"""
|
408 |
+
Provides the Streamlit-based interface for executing the research workflow.
|
|
|
409 |
"""
|
410 |
def __init__(self) -> None:
|
411 |
self.workflow = ResearchWorkflow()
|
|
|
413 |
|
414 |
def _initialize_interface(self) -> None:
|
415 |
st.set_page_config(
|
416 |
+
page_title="NeuroResearch AI",
|
417 |
layout="wide",
|
418 |
initial_sidebar_state="expanded"
|
419 |
)
|
|
|
467 |
|
468 |
def _build_sidebar(self) -> None:
|
469 |
with st.sidebar:
|
470 |
+
st.title("🔍 Research Database")
|
471 |
+
st.subheader("Technical Papers")
|
472 |
for title, short in ResearchConfig.DOCUMENT_MAP.items():
|
473 |
with st.expander(short):
|
474 |
st.markdown(f"```\n{title}\n```")
|
|
|
477 |
st.metric("Embedding Dimensions", ResearchConfig.EMBEDDING_DIMENSIONS)
|
478 |
|
479 |
def _build_main_interface(self) -> None:
|
480 |
+
st.title("🧠 NeuroResearch AI")
|
|
|
|
|
|
|
481 |
query = st.text_area(
|
482 |
"Research Query:",
|
483 |
height=200,
|
484 |
+
placeholder="Enter technical research question..."
|
485 |
)
|
486 |
if st.button("Execute Analysis", type="primary"):
|
487 |
self._execute_analysis(query)
|
488 |
|
489 |
def _execute_analysis(self, query: str) -> None:
|
490 |
try:
|
491 |
+
with st.spinner("Initializing Quantum Analysis..."):
|
492 |
+
# Pass a recursion limit configuration into the graph invocation
|
493 |
results = self.workflow.app.stream({
|
494 |
"messages": [HumanMessage(content=query)],
|
495 |
"context": {},
|
|
|
506 |
Potential issues:
|
507 |
- Complex query structure
|
508 |
- Document correlation failure
|
|
|
509 |
- Temporal processing constraints"""
|
510 |
)
|
511 |
|
512 |
def _render_event(self, event: Dict) -> None:
|
|
|
|
|
|
|
513 |
if 'ingest' in event:
|
514 |
with st.container():
|
515 |
st.success("✅ Query Ingested")
|
|
|
524 |
elif 'analyze' in event:
|
525 |
with st.container():
|
526 |
content = event['analyze']['messages'][0].content
|
527 |
+
with st.expander("Technical Analysis Report", expanded=True):
|
528 |
st.markdown(content)
|
529 |
elif 'validate' in event:
|
530 |
with st.container():
|
|
|
532 |
if "VALID" in content:
|
533 |
st.success("✅ Validation Passed")
|
534 |
with st.expander("View Validated Analysis", expanded=True):
|
|
|
535 |
st.markdown(content.split("Validation:")[0])
|
536 |
else:
|
537 |
st.warning("⚠️ Validation Issues Detected")
|