Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,334 +1,539 @@
|
|
1 |
# ------------------------------
|
2 |
-
# NeuroResearch
|
3 |
# ------------------------------
|
4 |
-
|
5 |
-
from langchain_community.vectorstores import Chroma
|
6 |
-
from langchain_community.retrievers import BM25Retriever
|
7 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
8 |
-
from rank_bm25 import BM25Okapi
|
9 |
-
from sentence_transformers import CrossEncoder
|
10 |
-
from typing_extensions import TypedDict, Annotated
|
11 |
-
from typing import (
|
12 |
-
Sequence, Dict, List, Optional, Any, Tuple, Union
|
13 |
-
)
|
14 |
-
|
15 |
-
import chromadb
|
16 |
import os
|
|
|
17 |
import hashlib
|
18 |
import json
|
19 |
import time
|
20 |
-
|
21 |
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
22 |
from datetime import datetime
|
|
|
|
|
23 |
|
|
|
24 |
import streamlit as st
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
# ------------------------------
|
29 |
# Configuration
|
30 |
# ------------------------------
|
31 |
-
class
|
32 |
-
"""
|
33 |
-
Configuration class for NeuroResearch system.
|
34 |
-
|
35 |
-
Attributes:
|
36 |
-
DEEPSEEK_API_KEY (str): Optional API key for external services.
|
37 |
-
CHROMA_PATH (str): File path for Chroma's persistent storage.
|
38 |
-
CHUNK_SIZE (int): Maximum length of text chunks for splitting.
|
39 |
-
CHUNK_OVERLAP (int): Overlap between text chunks to preserve context.
|
40 |
-
MAX_CONCURRENT_REQUESTS (int): Number of concurrent threads for processing.
|
41 |
-
EMBEDDING_DIMENSIONS (int): Dimensionality of embeddings.
|
42 |
-
HYBRID_RERANK_TOP_K (int): Number of documents to retrieve and rerank.
|
43 |
-
ANALYSIS_MODES (dict): Possible analysis modes and their descriptions.
|
44 |
-
CACHE_TTL (int): Time-to-live (seconds) for cached items.
|
45 |
-
"""
|
46 |
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
|
47 |
-
CHROMA_PATH = "
|
48 |
CHUNK_SIZE = 512
|
49 |
CHUNK_OVERLAP = 64
|
50 |
-
MAX_CONCURRENT_REQUESTS =
|
51 |
-
EMBEDDING_DIMENSIONS =
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
"
|
56 |
-
|
57 |
-
"
|
|
|
58 |
}
|
59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
# ------------------------------
|
62 |
-
# Document
|
63 |
# ------------------------------
|
64 |
-
class
|
65 |
"""
|
66 |
-
|
67 |
-
|
68 |
-
Responsibilities:
|
69 |
-
- Splitting documents into manageable chunks.
|
70 |
-
- Storing and retrieving embeddings with Chroma.
|
71 |
-
- Performing hybrid retrieval (vector + BM25) with cross-encoder reranking.
|
72 |
-
- Handling concurrency during document ingestion (optional).
|
73 |
"""
|
74 |
def __init__(self) -> None:
|
75 |
-
"""
|
76 |
-
Initialize the NeuralDocumentProcessor with a persistent Chroma client,
|
77 |
-
OpenAI-based embeddings, a CrossEncoder for reranking, and a text splitter.
|
78 |
-
"""
|
79 |
-
# Persistent Chroma client
|
80 |
try:
|
81 |
-
self.client = chromadb.PersistentClient(path=
|
|
|
82 |
except Exception as e:
|
83 |
-
|
84 |
-
|
85 |
-
self.client = chromadb.Client()
|
86 |
-
|
87 |
-
# Embeddings (OpenAI-based)
|
88 |
self.embeddings = OpenAIEmbeddings(
|
89 |
model="text-embedding-3-large",
|
90 |
-
dimensions=
|
91 |
)
|
92 |
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
chunk_size=
|
99 |
-
chunk_overlap=
|
100 |
-
separators=["\n\n", "\n", "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
)
|
102 |
|
103 |
-
def
|
104 |
-
self,
|
105 |
-
documents: List[str],
|
106 |
-
collection: str,
|
107 |
-
use_concurrency: bool = False
|
108 |
-
) -> Optional[Chroma]:
|
109 |
"""
|
110 |
-
|
111 |
-
Optionally uses concurrency for splitting documents.
|
112 |
-
|
113 |
-
Args:
|
114 |
-
documents (List[str]): The list of raw document texts.
|
115 |
-
collection (str): The Chroma collection name to store these documents in.
|
116 |
-
use_concurrency (bool, optional): If True, process documents concurrently. Defaults to False.
|
117 |
-
|
118 |
-
Returns:
|
119 |
-
Optional[Chroma]: The Chroma vectorstore for the specified collection, or None if no docs.
|
120 |
"""
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
# Build unique IDs for each chunk
|
146 |
-
chunk_ids = [self._quantum_id(doc.page_content) for doc in chunks]
|
147 |
-
|
148 |
-
# Create Chroma from documents
|
149 |
try:
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
|
|
156 |
)
|
157 |
-
|
158 |
except Exception as e:
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
def
|
163 |
-
self,
|
164 |
-
query: str,
|
165 |
-
collection: str,
|
166 |
-
return_scores: bool = False
|
167 |
-
) -> Union[List[str], List[Tuple[str, float]]]:
|
168 |
"""
|
169 |
-
|
170 |
-
then re-rank the combined results using a cross-encoder.
|
171 |
-
|
172 |
-
Args:
|
173 |
-
query (str): The user query for retrieving documents.
|
174 |
-
collection (str): The name of the Chroma collection to search.
|
175 |
-
return_scores (bool): If True, return a list of (document, score) tuples.
|
176 |
-
Otherwise, return a list of document strings only.
|
177 |
-
|
178 |
-
Returns:
|
179 |
-
Union[List[str], List[Tuple[str, float]]]: The top-k reranked results,
|
180 |
-
either as strings or (string, score) pairs.
|
181 |
"""
|
182 |
-
# Try to load the existing collection
|
183 |
try:
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
|
|
|
|
189 |
except Exception as e:
|
190 |
-
|
191 |
-
return []
|
192 |
|
193 |
-
|
194 |
-
stored_docs = vector_store.get()
|
195 |
-
if not stored_docs or "documents" not in stored_docs or not stored_docs["documents"]:
|
196 |
-
print(f"No documents found in collection '{collection}'.")
|
197 |
-
return [] if not return_scores else []
|
198 |
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
|
204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
try:
|
206 |
-
|
207 |
-
|
|
|
|
|
|
|
208 |
)
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
)
|
|
|
|
|
222 |
|
223 |
-
|
224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
|
226 |
-
|
227 |
-
|
228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
|
241 |
-
def
|
|
|
|
|
242 |
"""
|
243 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
244 |
|
245 |
-
|
246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
247 |
|
248 |
-
|
249 |
-
|
|
|
250 |
"""
|
251 |
-
|
|
|
|
|
|
|
|
|
|
|
252 |
|
253 |
# ------------------------------
|
254 |
-
#
|
255 |
# ------------------------------
|
256 |
-
|
257 |
"""
|
258 |
-
|
259 |
-
This function can be adapted for Hugging Face Spaces or other frontends.
|
260 |
"""
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
"Upload one or more text files",
|
271 |
-
type=["txt", "md", "pdf"],
|
272 |
-
accept_multiple_files=True
|
273 |
)
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
314 |
)
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
322 |
else:
|
323 |
-
|
324 |
-
|
325 |
-
st.markdown(
|
326 |
-
st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
|
327 |
-
else:
|
328 |
-
st.warning("No results found or collection may be empty.")
|
329 |
|
330 |
-
# ------------------------------
|
331 |
-
# Main Entry Point
|
332 |
-
# ------------------------------
|
333 |
if __name__ == "__main__":
|
334 |
-
|
|
|
1 |
# ------------------------------
|
2 |
+
# Enhanced NeuroResearch AI System
|
3 |
# ------------------------------
|
4 |
+
import logging
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
import os
|
6 |
+
import re
|
7 |
import hashlib
|
8 |
import json
|
9 |
import time
|
|
|
|
|
10 |
from datetime import datetime
|
11 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
12 |
+
from typing import List, Dict, Any, Optional, Sequence
|
13 |
|
14 |
+
import requests
|
15 |
import streamlit as st
|
16 |
+
|
17 |
+
# LangChain and LangGraph imports
|
18 |
+
from langchain_openai import OpenAIEmbeddings
|
19 |
+
from langchain_community.vectorstores import Chroma
|
20 |
+
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
|
21 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
22 |
+
from langgraph.graph import END, StateGraph
|
23 |
+
from langgraph.prebuilt import ToolNode
|
24 |
+
from langgraph.graph.message import add_messages
|
25 |
+
from typing_extensions import TypedDict, Annotated
|
26 |
+
from langchain.tools.retriever import create_retriever_tool
|
27 |
+
|
28 |
+
# ------------------------------
|
29 |
+
# Logging Configuration
|
30 |
+
# ------------------------------
|
31 |
+
# Configure the root logger once at import time: INFO level, with a
# timestamp and the level name in front of every message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
# Module-level logger used by the classes defined further down in this file.
logger = logging.getLogger(__name__)
|
36 |
+
|
37 |
+
# ------------------------------
|
38 |
+
# State Schema Definition
|
39 |
+
# ------------------------------
|
40 |
+
class AgentState(TypedDict):
    """State dictionary handed from node to node in the research workflow graph."""
    # Conversation history. The `add_messages` annotation is LangGraph's
    # reducer for message lists (returned messages are merged into the list).
    messages: Annotated[Sequence[AIMessage | HumanMessage | ToolMessage], add_messages]
    # Working data written by the workflow nodes, e.g. "raw_query",
    # "documents", "analysis" or an "error" flag.
    context: Dict[str, Any]
    # Bookkeeping values written by the nodes, e.g. "timestamp" or "status".
    metadata: Dict[str, Any]
|
44 |
|
45 |
# ------------------------------
|
46 |
# Configuration
|
47 |
# ------------------------------
|
48 |
+
class ResearchConfig:
    """
    Static settings for the research application.

    All values are plain class attributes, so they are read as
    ``ResearchConfig.NAME`` without creating an instance.
    """
    # DeepSeek credential, read from the environment; None when unset.
    DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")

    # Directory handed to Chroma's persistent client.
    CHROMA_PATH = "chroma_db"

    # Text splitter settings (passed as chunk_size / chunk_overlap).
    CHUNK_SIZE = 512
    CHUNK_OVERLAP = 64

    # Size of the thread pool used for parallel API requests.
    MAX_CONCURRENT_REQUESTS = 5

    # `dimensions` argument given to the embedding model.
    EMBEDDING_DIMENSIONS = 1536

    # Full document title -> short label shown in the sidebar.
    DOCUMENT_MAP = {
        "Research Report: Results of a New AI Model Improving Image Recognition Accuracy to 98%":
            "CV-Transformer Hybrid Architecture",
        "Academic Paper Summary: Why Transformers Became the Mainstream Architecture in Natural Language Processing":
            "Transformer Architecture Analysis",
        "Latest Trends in Machine Learning Methods Using Quantum Computing":
            "Quantum ML Frontiers",
    }

    # Prompt template for the analysis step; `{context}` receives the
    # retrieved document text via str.format().
    ANALYSIS_TEMPLATE = "\n".join([
        "Analyze these technical documents with scientific rigor:",
        "{context}",
        "",
        "Respond with:",
        "1. Key Technical Contributions (bullet points)",
        "2. Novel Methodologies",
        "3. Empirical Results (with metrics)",
        "4. Potential Applications",
        "5. Limitations & Future Directions",
        "",
        "Format: Markdown with LaTeX mathematical notation where applicable",
    ])
|
73 |
+
|
74 |
+
# Without a DeepSeek API key the analysis requests cannot be authorized, so
# show setup instructions on the page and halt this script run.
if not ResearchConfig.DEEPSEEK_API_KEY:
    st.error(
        """**Research Portal Configuration Required**
1. Obtain DeepSeek API key: [platform.deepseek.com](https://platform.deepseek.com/)
2. Configure secret: `DEEPSEEK_API_KEY` in Space settings
3. Rebuild deployment"""
    )
    # st.stop() ends execution here; nothing below this line runs.
    st.stop()
|
82 |
|
83 |
# ------------------------------
|
84 |
+
# Quantum Document Processing
|
85 |
# ------------------------------
|
86 |
+
class QuantumDocumentManager:
    """
    Manages the creation of Chroma collections from raw document texts.

    Holds one Chroma client (persistent when possible, in-memory otherwise)
    and one OpenAI embedding function shared by every collection it creates.
    """
    def __init__(self) -> None:
        """
        Create the Chroma client and the embedding function.

        A persistent client rooted at ``ResearchConfig.CHROMA_PATH`` is tried
        first; if that fails the error is logged and an in-memory client is
        used instead.
        """
        # Imported locally because the module header does not import
        # chromadb; without this both client constructors raise NameError.
        import chromadb

        try:
            self.client = chromadb.PersistentClient(path=ResearchConfig.CHROMA_PATH)
            logger.info("Initialized PersistentClient for Chroma.")
        except Exception as e:
            logger.error(f"Error initializing PersistentClient: {e}")
            self.client = chromadb.Client()  # Fallback to in-memory client
        self.embeddings = OpenAIEmbeddings(
            model="text-embedding-3-large",
            dimensions=ResearchConfig.EMBEDDING_DIMENSIONS
        )

    def create_collection(self, documents: List[str], collection_name: str) -> "Chroma":
        """
        Splits documents into chunks and stores them as a Chroma collection.

        Args:
            documents: Raw document texts.
            collection_name: Name of the Chroma collection to write to.

        Returns:
            The Chroma vector store wrapping the collection.

        Raises:
            Exception: Whatever the text splitter raises is logged and
                re-raised unchanged.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=ResearchConfig.CHUNK_SIZE,
            chunk_overlap=ResearchConfig.CHUNK_OVERLAP,
            separators=["\n\n", "\n", "|||"]
        )
        try:
            docs = splitter.create_documents(documents)
        except Exception as e:
            logger.error(f"Error splitting documents: {e}")
            raise  # bare raise keeps the original traceback
        logger.info(f"Created {len(docs)} document chunks for collection '{collection_name}'.")

        # Identical chunks share an ID; a batch must not contain the same ID
        # twice, so keep only the first chunk per ID.
        ids, unique_docs = self._unique_by_id(docs)

        return Chroma.from_documents(
            documents=unique_docs,
            embedding=self.embeddings,
            client=self.client,
            collection_name=collection_name,
            ids=ids
        )

    def _unique_by_id(self, docs: list) -> tuple:
        """
        Pair each chunk with its ID and drop chunks whose ID was already seen.

        Args:
            docs: Objects exposing a ``page_content`` string.

        Returns:
            ``(ids, unique_docs)`` as two parallel lists in first-seen order.
        """
        first_seen = {}
        for doc in docs:
            first_seen.setdefault(self._document_id(doc.page_content), doc)
        return list(first_seen.keys()), list(first_seen.values())

    def _document_id(self, content: str) -> str:
        """
        Generates a stable document ID from the chunk text.

        The ID is the first 16 hex digits of the SHA-256 of ``content``. It
        deliberately contains no timestamp: the same text always maps to the
        same ID, so re-running the script (Streamlit re-executes it on every
        interaction) writes to the existing record instead of inserting a
        new copy each time.
        NOTE(review): relies on the vector store upserting by ID - confirm
        for the installed langchain/chromadb versions.
        """
        return hashlib.sha256(content.encode()).hexdigest()[:16]
|
131 |
+
|
132 |
+
# Initialize document collections
|
133 |
+
# Shared document manager; both collections below are created through it.
qdm = QuantumDocumentManager()

# Seed texts for the "research" collection.
research_docs = qdm.create_collection(
    documents=[
        "Research Report: Results of a New AI Model Improving Image Recognition Accuracy to 98%",
        "Academic Paper Summary: Why Transformers Became the Mainstream Architecture in Natural Language Processing",
        "Latest Trends in Machine Learning Methods Using Quantum Computing",
    ],
    collection_name="research",
)

# Seed texts for the "development" collection.
development_docs = qdm.create_collection(
    documents=[
        "Project A: UI Design Completed, API Integration in Progress",
        "Project B: Testing New Feature X, Bug Fixes Needed",
        "Product Y: In the Performance Optimization Stage Before Release",
    ],
    collection_name="development",
)
|
145 |
+
|
146 |
+
# ------------------------------
|
147 |
+
# Advanced Retrieval System
|
148 |
+
# ------------------------------
|
149 |
+
class ResearchRetriever:
    """
    Routes a query to the retriever of the requested domain.

    Two retrievers are built on construction: an MMR retriever over the
    research collection and a similarity retriever over the development
    collection.
    """
    def __init__(self) -> None:
        """Build both retrievers; any failure is logged and re-raised."""
        mmr_options = {'k': 4, 'fetch_k': 20, 'lambda_mult': 0.85}
        similarity_options = {'k': 3}
        try:
            self.research_retriever = research_docs.as_retriever(
                search_type="mmr", search_kwargs=mmr_options
            )
            self.development_retriever = development_docs.as_retriever(
                search_type="similarity", search_kwargs=similarity_options
            )
            logger.info("Initialized retrievers for research and development domains.")
        except Exception as e:
            logger.error(f"Error initializing retrievers: {e}")
            raise e

    def retrieve(self, query: str, domain: str) -> List[Any]:
        """
        Return the documents found for ``query`` in ``domain``.

        Args:
            query: Search text passed to the retriever's ``invoke``.
            domain: ``"research"`` or ``"development"``.

        Returns:
            The retriever's result, or an empty list when the domain is
            unknown or the retriever raises (both cases are logged).
        """
        try:
            # Guard clause: anything but the two known domains yields nothing.
            if domain not in ("research", "development"):
                logger.warning(f"Domain '{domain}' not recognized.")
                return []
            if domain == "research":
                chosen = self.research_retriever
            else:
                chosen = self.development_retriever
            return chosen.invoke(query)
        except Exception as e:
            logger.error(f"Retrieval error for domain '{domain}': {e}")
            return []
|
183 |
|
184 |
+
# Module-level retriever instance; the workflow's retrieve step calls
# `retriever.retrieve(...)` on it.
retriever = ResearchRetriever()
|
|
|
|
|
|
|
|
|
185 |
|
186 |
+
# ------------------------------
|
187 |
+
# Cognitive Processing Unit
|
188 |
+
# ------------------------------
|
189 |
+
class CognitiveProcessor:
    """
    Executes API requests to the DeepSeek backend using triple redundancy
    and consolidates results via a consensus mechanism.

    The same prompt is sent several times in parallel; the successful
    response with the longest message text is returned.
    """
    def __init__(self) -> None:
        """Create the worker pool and a 12-hex-digit session identifier."""
        self.executor = ThreadPoolExecutor(max_workers=ResearchConfig.MAX_CONCURRENT_REQUESTS)
        # Sent with every request as the X-Research-Session header.
        self.session_id = hashlib.sha256(datetime.now().isoformat().encode()).hexdigest()[:12]

    def process_query(self, prompt: str, redundancy: int = 3) -> Dict:
        """
        Process a query by sending multiple API requests in parallel.

        Args:
            prompt: Text sent as the user message.
            redundancy: Number of identical requests to issue. Defaults to 3.

        Returns:
            The chosen API response, or ``{"error": ...}`` when no request
            succeeded.
        """
        futures = [
            self.executor.submit(self._execute_api_request, prompt)
            for _ in range(redundancy)
        ]

        results = []
        for future in as_completed(futures):
            try:
                results.append(future.result())
            except Exception as e:
                # A worker raised something _execute_api_request did not
                # convert into an error dict; report it and keep going.
                logger.error(f"Error in API request: {e}")
                st.error(f"Processing Error: {str(e)}")

        return self._consensus_check(results)

    def _execute_api_request(self, prompt: str) -> Dict:
        """
        Executes a single API request to the DeepSeek endpoint.

        Returns:
            The decoded JSON body, or ``{"error": "<message>"}`` when the
            request fails, returns an HTTP error status, or the body is not
            valid JSON.
        """
        headers = {
            "Authorization": f"Bearer {ResearchConfig.DEEPSEEK_API_KEY}",
            "Content-Type": "application/json",
            "X-Research-Session": self.session_id
        }
        payload = {
            "model": "deepseek-chat",
            "messages": [{
                "role": "user",
                "content": f"Respond as Senior AI Researcher:\n{prompt}"
            }],
            "temperature": 0.7,
            "max_tokens": 1500,
            "top_p": 0.9
        }
        try:
            response = requests.post(
                "https://api.deepseek.com/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=45
            )
            response.raise_for_status()
            body = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers JSON decode failures on requests versions
            # whose decode error is not a RequestException.
            logger.error(f"DeepSeek API request failed: {e}")
            return {"error": str(e)}
        logger.info("DeepSeek API request successful.")
        return body

    @staticmethod
    def _response_text(response: Dict) -> str:
        """
        Return the first choice's message text, or "" when it is missing.

        Tolerates an absent or empty ``choices`` list and a ``content`` that
        is not a string, so callers can take ``len()`` of the result safely.
        """
        choices = response.get("choices") or [{}]
        first = choices[0] if isinstance(choices[0], dict) else {}
        message = first.get("message") or {}
        content = message.get("content") if isinstance(message, dict) else None
        return content if isinstance(content, str) else ""

    def _consensus_check(self, results: List[Dict]) -> Dict:
        """
        Consolidates multiple API responses, selecting the one with the most content.

        Responses carrying an ``"error"`` key are discarded. When nothing is
        left, ``{"error": "All API requests failed"}`` is returned.
        """
        valid_results = [r for r in results if "error" not in r]
        if not valid_results:
            logger.error("All API requests failed.")
            return {"error": "All API requests failed"}
        # Choose the response with the longest content (first one wins ties).
        return max(valid_results, key=lambda r: len(self._response_text(r)))
|
259 |
+
|
260 |
+
# ------------------------------
|
261 |
+
# Research Workflow Engine
|
262 |
+
# ------------------------------
|
263 |
+
class ResearchWorkflow:
    """
    Defines the multi-step research workflow using a state graph.

    Flow: ingest -> retrieve -> analyze, then a quality check routes to
    validate (and the end), to refine (which loops back to retrieve), or
    straight to the end when an earlier step reported an error.
    """

    # Maximum refine -> retrieve -> analyze cycles per query. Without a bound
    # an analysis that never passes the quality check loops until the graph's
    # recursion limit aborts the run.
    MAX_REFINEMENTS = 2

    def __init__(self) -> None:
        """Create the processor, build the graph and compile it into ``self.app``."""
        self.processor = CognitiveProcessor()
        self.workflow = StateGraph(AgentState)
        self._build_workflow()
        self.app = self.workflow.compile()

    def _build_workflow(self) -> None:
        """Register the nodes and wire the edges of the graph."""
        # Define nodes
        self.workflow.add_node("ingest", self.ingest_query)
        self.workflow.add_node("retrieve", self.retrieve_documents)
        self.workflow.add_node("analyze", self.analyze_content)
        self.workflow.add_node("validate", self.validate_output)
        self.workflow.add_node("refine", self.refine_results)
        # Set entry point and edges
        self.workflow.set_entry_point("ingest")
        self.workflow.add_edge("ingest", "retrieve")
        self.workflow.add_edge("retrieve", "analyze")
        self.workflow.add_conditional_edges(
            "analyze",
            self._quality_check,
            # "error" ends the run instead of sending an error message
            # through refinement.
            {"valid": "validate", "invalid": "refine", "error": END}
        )
        self.workflow.add_edge("validate", END)
        self.workflow.add_edge("refine", "retrieve")

    @staticmethod
    def _response_text(response: Dict) -> str:
        """Return the first choice's message text of an API response, or ""."""
        choices = response.get("choices") or [{}]
        first = choices[0] if isinstance(choices[0], dict) else {}
        message = first.get("message") or {}
        content = message.get("content") if isinstance(message, dict) else None
        return content if isinstance(content, str) else ""

    @staticmethod
    def _is_valid(text: str) -> bool:
        """Return True when ``text`` contains VALID as a whole word.

        A plain substring test would also accept "INVALID".
        """
        return re.search(r"\bVALID\b", text) is not None

    def ingest_query(self, state: "AgentState") -> Dict:
        """
        Ingests the research query.

        Reads the text of the last message and stores it as
        ``context["raw_query"]``; records the ingestion time in metadata.
        """
        try:
            query = state["messages"][-1].content
            logger.info("Query ingested.")
            return {
                "messages": [AIMessage(content="Query ingested successfully")],
                "context": {"raw_query": query},
                "metadata": {"timestamp": datetime.now().isoformat()}
            }
        except Exception as e:
            return self._error_state(f"Ingestion Error: {str(e)}")

    def retrieve_documents(self, state: "AgentState") -> Dict:
        """
        Retrieves research documents based on the query.

        The returned context is the previous context plus ``documents`` and
        ``retrieval_time``. Merging matters: a returned ``context`` replaces
        the stored one, and ``raw_query`` is needed again when the refine
        loop comes back to this node.
        """
        try:
            context = state["context"]
            query = context["raw_query"]
            docs = retriever.retrieve(query, "research")
            logger.info(f"Retrieved {len(docs)} documents for query.")
            return {
                "messages": [AIMessage(content=f"Retrieved {len(docs)} documents")],
                "context": {**context, "documents": docs, "retrieval_time": time.time()}
            }
        except Exception as e:
            return self._error_state(f"Retrieval Error: {str(e)}")

    def analyze_content(self, state: "AgentState") -> Dict:
        """
        Analyzes the retrieved documents using the DeepSeek API.

        Returns an error state, without calling the API, when an earlier
        step already flagged an error.
        """
        try:
            context = state["context"]
            if context.get("error"):
                return self._error_state("Analysis skipped because an earlier step failed")
            docs = context.get("documents", [])
            docs_text = "\n\n".join([d.page_content for d in docs])
            prompt = ResearchConfig.ANALYSIS_TEMPLATE.format(context=docs_text)
            response = self.processor.process_query(prompt)
            if "error" in response:
                return self._error_state(response["error"])
            logger.info("Content analysis completed.")
            return {
                "messages": [AIMessage(content=self._response_text(response))],
                "context": {**context, "analysis": response}
            }
        except Exception as e:
            return self._error_state(f"Analysis Error: {str(e)}")

    def validate_output(self, state: "AgentState") -> Dict:
        """
        Validates the technical analysis report.

        Asks the API for a VALID/INVALID verdict on the last message and
        returns the analysis with the verdict appended after "Validation:".
        """
        analysis = state["messages"][-1].content
        validation_prompt = (
            f"Validate research analysis:\n{analysis}\n\n"
            "Check for:\n1. Technical accuracy\n2. Citation support\n3. Logical consistency\n4. Methodological soundness\n\n"
            "Respond with 'VALID' or 'INVALID'"
        )
        response = self.processor.process_query(validation_prompt)
        if "error" in response:
            # Say so explicitly rather than appending an empty verdict.
            verdict = f"UNAVAILABLE ({response['error']})"
        else:
            verdict = self._response_text(response)
        logger.info("Output validation completed.")
        return {
            "messages": [
                AIMessage(content=analysis + f"\n\nValidation: {verdict}")
            ]
        }

    def refine_results(self, state: "AgentState") -> Dict:
        """
        Refines the analysis report if validation fails.

        Also increments ``context["refinement_count"]`` so the quality check
        can stop the loop after ``MAX_REFINEMENTS`` rounds.
        """
        context = state["context"]
        refinement_prompt = (
            f"Refine this analysis:\n{state['messages'][-1].content}\n\n"
            "Improve:\n1. Technical precision\n2. Empirical grounding\n3. Theoretical coherence"
        )
        response = self.processor.process_query(refinement_prompt)
        logger.info("Refinement completed.")
        return {
            "messages": [
                AIMessage(content=self._response_text(response))
            ],
            "context": {
                **context,
                "refinement_count": context.get("refinement_count", 0) + 1
            }
        }

    def _quality_check(self, state: "AgentState") -> str:
        """
        Checks whether the analysis report is valid.

        Returns:
            ``"error"`` when the context carries an error flag, ``"valid"``
            when the last message contains the whole word VALID or the
            refinement budget is used up, otherwise ``"invalid"``.
        """
        context = state["context"]
        content = state["messages"][-1].content
        if context.get("error"):
            quality = "error"
        elif self._is_valid(content) or context.get("refinement_count", 0) >= self.MAX_REFINEMENTS:
            quality = "valid"
        else:
            quality = "invalid"
        logger.info(f"Quality check returned: {quality}")
        return quality

    def _error_state(self, message: str) -> Dict:
        """
        Returns a standardized error state.

        The message is logged, wrapped in an AIMessage, and the context is
        replaced by ``{"error": True}``.
        """
        logger.error(message)
        return {
            # NOTE(review): the prefix glyph was mojibake in the reviewed
            # text; restored as a cross mark - confirm.
            "messages": [AIMessage(content=f"❌ {message}")],
            "context": {"error": True},
            "metadata": {"status": "error"}
        }
|
400 |
|
401 |
# ------------------------------
|
402 |
+
# Research Interface (Streamlit UI)
|
403 |
# ------------------------------
|
404 |
+
class ResearchInterface:
    """
    Provides the Streamlit-based interface for executing the research workflow.

    Constructing an instance builds the workflow and renders the whole page
    (styles, sidebar and main query form).
    NOTE(review): emoji in the UI strings were mojibake in the reviewed text
    and have been restored; the sidebar-title and "Retrieved" glyphs are a
    best guess - confirm.
    """
    def __init__(self) -> None:
        """Create the workflow, then render the page."""
        self.workflow = ResearchWorkflow()
        self._initialize_interface()

    def _initialize_interface(self) -> None:
        """Configure the page and draw styles, sidebar and main area."""
        st.set_page_config(
            page_title="NeuroResearch AI",
            layout="wide",
            initial_sidebar_state="expanded"
        )
        self._inject_styles()
        self._build_sidebar()
        self._build_main_interface()

    def _inject_styles(self) -> None:
        """Inject the dark-theme CSS used by the page."""
        st.markdown(
            """
            <style>
            :root {
                --primary: #2ecc71;
                --secondary: #3498db;
                --background: #0a0a0a;
                --text: #ecf0f1;
            }
            .stApp {
                background: var(--background);
                color: var(--text);
                font-family: 'Roboto', sans-serif;
            }
            .stTextArea textarea {
                background: #1a1a1a !important;
                color: var(--text) !important;
                border: 2px solid var(--secondary);
                border-radius: 8px;
                padding: 1rem;
            }
            .stButton>button {
                background: linear-gradient(135deg, var(--primary), var(--secondary));
                border: none;
                border-radius: 8px;
                padding: 1rem 2rem;
                transition: all 0.3s;
            }
            .stButton>button:hover {
                transform: translateY(-2px);
                box-shadow: 0 4px 12px rgba(46, 204, 113, 0.3);
            }
            .stExpander {
                background: #1a1a1a;
                border: 1px solid #2a2a2a;
                border-radius: 8px;
                margin: 1rem 0;
            }
            </style>
            """,
            unsafe_allow_html=True
        )

    def _build_sidebar(self) -> None:
        """List the configured documents and two summary metrics in the sidebar."""
        with st.sidebar:
            st.title("📚 Research Database")
            st.subheader("Technical Papers")
            for title, short in ResearchConfig.DOCUMENT_MAP.items():
                with st.expander(short):
                    st.markdown(f"```\n{title}\n```")
            st.subheader("Analysis Metrics")
            st.metric("Vector Collections", 2)
            st.metric("Embedding Dimensions", ResearchConfig.EMBEDDING_DIMENSIONS)

    def _build_main_interface(self) -> None:
        """Draw the query box and run the analysis when the button is pressed."""
        st.title("🧠 NeuroResearch AI")
        query = st.text_area(
            "Research Query:",
            height=200,
            placeholder="Enter technical research question..."
        )
        if st.button("Execute Analysis", type="primary"):
            self._execute_analysis(query)

    def _execute_analysis(self, query: str) -> None:
        """
        Stream the workflow for ``query`` and render every event.

        An empty query is rejected before the workflow starts. If any node
        reports an error state, a warning replaces the success banner.
        """
        if not query or not query.strip():
            st.warning("Please enter a research query before running the analysis.")
            return
        try:
            failed = False
            with st.spinner("Initializing Quantum Analysis..."):
                results = self.workflow.app.stream({
                    "messages": [HumanMessage(content=query)],
                    "context": {},
                    "metadata": {}
                })
                for event in results:
                    self._render_event(event)
                    # Each event maps a node name to that node's state update.
                    for update in event.values():
                        if isinstance(update, dict) and (update.get("context") or {}).get("error"):
                            failed = True
            if failed:
                st.warning("Analysis finished with errors - see the messages above.")
            else:
                st.success("✅ Analysis Completed Successfully")
        except Exception as e:
            logger.error(f"Workflow execution failed: {e}")
            st.error(
                f"""**Analysis Failed**
{str(e)}
Potential issues:
- Complex query structure
- Document correlation failure
- Temporal processing constraints"""
            )

    @staticmethod
    def _validation_passed(content: str) -> bool:
        """
        Decide whether a validated report carries a passing verdict.

        Only the text after the last "Validation:" marker is inspected, and
        VALID must appear as a whole word without INVALID also appearing.
        A plain ``"VALID" in content`` test would accept "INVALID" and any
        "VALID" inside the analysis itself.
        """
        verdict = content.rsplit("Validation:", 1)[-1]
        has_valid = re.search(r"\bVALID\b", verdict) is not None
        has_invalid = re.search(r"\bINVALID\b", verdict) is not None
        return has_valid and not has_invalid

    def _render_event(self, event: Dict) -> None:
        """Render one streamed workflow event, keyed by the node that produced it."""
        if 'ingest' in event:
            with st.container():
                st.success("✅ Query Ingested")
        elif 'retrieve' in event:
            with st.container():
                docs = event['retrieve']['context'].get('documents', [])
                st.info(f"🔍 Retrieved {len(docs)} documents")
                with st.expander("View Retrieved Documents", expanded=False):
                    for idx, doc in enumerate(docs, start=1):
                        st.markdown(f"**Document {idx}**")
                        st.code(doc.page_content, language='text')
        elif 'analyze' in event:
            with st.container():
                update = event['analyze']
                content = update['messages'][0].content
                if (update.get('context') or {}).get('error'):
                    # The node returned an error state, not a report.
                    st.error(content)
                else:
                    with st.expander("Technical Analysis Report", expanded=True):
                        st.markdown(content)
        elif 'validate' in event:
            with st.container():
                content = event['validate']['messages'][0].content
                if self._validation_passed(content):
                    st.success("✅ Validation Passed")
                    with st.expander("View Validated Analysis", expanded=True):
                        # Split on the last marker so an analysis that itself
                        # mentions "Validation:" is not cut short.
                        st.markdown(content.rsplit("Validation:", 1)[0])
                else:
                    st.warning("⚠️ Validation Issues Detected")
                    with st.expander("View Validation Details", expanded=True):
                        st.markdown(content)
|
|
|
|
|
|
|
537 |
|
|
|
|
|
|
|
538 |
if __name__ == "__main__":
    # Constructing the interface builds the workflow and renders the page;
    # the instance itself is not kept.
    ResearchInterface()
|