mgbam committed on
Commit 7370048 · verified · Parent: de3ef7d

Update app.py

Files changed (1):
  1. app.py +474 -269
app.py CHANGED
@@ -1,334 +1,539 @@
 # ------------------------------
-# NeuroResearch 2.1: Robust Research System
 # ------------------------------
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.vectorstores import Chroma
-from langchain_community.retrievers import BM25Retriever
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from rank_bm25 import BM25Okapi
-from sentence_transformers import CrossEncoder
-from typing_extensions import TypedDict, Annotated
-from typing import (
-    Sequence, Dict, List, Optional, Any, Tuple, Union
-)
-
-import chromadb
 import os
 import hashlib
 import json
 import time
-
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime

 import streamlit as st
-import plotly.express as px
-import pandas as pd

 # ------------------------------
 # Configuration
 # ------------------------------
-class NeuroConfig:
-    """
-    Configuration class for NeuroResearch system.
-
-    Attributes:
-        DEEPSEEK_API_KEY (str): Optional API key for external services.
-        CHROMA_PATH (str): File path for Chroma's persistent storage.
-        CHUNK_SIZE (int): Maximum length of text chunks for splitting.
-        CHUNK_OVERLAP (int): Overlap between text chunks to preserve context.
-        MAX_CONCURRENT_REQUESTS (int): Number of concurrent threads for processing.
-        EMBEDDING_DIMENSIONS (int): Dimensionality of embeddings.
-        HYBRID_RERANK_TOP_K (int): Number of documents to retrieve and rerank.
-        ANALYSIS_MODES (dict): Possible analysis modes and their descriptions.
-        CACHE_TTL (int): Time-to-live (seconds) for cached items.
-    """
     DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
-    CHROMA_PATH = "neuro_db"
     CHUNK_SIZE = 512
     CHUNK_OVERLAP = 64
-    MAX_CONCURRENT_REQUESTS = 7
-    EMBEDDING_DIMENSIONS = 3072
-    HYBRID_RERANK_TOP_K = 15
-    ANALYSIS_MODES = {
-        "technical": "Deep Technical Analysis",
-        "comparative": "Cross-Paper Comparison",
-        "temporal": "Temporal Trend Analysis",
-        "critical": "Critical Literature Review"
     }
-    CACHE_TTL = 3600  # 1 hour

 # ------------------------------
-# Document Processor
 # ------------------------------
-class NeuralDocumentProcessor:
     """
-    A document processing and retrieval utility class.
-
-    Responsibilities:
-    - Splitting documents into manageable chunks.
-    - Storing and retrieving embeddings with Chroma.
-    - Performing hybrid retrieval (vector + BM25) with cross-encoder reranking.
-    - Handling concurrency during document ingestion (optional).
     """
     def __init__(self) -> None:
-        """
-        Initialize the NeuralDocumentProcessor with a persistent Chroma client,
-        OpenAI-based embeddings, a CrossEncoder for reranking, and a text splitter.
-        """
-        # Persistent Chroma client
         try:
-            self.client = chromadb.PersistentClient(path=NeuroConfig.CHROMA_PATH)
         except Exception as e:
-            # Fallback to in-memory client if persistent fails
-            print(f"Error initializing Chroma PersistentClient: {e}")
-            self.client = chromadb.Client()
-
-        # Embeddings (OpenAI-based)
         self.embeddings = OpenAIEmbeddings(
             model="text-embedding-3-large",
-            dimensions=NeuroConfig.EMBEDDING_DIMENSIONS
         )

-        # Cross-encoder for reranking
-        self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
-
-        # Text splitter
-        self.text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=NeuroConfig.CHUNK_SIZE,
-            chunk_overlap=NeuroConfig.CHUNK_OVERLAP,
-            separators=["\n\n", "\n", "(?<=\\. )", " "],
         )

-    def process_documents(
-        self,
-        documents: List[str],
-        collection: str,
-        use_concurrency: bool = False
-    ) -> Optional[Chroma]:
         """
-        Process a list of document strings by splitting, embedding, and storing them in Chroma.
-        Optionally uses concurrency for splitting documents.
-
-        Args:
-            documents (List[str]): The list of raw document texts.
-            collection (str): The Chroma collection name to store these documents in.
-            use_concurrency (bool, optional): If True, process documents concurrently. Defaults to False.
-
-        Returns:
-            Optional[Chroma]: The Chroma vectorstore for the specified collection, or None if no docs.
         """
-        if not documents:
-            print("No documents provided to process_documents.")
-            return None
-
-        # Split documents into chunks
-        if use_concurrency and len(documents) > 1:
-            chunks = []
-            with ThreadPoolExecutor(max_workers=NeuroConfig.MAX_CONCURRENT_REQUESTS) as executor:
-                future_to_doc = {
-                    executor.submit(self.text_splitter.create_documents, [doc]): doc
-                    for doc in documents
-                }
-                for future in as_completed(future_to_doc):
-                    try:
-                        result = future.result()
-                        chunks.extend(result)
-                    except Exception as e:
-                        print(f"Error splitting document: {e}")
-        else:
-            # Single-threaded splitting
-            chunks = []
-            for doc in documents:
-                chunks.extend(self.text_splitter.create_documents([doc]))
-
-        # Build unique IDs for each chunk
-        chunk_ids = [self._quantum_id(doc.page_content) for doc in chunks]
-
-        # Create Chroma from documents
         try:
-            vectorstore = Chroma.from_documents(
-                documents=chunks,
-                embedding=self.embeddings,
-                client=self.client,
-                collection_name=collection,
-                ids=chunk_ids
             )
-            return vectorstore
         except Exception as e:
-            print(f"Error creating Chroma collection: {e}")
-            return None
-
-    def hybrid_retrieval(
-        self,
-        query: str,
-        collection: str,
-        return_scores: bool = False
-    ) -> Union[List[str], List[Tuple[str, float]]]:
         """
-        Perform hybrid retrieval combining vector-based search with BM25,
-        then re-rank the combined results using a cross-encoder.
-
-        Args:
-            query (str): The user query for retrieving documents.
-            collection (str): The name of the Chroma collection to search.
-            return_scores (bool): If True, return a list of (document, score) tuples.
-                Otherwise, return a list of document strings only.
-
-        Returns:
-            Union[List[str], List[Tuple[str, float]]]: The top-k reranked results,
-            either as strings or (string, score) pairs.
         """
-        # Try to load the existing collection
         try:
-            vector_store = Chroma(
-                client=self.client,
-                collection_name=collection,
-                embedding_function=self.embeddings
-            )
         except Exception as e:
-            print(f"Error loading Chroma collection '{collection}': {e}")
-            return [] if not return_scores else []

-        # Check if the collection is empty
-        stored_docs = vector_store.get()
-        if not stored_docs or "documents" not in stored_docs or not stored_docs["documents"]:
-            print(f"No documents found in collection '{collection}'.")
-            return [] if not return_scores else []

-        all_docs = [doc.page_content for doc in stored_docs["documents"]]
-        if not all_docs:
-            print(f"No documents found in collection '{collection}'.")
-            return [] if not return_scores else []

-        # Vector-based retrieval
         try:
-            vector_retriever = vector_store.as_retriever(
-                search_kwargs={"k": NeuroConfig.HYBRID_RERANK_TOP_K}
             )
-            vector_results = [doc.page_content for doc in vector_retriever.invoke(query)]
-        except Exception as e:
-            print(f"Error during vector retrieval: {e}")
-            vector_results = []
-
-        # BM25 retrieval
-        tokenized_docs = [doc.split() for doc in all_docs]
-        bm25 = BM25Okapi(tokenized_docs)
-        bm25_results = bm25.get_top_n(
-            query.split(),
-            all_docs,
-            n=NeuroConfig.HYBRID_RERANK_TOP_K
         )

-        # Combine results and remove duplicates
-        combined = list(set(vector_results + bm25_results))

-        if not combined:
-            print("No documents retrieved by either BM25 or vector search.")
-            return [] if not return_scores else []

-        # Cross-encoder reranking
-        scores = self.cross_encoder.predict([(query, doc) for doc in combined])
-        reranked = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
-        top_results = reranked[:NeuroConfig.HYBRID_RERANK_TOP_K]

-        # Return based on user preference
-        if return_scores:
-            return top_results  # List[Tuple[str, float]]
-        else:
-            return [doc for doc, _ in top_results]

-    def _quantum_id(self, content: str) -> str:
         """
-        Create a unique ID for each text chunk by hashing its content.

-        Args:
-            content (str): The text content of the chunk.

-        Returns:
-            str: A unique hash-based identifier.
         """
-        return f"neuro_{hashlib.sha3_256(content.encode()).hexdigest()[:24]}"

 # ------------------------------
-# NeuroInterface (Streamlit Example)
 # ------------------------------
-def NeuroInterface() -> None:
     """
-    A basic Streamlit-based interface to demonstrate usage of the NeuralDocumentProcessor.
-    This function can be adapted for Hugging Face Spaces or other frontends.
     """
-    st.title("NeuroResearch 2.1: Robust Research System")
-
-    # Initialize Document Processor
-    processor = NeuralDocumentProcessor()
-
-    # Sidebar for uploading and processing documents
-    with st.sidebar:
-        st.header("Document Ingestion")
-        uploaded_files = st.file_uploader(
-            "Upload one or more text files",
-            type=["txt", "md", "pdf"],
-            accept_multiple_files=True
         )
-        collection_name = st.text_input("Collection Name", value="default_collection")
-
-        use_concurrency = st.checkbox("Use Concurrency for Processing?", value=False)
-
-        if st.button("Process Documents"):
-            if uploaded_files and collection_name.strip():
-                # Read files
-                docs_content = []
-                for uf in uploaded_files:
-                    content = uf.read()
-                    # Assume UTF-8; adapt as needed
-                    try:
-                        docs_content.append(content.decode("utf-8"))
-                    except UnicodeDecodeError:
-                        st.error(f"Could not decode {uf.name}. Make sure it's UTF-8 text.")
-                st.write("Processing documents...")
-                vectorstore = processor.process_documents(
-                    documents=docs_content,
-                    collection=collection_name,
-                    use_concurrency=use_concurrency
-                )
-                if vectorstore:
-                    st.success(f"Documents processed and stored in collection: {collection_name}")
-                else:
-                    st.error("Processing failed or returned no vectorstore.")
-
-    # Main interface for querying
-    st.subheader("Query Documents")
-    user_query = st.text_input("Enter your query:")
-    return_scores = st.checkbox("Return Scores?")
-
-    if st.button("Search"):
-        if not user_query.strip() or not collection_name.strip():
-            st.warning("Please provide both a query and a valid collection name.")
-        else:
-            st.write(f"Retrieving from collection: {collection_name}")
-            results = processor.hybrid_retrieval(
-                query=user_query,
-                collection=collection_name,
-                return_scores=return_scores
             )
-            if results:
-                st.write("Top Reranked Results:")
-                if return_scores:
-                    # Each result is (doc, score)
-                    for idx, (doc, score) in enumerate(results, start=1):
-                        st.markdown(f"**Result {idx} | Score: {score:.4f}**")
-                        st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
                 else:
-                    # Just doc texts
-                    for idx, doc in enumerate(results, start=1):
-                        st.markdown(f"**Result {idx}**")
-                        st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
-            else:
-                st.warning("No results found or collection may be empty.")

-# ------------------------------
-# Main Entry Point
-# ------------------------------
 if __name__ == "__main__":
-    NeuroInterface()
 
 # ------------------------------
+# Enhanced NeuroResearch AI System
 # ------------------------------
+import logging
 import os
+import re
 import hashlib
 import json
 import time
 from datetime import datetime
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import List, Dict, Any, Optional, Sequence

+import chromadb  # required by the Chroma clients in QuantumDocumentManager below
+import requests
 import streamlit as st
+
+# LangChain and LangGraph imports
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langgraph.graph import END, StateGraph
+from langgraph.prebuilt import ToolNode
+from langgraph.graph.message import add_messages
+from typing_extensions import TypedDict, Annotated
+from langchain.tools.retriever import create_retriever_tool
+
+# ------------------------------
+# Logging Configuration
+# ------------------------------
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+# ------------------------------
+# State Schema Definition
+# ------------------------------
+class AgentState(TypedDict):
+    messages: Annotated[Sequence[AIMessage | HumanMessage | ToolMessage], add_messages]
+    context: Dict[str, Any]
+    metadata: Dict[str, Any]
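+    # Note: the add_messages reducer appends each node's returned "messages"
+    # to the running state instead of replacing them; "context" and "metadata"
+    # have no reducer, so any node that returns them overwrites prior values.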

 # ------------------------------
 # Configuration
 # ------------------------------
+class ResearchConfig:
     DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
+    CHROMA_PATH = "chroma_db"
     CHUNK_SIZE = 512
     CHUNK_OVERLAP = 64
+    MAX_CONCURRENT_REQUESTS = 5
+    EMBEDDING_DIMENSIONS = 1536
+    DOCUMENT_MAP = {
+        "Research Report: Results of a New AI Model Improving Image Recognition Accuracy to 98%":
+            "CV-Transformer Hybrid Architecture",
+        "Academic Paper Summary: Why Transformers Became the Mainstream Architecture in Natural Language Processing":
+            "Transformer Architecture Analysis",
+        "Latest Trends in Machine Learning Methods Using Quantum Computing":
+            "Quantum ML Frontiers"
     }
+    ANALYSIS_TEMPLATE = (
+        "Analyze these technical documents with scientific rigor:\n{context}\n\n"
+        "Respond with:\n"
+        "1. Key Technical Contributions (bullet points)\n"
+        "2. Novel Methodologies\n"
+        "3. Empirical Results (with metrics)\n"
+        "4. Potential Applications\n"
+        "5. Limitations & Future Directions\n\n"
+        "Format: Markdown with LaTeX mathematical notation where applicable"
+    )
+
+if not ResearchConfig.DEEPSEEK_API_KEY:
+    st.error(
+        """**Research Portal Configuration Required**
+        1. Obtain DeepSeek API key: [platform.deepseek.com](https://platform.deepseek.com/)
+        2. Configure secret: `DEEPSEEK_API_KEY` in Space settings
+        3. Rebuild deployment"""
+    )
+    st.stop()
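+# st.stop() ends this Streamlit script run immediately, so the ingestion and
+# UI code below never executes until the DEEPSEEK_API_KEY secret is configured.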

 # ------------------------------
+# Quantum Document Processing
 # ------------------------------
+class QuantumDocumentManager:
     """
+    Manages the creation of Chroma collections from raw document texts.
     """
     def __init__(self) -> None:
         try:
+            self.client = chromadb.PersistentClient(path=ResearchConfig.CHROMA_PATH)
+            logger.info("Initialized PersistentClient for Chroma.")
         except Exception as e:
+            logger.error(f"Error initializing PersistentClient: {e}")
+            self.client = chromadb.Client()  # Fallback to in-memory client
         self.embeddings = OpenAIEmbeddings(
             model="text-embedding-3-large",
+            dimensions=ResearchConfig.EMBEDDING_DIMENSIONS
         )

+    def create_collection(self, documents: List[str], collection_name: str) -> Chroma:
+        """
+        Splits documents into chunks and stores them as a Chroma collection.
+        """
+        splitter = RecursiveCharacterTextSplitter(
+            chunk_size=ResearchConfig.CHUNK_SIZE,
+            chunk_overlap=ResearchConfig.CHUNK_OVERLAP,
+            separators=["\n\n", "\n", "|||"]
+        )
+        try:
+            docs = splitter.create_documents(documents)
+            logger.info(f"Created {len(docs)} document chunks for collection '{collection_name}'.")
+        except Exception as e:
+            logger.error(f"Error splitting documents: {e}")
+            raise e
+
+        return Chroma.from_documents(
+            documents=docs,
+            embedding=self.embeddings,
+            client=self.client,
+            collection_name=collection_name,
+            ids=[self._document_id(doc.page_content) for doc in docs]
         )

+    def _document_id(self, content: str) -> str:
         """
+        Generates a unique document ID using SHA256 and the current timestamp.
         """
+        return f"{hashlib.sha256(content.encode()).hexdigest()[:16]}-{int(time.time())}"
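+        # Note: each ID couples a content hash with an ingestion timestamp, so
+        # re-running the app stores identical text again under new IDs rather
+        # than deduplicating against chunks already in the collection.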
+
+# Initialize document collections
+qdm = QuantumDocumentManager()
+research_docs = qdm.create_collection([
+    "Research Report: Results of a New AI Model Improving Image Recognition Accuracy to 98%",
+    "Academic Paper Summary: Why Transformers Became the Mainstream Architecture in Natural Language Processing",
+    "Latest Trends in Machine Learning Methods Using Quantum Computing"
+], "research")
+
+development_docs = qdm.create_collection([
+    "Project A: UI Design Completed, API Integration in Progress",
+    "Project B: Testing New Feature X, Bug Fixes Needed",
+    "Product Y: In the Performance Optimization Stage Before Release"
+], "development")
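+# Note: this ingestion runs at module import time, so Streamlit re-executes it
+# on every script rerun; caching it (e.g. with st.cache_resource) would avoid
+# re-embedding the same documents on each interaction.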
+
+# ------------------------------
+# Advanced Retrieval System
+# ------------------------------
+class ResearchRetriever:
+    """
+    Provides retrieval methods for different domains.
+    """
+    def __init__(self) -> None:
         try:
+            self.research_retriever = research_docs.as_retriever(
+                search_type="mmr",
+                search_kwargs={'k': 4, 'fetch_k': 20, 'lambda_mult': 0.85}
+            )
+            self.development_retriever = development_docs.as_retriever(
+                search_type="similarity",
+                search_kwargs={'k': 3}
             )
+            logger.info("Initialized retrievers for research and development domains.")
         except Exception as e:
+            logger.error(f"Error initializing retrievers: {e}")
+            raise e
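+        # MMR above fetches fetch_k=20 candidates and returns k=4;
+        # lambda_mult=0.85 favors query relevance over result diversity
+        # (1.0 = pure similarity ranking, 0.0 = maximum diversity).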
+
+    def retrieve(self, query: str, domain: str) -> List[Any]:
         """
+        Retrieves documents based on the query and domain.
         """
         try:
+            if domain == "research":
+                return self.research_retriever.invoke(query)
+            elif domain == "development":
+                return self.development_retriever.invoke(query)
+            else:
+                logger.warning(f"Domain '{domain}' not recognized.")
+                return []
         except Exception as e:
+            logger.error(f"Retrieval error for domain '{domain}': {e}")
+            return []

+retriever = ResearchRetriever()

+# ------------------------------
+# Cognitive Processing Unit
+# ------------------------------
+class CognitiveProcessor:
+    """
+    Executes API requests to the DeepSeek backend using triple redundancy
+    and consolidates results via a consensus mechanism.
+    """
+    def __init__(self) -> None:
+        self.executor = ThreadPoolExecutor(max_workers=ResearchConfig.MAX_CONCURRENT_REQUESTS)
+        self.session_id = hashlib.sha256(datetime.now().isoformat().encode()).hexdigest()[:12]

+    def process_query(self, prompt: str) -> Dict:
+        """
+        Process a query by sending multiple API requests in parallel.
+        """
+        futures = []
+        for _ in range(3):  # Triple redundancy for reliability
+            futures.append(self.executor.submit(self._execute_api_request, prompt))
+
+        results = []
+        for future in as_completed(futures):
+            try:
+                results.append(future.result())
+            except Exception as e:
+                logger.error(f"Error in API request: {e}")
+                st.error(f"Processing Error: {str(e)}")
+
+        return self._consensus_check(results)
+
+    def _execute_api_request(self, prompt: str) -> Dict:
+        """
+        Executes a single API request to the DeepSeek endpoint.
+        """
+        headers = {
+            "Authorization": f"Bearer {ResearchConfig.DEEPSEEK_API_KEY}",
+            "Content-Type": "application/json",
+            "X-Research-Session": self.session_id
+        }
+        payload = {
+            "model": "deepseek-chat",
+            "messages": [{
+                "role": "user",
+                "content": f"Respond as Senior AI Researcher:\n{prompt}"
+            }],
+            "temperature": 0.7,
+            "max_tokens": 1500,
+            "top_p": 0.9
+        }
         try:
+            response = requests.post(
+                "https://api.deepseek.com/v1/chat/completions",
+                headers=headers,
+                json=payload,
+                timeout=45
             )
+            response.raise_for_status()
+            logger.info("DeepSeek API request successful.")
+            return response.json()
+        except requests.exceptions.RequestException as e:
+            logger.error(f"DeepSeek API request failed: {e}")
+            return {"error": str(e)}
+
+    def _consensus_check(self, results: List[Dict]) -> Dict:
+        """
+        Consolidates multiple API responses, selecting the one with the most content.
+        """
+        valid_results = [r for r in results if "error" not in r]
+        if not valid_results:
+            logger.error("All API requests failed.")
+            return {"error": "All API requests failed"}
+        # Choose the response with the longest content
+        return max(valid_results, key=lambda x: len(x.get('choices', [{}])[0].get('message', {}).get('content', '')))
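+        # Note: this "consensus" is a longest-answer heuristic rather than a
+        # vote: with temperature 0.7 the three replies differ, and the most
+        # detailed one wins.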
+
+# ------------------------------
+# Research Workflow Engine
+# ------------------------------
+class ResearchWorkflow:
+    """
+    Defines the multi-step research workflow using a state graph.
+    """
+    def __init__(self) -> None:
+        self.processor = CognitiveProcessor()
+        self.workflow = StateGraph(AgentState)
+        self._build_workflow()
+        self.app = self.workflow.compile()
+
+    def _build_workflow(self) -> None:
+        # Define nodes
+        self.workflow.add_node("ingest", self.ingest_query)
+        self.workflow.add_node("retrieve", self.retrieve_documents)
+        self.workflow.add_node("analyze", self.analyze_content)
+        self.workflow.add_node("validate", self.validate_output)
+        self.workflow.add_node("refine", self.refine_results)
+        # Set entry point and edges
+        self.workflow.set_entry_point("ingest")
+        self.workflow.add_edge("ingest", "retrieve")
+        self.workflow.add_edge("retrieve", "analyze")
+        self.workflow.add_conditional_edges(
+            "analyze",
+            self._quality_check,
+            {"valid": "validate", "invalid": "refine"}
         )
+        self.workflow.add_edge("validate", END)
+        self.workflow.add_edge("refine", "retrieve")
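+        # Resulting topology:
+        #   ingest -> retrieve -> analyze --valid--> validate -> END
+        #                ^                 --invalid--> refine
+        #                '--------------------------------'
+        # Note: refine -> retrieve can loop repeatedly if validation keeps
+        # failing; only LangGraph's recursion limit bounds the retries.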

+    def ingest_query(self, state: AgentState) -> Dict:
+        """
+        Ingests the research query.
+        """
+        try:
+            query = state["messages"][-1].content
+            logger.info("Query ingested.")
+            return {
+                "messages": [AIMessage(content="Query ingested successfully")],
+                "context": {"raw_query": query},
+                "metadata": {"timestamp": datetime.now().isoformat()}
+            }
+        except Exception as e:
+            return self._error_state(f"Ingestion Error: {str(e)}")

+    def retrieve_documents(self, state: AgentState) -> Dict:
+        """
+        Retrieves research documents based on the query.
+        """
+        try:
+            query = state["context"]["raw_query"]
+            docs = retriever.retrieve(query, "research")
+            logger.info(f"Retrieved {len(docs)} documents for query.")
+            return {
+                "messages": [AIMessage(content=f"Retrieved {len(docs)} documents")],
+                "context": {"documents": docs, "retrieval_time": time.time()}
+            }
+        except Exception as e:
+            return self._error_state(f"Retrieval Error: {str(e)}")

+    def analyze_content(self, state: AgentState) -> Dict:
+        """
+        Analyzes the retrieved documents using the DeepSeek API.
+        """
+        try:
+            docs = state["context"].get("documents", [])
+            docs_text = "\n\n".join([d.page_content for d in docs])
+            prompt = ResearchConfig.ANALYSIS_TEMPLATE.format(context=docs_text)
+            response = self.processor.process_query(prompt)
+            if "error" in response:
+                return self._error_state(response["error"])
+            logger.info("Content analysis completed.")
+            return {
+                "messages": [AIMessage(content=response.get('choices', [{}])[0].get('message', {}).get('content', ''))],
+                "context": {"analysis": response}
+            }
+        except Exception as e:
+            return self._error_state(f"Analysis Error: {str(e)}")
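+        # Note: "context" has no reducer in AgentState, so each node's return
+        # replaces the whole dict; retrieve_documents drops "raw_query", which
+        # breaks the refine -> retrieve loop unless the query is re-supplied.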

+    def validate_output(self, state: AgentState) -> Dict:
+        """
+        Validates the technical analysis report.
+        """
+        analysis = state["messages"][-1].content
+        validation_prompt = (
+            f"Validate research analysis:\n{analysis}\n\n"
+            "Check for:\n1. Technical accuracy\n2. Citation support\n3. Logical consistency\n4. Methodological soundness\n\n"
+            "Respond with 'VALID' or 'INVALID'"
+        )
+        response = self.processor.process_query(validation_prompt)
+        logger.info("Output validation completed.")
+        return {
+            "messages": [
+                AIMessage(
+                    content=analysis +
+                    f"\n\nValidation: {response.get('choices', [{}])[0].get('message', {}).get('content', '')}"
+                )
+            ]
+        }

+    def refine_results(self, state: AgentState) -> Dict:
+        """
+        Refines the analysis report if validation fails.
         """
+        refinement_prompt = (
+            f"Refine this analysis:\n{state['messages'][-1].content}\n\n"
+            "Improve:\n1. Technical precision\n2. Empirical grounding\n3. Theoretical coherence"
+        )
+        response = self.processor.process_query(refinement_prompt)
+        logger.info("Refinement completed.")
+        return {
+            "messages": [
+                AIMessage(
+                    content=response.get('choices', [{}])[0].get('message', {}).get('content', '')
+                )
+            ],
+            "context": state["context"]
+        }

+    def _quality_check(self, state: AgentState) -> str:
+        """
+        Checks whether the analysis report is valid.
+        """
+        content = state["messages"][-1].content
+        # Test the negative verdict first: "VALID" is a substring of "INVALID".
+        quality = "valid" if "VALID" in content and "INVALID" not in content else "invalid"
+        logger.info(f"Quality check returned: {quality}")
+        return quality

+    def _error_state(self, message: str) -> Dict:
+        """
+        Returns a standardized error state.
         """
+        logger.error(message)
+        return {
+            "messages": [AIMessage(content=f"❌ {message}")],
+            "context": {"error": True},
+            "metadata": {"status": "error"}
+        }
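+    # Note: validate_output appends "Validation: ..." to the report text; the
+    # Streamlit renderer below splits on that exact marker to separate the
+    # analysis from the verdict.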

 # ------------------------------
+# Research Interface (Streamlit UI)
 # ------------------------------
+class ResearchInterface:
     """
+    Provides the Streamlit-based interface for executing the research workflow.
     """
+    def __init__(self) -> None:
+        self.workflow = ResearchWorkflow()
+        self._initialize_interface()
+
+    def _initialize_interface(self) -> None:
+        st.set_page_config(
+            page_title="NeuroResearch AI",
+            layout="wide",
+            initial_sidebar_state="expanded"
         )
+        self._inject_styles()
+        self._build_sidebar()
+        self._build_main_interface()
+
+    def _inject_styles(self) -> None:
+        st.markdown(
+            """
+            <style>
+            :root {
+                --primary: #2ecc71;
+                --secondary: #3498db;
+                --background: #0a0a0a;
+                --text: #ecf0f1;
+            }
+            .stApp {
+                background: var(--background);
+                color: var(--text);
+                font-family: 'Roboto', sans-serif;
+            }
+            .stTextArea textarea {
+                background: #1a1a1a !important;
+                color: var(--text) !important;
+                border: 2px solid var(--secondary);
+                border-radius: 8px;
+                padding: 1rem;
+            }
+            .stButton>button {
+                background: linear-gradient(135deg, var(--primary), var(--secondary));
+                border: none;
+                border-radius: 8px;
+                padding: 1rem 2rem;
+                transition: all 0.3s;
+            }
+            .stButton>button:hover {
+                transform: translateY(-2px);
+                box-shadow: 0 4px 12px rgba(46, 204, 113, 0.3);
+            }
+            .stExpander {
+                background: #1a1a1a;
+                border: 1px solid #2a2a2a;
+                border-radius: 8px;
+                margin: 1rem 0;
+            }
+            </style>
+            """,
+            unsafe_allow_html=True
+        )
+
+    def _build_sidebar(self) -> None:
+        with st.sidebar:
+            st.title("🔍 Research Database")
+            st.subheader("Technical Papers")
+            for title, short in ResearchConfig.DOCUMENT_MAP.items():
+                with st.expander(short):
+                    st.markdown(f"```\n{title}\n```")
+            st.subheader("Analysis Metrics")
+            st.metric("Vector Collections", 2)
+            st.metric("Embedding Dimensions", ResearchConfig.EMBEDDING_DIMENSIONS)
+
+    def _build_main_interface(self) -> None:
+        st.title("🧠 NeuroResearch AI")
+        query = st.text_area(
+            "Research Query:",
+            height=200,
+            placeholder="Enter technical research question..."
+        )
+        if st.button("Execute Analysis", type="primary"):
+            self._execute_analysis(query)
+
+    def _execute_analysis(self, query: str) -> None:
+        try:
+            with st.spinner("Initializing Quantum Analysis..."):
+                results = self.workflow.app.stream({
+                    "messages": [HumanMessage(content=query)],
+                    "context": {},
+                    "metadata": {}
+                })
+                for event in results:
+                    self._render_event(event)
+            st.success("✅ Analysis Completed Successfully")
+        except Exception as e:
+            logger.error(f"Workflow execution failed: {e}")
+            st.error(
+                f"""**Analysis Failed**
+                {str(e)}
+                Potential issues:
+                - Complex query structure
+                - Document correlation failure
+                - Temporal processing constraints"""
             )
+
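+    # Note: app.stream() yields one event dict per executed node, keyed by the
+    # node name ("ingest", "retrieve", ...); _render_event dispatches on those
+    # keys.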
+    def _render_event(self, event: Dict) -> None:
+        if 'ingest' in event:
+            with st.container():
+                st.success("✅ Query Ingested")
+        elif 'retrieve' in event:
+            with st.container():
+                docs = event['retrieve']['context'].get('documents', [])
+                st.info(f"📚 Retrieved {len(docs)} documents")
+                with st.expander("View Retrieved Documents", expanded=False):
+                    for idx, doc in enumerate(docs, start=1):
+                        st.markdown(f"**Document {idx}**")
+                        st.code(doc.page_content, language='text')
+        elif 'analyze' in event:
+            with st.container():
+                content = event['analyze']['messages'][0].content
+                with st.expander("Technical Analysis Report", expanded=True):
+                    st.markdown(content)
+        elif 'validate' in event:
+            with st.container():
+                content = event['validate']['messages'][0].content
+                # Same substring caveat as _quality_check: exclude "INVALID".
+                if "VALID" in content and "INVALID" not in content:
+                    st.success("✅ Validation Passed")
+                    with st.expander("View Validated Analysis", expanded=True):
+                        st.markdown(content.split("Validation:")[0])
                 else:
+                    st.warning("⚠️ Validation Issues Detected")
+                    with st.expander("View Validation Details", expanded=True):
+                        st.markdown(content)

 if __name__ == "__main__":
+    ResearchInterface()