mgbam committed on
Commit
de3ef7d
·
verified ·
1 Parent(s): 9370b00

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +277 -378
app.py CHANGED
@@ -1,34 +1,48 @@
1
  # ------------------------------
2
- # NeuroResearch 2.0: Advanced Research Cognition System
3
  # ------------------------------
4
  from langchain_openai import OpenAIEmbeddings
5
  from langchain_community.vectorstores import Chroma
6
  from langchain_community.retrievers import BM25Retriever
7
- from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
8
- from langchain.text_splitter import SemanticChunker
9
- from langgraph.graph import END, StateGraph
10
- from langgraph.prebuilt import ToolNode
11
- from langgraph.graph.message import add_messages
12
  from typing_extensions import TypedDict, Annotated
13
- from typing import Sequence, Dict, List, Optional, Any, Tuple
 
 
 
14
  import chromadb
15
  import os
16
- import streamlit as st
17
- import requests
18
  import hashlib
19
  import json
20
  import time
 
21
  from concurrent.futures import ThreadPoolExecutor, as_completed
22
  from datetime import datetime
 
 
23
  import plotly.express as px
24
  import pandas as pd
25
- from rank_bm25 import BM25Okapi
26
- from sentence_transformers import CrossEncoder
27
 
28
  # ------------------------------
29
- # Quantum Cognition Configuration
30
  # ------------------------------
31
  class NeuroConfig:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
33
  CHROMA_PATH = "neuro_db"
34
  CHUNK_SIZE = 512
@@ -45,391 +59,276 @@ class NeuroConfig:
45
  CACHE_TTL = 3600 # 1 hour
46
 
47
  # ------------------------------
48
- # Quantum State Schema
49
- # ------------------------------
50
# State schema handed to StateGraph and passed between the workflow's nodes.
class ResearchState(TypedDict):
    # Conversation messages; `add_messages` is attached as Annotated metadata.
    messages: Annotated[Sequence[AIMessage | HumanMessage | ToolMessage], add_messages]
    # Per-run working data (the workflow stores the raw query, mode and documents here).
    context: Dict[str, Any]
    # Per-run bookkeeping (the workflow stores a timestamp and session id here).
    metadata: Dict[str, Any]
    # Analysis outputs (the workflow stores analysis, figures and validation here).
    cognitive_artifacts: Dict[str, Any]
55
-
56
- # ------------------------------
57
- # Neural Document Processor
58
  # ------------------------------
59
  class NeuralDocumentProcessor:
60
- def __init__(self):
61
- self.client = chromadb.PersistentClient(path=NeuroConfig.CHROMA_PATH)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  self.embeddings = OpenAIEmbeddings(
63
  model="text-embedding-3-large",
64
  dimensions=NeuroConfig.EMBEDDING_DIMENSIONS
65
  )
 
 
66
  self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
67
-
68
- def process_documents(self, documents: List[str], collection: str) -> Chroma:
69
- splitter = SemanticChunker(
70
- self.embeddings,
71
- breakpoint_threshold_type="percentile",
72
- breakpoint_threshold_amount=0.8
73
- )
74
-
75
- docs = splitter.create_documents(documents)
76
- return Chroma.from_documents(
77
- documents=docs,
78
- embedding=self.embeddings,
79
- client=self.client,
80
- collection_name=collection,
81
- ids=[self._quantum_id(doc.page_content) for doc in docs]
82
- )
83
-
84
- def hybrid_retrieval(self, query: str, collection: str) -> List[Tuple[str, float]]:
85
- vector_retriever = Chroma(
86
- client=self.client,
87
- collection_name=collection,
88
- embedding_function=self.embeddings
89
- ).as_retriever(search_kwargs={"k": NeuroConfig.HYBRID_RERANK_TOP_K})
90
-
91
- bm25_retriever = BM25Retriever.from_documents(
92
- vector_retriever.get()["documents"],
93
- preprocess_func=lambda x: x.split()
94
  )
95
-
96
- vector_results = vector_retriever.invoke(query)
97
- bm25_results = bm25_retriever.invoke(query)
98
-
99
- combined = list({doc.page_content: doc for doc in vector_results + bm25_results}.values())
100
- scores = self.cross_encoder.predict([(query, doc.page_content) for doc in combined])
101
-
102
- reranked = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
103
- return [doc for doc, _ in reranked[:NeuroConfig.HYBRID_RERANK_TOP_K]]
104
-
105
- def _quantum_id(self, content: str) -> str:
106
- return f"neuro_{hashlib.sha3_256(content.encode()).hexdigest()[:24]}"
107
 
108
- # ------------------------------
109
- # Cognitive Processing Units
110
- # ------------------------------
111
class NeuroAnalyticalEngine:
    """Run several analysis requests in parallel and keep the best-scoring one.

    Successful results are cached in memory for ``NeuroConfig.CACHE_TTL``
    seconds, keyed by query, context and mode.
    """

    def __init__(self):
        """Create the worker pool and the empty result cache."""
        self.executor = ThreadPoolExecutor(max_workers=NeuroConfig.MAX_CONCURRENT_REQUESTS)
        self.cache = {}

    @staticmethod
    def _cache_key(query: str, context: str, mode: str) -> str:
        """Build the cache key from the query, the context and the mode.

        The context is part of the key so the same question asked against
        different retrieved documents does not reuse a stale answer.
        """
        query_digest = hashlib.sha256(query.encode()).hexdigest()[:16]
        context_digest = hashlib.sha256(context.encode()).hexdigest()[:16]
        return f"{query_digest}_{context_digest}_{mode}"

    def parallel_analysis(self, query: str, context: str, mode: str) -> Dict:
        """Return the best of three concurrent analyses of *query*.

        Args:
            query: The user prompt.
            context: Supporting text placed in the system message.
            mode: Analysis mode; also selects the sampling temperature.

        Returns:
            The result dict with the highest ``quality_score``. When every
            attempt failed, a dict with ``error`` and ``quality_score`` 0.
        """
        cache_key = self._cache_key(query, context, mode)
        cached = self.cache.get(cache_key)
        if cached and time.time() - cached["timestamp"] < NeuroConfig.CACHE_TTL:
            return cached["response"]

        futures = [
            self.executor.submit(self._cognitive_process, query, context, mode)
            for _ in range(3)
        ]
        results = [future.result() for future in as_completed(futures)]

        # Prefer any successful result over an error, then the higher score.
        best_response = max(
            results,
            key=lambda item: ("error" not in item, item.get('quality_score', 0)),
        )

        # Do not pin a transient failure in the cache for the whole TTL.
        if "error" not in best_response:
            self.cache[cache_key] = {
                "response": best_response,
                "timestamp": time.time(),
            }

        return best_response

    def _cognitive_process(self, query: str, context: str, mode: str) -> Dict:
        """Send one chat-completion request and return its parsed JSON answer.

        Any failure (network, HTTP status, malformed JSON) is reported as
        ``{"error": ..., "quality_score": 0}`` instead of being raised.
        """
        headers = {
            "Authorization": f"Bearer {NeuroConfig.DEEPSEEK_API_KEY}",
            "Content-Type": "application/json",
            "X-Neuro-Mode": mode
        }
        payload = {
            "model": "deepseek-researcher-v2",
            "messages": [{
                "role": "system",
                "content": f"""Perform {mode} analysis. Context:
{context}"""
            }, {
                "role": "user",
                "content": query
            }],
            "temperature": 0.3 if mode == "technical" else 0.7,
            "max_tokens": 2048,
            "top_p": 0.95,
            "response_format": {"type": "json_object"},
            "seed": 42
        }

        try:
            response = requests.post(
                "https://api.deepseek.com/v1/chat/completions",
                headers=headers,
                json=payload,
                timeout=60
            )
            response.raise_for_status()
            analysis = json.loads(response.json()["choices"][0]["message"]["content"])
            return {
                **analysis,
                "quality_score": self._evaluate_quality(analysis)
            }
        except Exception as e:
            return {"error": str(e), "quality_score": 0}

    def _evaluate_quality(self, analysis: Dict) -> float:
        """Score *analysis* in [0, 1] from the sizes of three of its lists.

        Each key point adds 0.2, each comparison 0.3 and each citation 0.5;
        the total is capped at 1.0.
        """
        weights = (("key_points", 0.2), ("comparisons", 0.3), ("citations", 0.5))
        score = 0.0
        for field, weight in weights:
            score += len(analysis.get(field, [])) * weight
        return min(score, 1.0)
186
 
187
- # ------------------------------
188
- # Advanced Research Workflow
189
- # ------------------------------
190
class NeuroResearchWorkflow:
    """Linear research pipeline: ingest, retrieve, analyze, visualize, validate.

    Each stage is a graph node that receives the current state and returns an
    updated copy of it.
    """

    def __init__(self):
        """Create the document processor and analysis engine, then build the graph."""
        self.processor = NeuralDocumentProcessor()
        self.engine = NeuroAnalyticalEngine()
        self._build_cognitive_graph()

    def _build_cognitive_graph(self):
        """Register the five stages, chain them in order and compile the graph."""
        stages = [
            ("ingest", self.ingest_query),
            ("retrieve", self.retrieve_documents),
            ("analyze", self.analyze_content),
            ("visualize", self.generate_insights),
            ("validate", self.validate_knowledge),
        ]

        workflow = StateGraph(ResearchState)
        for name, handler in stages:
            workflow.add_node(name, handler)

        workflow.set_entry_point(stages[0][0])
        names = [name for name, _ in stages]
        for source, target in zip(names, names[1:] + [END]):
            workflow.add_edge(source, target)

        self.app = workflow.compile()

    def ingest_query(self, state: "ResearchState") -> "ResearchState":
        """Read the query from the last message and seed context and metadata."""
        query = state["messages"][-1].content
        return {
            **state,
            "context": {
                "raw_query": query,
                "analysis_mode": "technical"
            },
            "metadata": {
                "timestamp": datetime.now().isoformat(),
                "session_id": hashlib.sha256(query.encode()).hexdigest()[:16]
            }
        }

    def retrieve_documents(self, state: "ResearchState") -> "ResearchState":
        """Fetch documents for the query from the ``research`` collection."""
        docs = self.processor.hybrid_retrieval(
            state["context"]["raw_query"],
            "research"
        )
        return {
            **state,
            "context": {
                **state["context"],
                "documents": docs,
                "retrieval_metrics": {
                    "total": len(docs),
                    "relevance_scores": [doc.metadata.get("score", 0) for doc in docs]
                }
            }
        }

    def analyze_content(self, state: "ResearchState") -> "ResearchState":
        """Analyze the retrieved documents and record the result as a message."""
        context = "\n".join(doc.page_content for doc in state["context"]["documents"])
        analysis = self.engine.parallel_analysis(
            query=state["context"]["raw_query"],
            context=context,
            mode=state["context"]["analysis_mode"]
        )

        return {
            **state,
            "cognitive_artifacts": analysis,
            "messages": [AIMessage(content=json.dumps(analysis, indent=2))]
        }

    def generate_insights(self, state: "ResearchState") -> "ResearchState":
        """Build the two Plotly figures from the retrieved documents' metadata."""
        documents = state["context"]["documents"]
        df = pd.DataFrame({
            "document": [doc.metadata.get("source", "") for doc in documents],
            "relevance": [doc.metadata.get("score", 0) for doc in documents],
            "year": [doc.metadata.get("year", 2023) for doc in documents]
        })

        figures = {
            "temporal": px.line(df, x="year", y="relevance", title="Temporal Relevance"),
            "distribution": px.histogram(df, x="relevance", title="Score Distribution")
        }

        return {
            **state,
            "cognitive_artifacts": {
                **state["cognitive_artifacts"],
                "visualizations": figures
            }
        }

    @staticmethod
    def _serializable_artifacts(artifacts):
        """Return a copy of *artifacts* without the ``visualizations`` entry.

        That entry holds Plotly figure objects, which ``json.dumps`` rejects.
        """
        return {key: value for key, value in artifacts.items() if key != "visualizations"}

    def validate_knowledge(self, state: "ResearchState") -> "ResearchState":
        """Ask the engine to critique the artifacts and attach its verdict."""
        # This stage runs after generate_insights, so the figures are already in
        # the artifacts and must be left out of the JSON dump.
        artifacts_json = json.dumps(
            self._serializable_artifacts(state["cognitive_artifacts"]),
            indent=2
        )
        validation_prompt = f"""
        Validate research artifacts:
        {artifacts_json}

        Return JSON with:
        - validity_score: 0-1
        - critical_issues: List[str]
        - strength_points: List[str]
        """

        validation = self.engine.parallel_analysis(
            query=validation_prompt,
            context="",
            mode="critical"
        )

        return {
            **state,
            "cognitive_artifacts": {
                **state["cognitive_artifacts"],
                "validation": validation
            }
        }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
 
304
  # ------------------------------
305
- # Holographic Research Interface
306
  # ------------------------------
307
class NeuroInterface:
    """Streamlit front end for the research workflow.

    Constructing the class configures the page, draws the sidebar and the
    query form, and runs the workflow when the button is pressed.
    """

    def __init__(self):
        """Create the workflow and render the page."""
        self.workflow = NeuroResearchWorkflow()
        self._initialize_nexus()

    def _initialize_nexus(self):
        """Configure the page, then draw styles, sidebar and main area."""
        st.set_page_config(
            page_title="NeuroResearch Nexus",
            layout="wide",
            initial_sidebar_state="expanded"
        )
        self._inject_neuro_styles()
        self._build_quantum_sidebar()
        self._build_main_nexus()

    def _inject_neuro_styles(self):
        """Inject the custom dark theme CSS into the page."""
        st.markdown("""
        <style>
        :root {
        --neuro-primary: #7F00FF;
        --neuro-secondary: #E100FF;
        --neuro-background: #0A0A2E;
        --neuro-text: #F0F2F6;
        }

        .stApp {
        background: var(--neuro-background);
        color: var(--neuro-text);
        font-family: 'Inter', sans-serif;
        }

        .stTextArea textarea {
        background: #1A1A4E !important;
        color: var(--neuro-text) !important;
        border: 2px solid var(--neuro-secondary);
        border-radius: 12px;
        padding: 1.5rem;
        font-size: 1.1rem;
        }

        .stButton>button {
        background: linear-gradient(135deg, var(--neuro-primary), var(--neuro-secondary));
        border: none;
        border-radius: 12px;
        padding: 1.2rem 2.4rem;
        font-weight: 600;
        transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
        }

        .stButton>button:hover {
        transform: translateY(-2px);
        box-shadow: 0 8px 24px rgba(127, 0, 255, 0.3);
        }

        .neuro-card {
        background: #1A1A4E;
        border-radius: 16px;
        padding: 2rem;
        margin: 1.5rem 0;
        border: 1px solid #2E2E6E;
        }
        </style>
        """, unsafe_allow_html=True)

    def _build_quantum_sidebar(self):
        """Draw the sidebar: mode selector, static metrics and filter checkboxes."""
        with st.sidebar:
            st.title("🌀 Neuro Nexus")
            st.subheader("Analysis Modes")
            # Kept on the instance so the choice is available after rendering;
            # nothing in this class reads it yet.
            self.selected_mode = st.selectbox(
                "Select Cognitive Mode",
                options=list(NeuroConfig.ANALYSIS_MODES.keys()),
                format_func=lambda x: NeuroConfig.ANALYSIS_MODES[x]
            )

            st.subheader("Quantum Metrics")
            col1, col2 = st.columns(2)
            col1.metric("Vector Dimensions", NeuroConfig.EMBEDDING_DIMENSIONS)
            col2.metric("Hybrid Recall", "92.4%", "1.2% ↑")

            st.divider()
            st.write("**Cognitive Filters**")
            st.checkbox("Temporal Analysis", True)
            st.checkbox("Methodology Comparison")
            st.checkbox("Citation Graph")

    def _build_main_nexus(self):
        """Draw the query box and run the analysis when the button is pressed."""
        st.title("🧠 NeuroResearch Nexus")
        query = st.text_area("Enter Research Query:", height=200,
                             placeholder="Query our knowledge continuum...")

        if st.button("Initiate NeuroAnalysis", type="primary"):
            # An empty query would only produce an empty retrieval and analysis.
            if query and query.strip():
                self._execute_neuro_analysis(query)
            else:
                st.warning("Please enter a research query first.")

    def _execute_neuro_analysis(self, query: str):
        """Run the workflow for *query* and render its final state."""
        with st.spinner("Activating Cognitive Matrix..."):
            result = self.workflow.app.invoke({
                "messages": [HumanMessage(content=query)],
                "context": {},
                "metadata": {},
                "cognitive_artifacts": {}
            })

        self._render_quantum_results(result)

    @staticmethod
    def _format_validity(score) -> str:
        """Format *score* (expected 0-1) as a percentage; non-numeric reads as 0."""
        try:
            value = float(score)
        except (TypeError, ValueError):
            value = 0.0
        return f"{value * 100:.1f}%"

    def _render_quantum_results(self, result: Dict):
        """Render the analysis, the charts and the validation report."""
        artifacts = result["cognitive_artifacts"]
        with st.container():
            st.subheader("🧬 Cognitive Artifacts")

            with st.expander("Core Analysis", expanded=True):
                # The workflow stores the analysis fields at the top level of
                # the artifacts; fall back to them when no "analysis" key exists.
                core = artifacts.get("analysis")
                if core is None:
                    core = {
                        key: value for key, value in artifacts.items()
                        if key not in ("visualizations", "validation")
                    }
                st.json(core)

            with st.expander("Visual Insights", expanded=True):
                visuals = artifacts.get("visualizations", {})
                for column, name in zip(st.columns(2), ("temporal", "distribution")):
                    with column:
                        figure = visuals.get(name)
                        if figure is not None:
                            st.plotly_chart(figure, use_container_width=True)
                        else:
                            st.info(f"No {name} chart available.")

            with st.expander("Validation Report", expanded=False):
                validation = artifacts.get("validation", {})
                st.metric("Validity Score", self._format_validity(validation.get('validity_score', 0)))
                st.write("**Critical Issues**")
                st.write(validation.get("critical_issues", []))
                st.write("**Strengths**")
                st.write(validation.get("strength_points", []))
433
 
 
 
 
434
  if __name__ == "__main__":
435
- NeuroInterface()
 
1
  # ------------------------------
2
+ # NeuroResearch 2.1: Robust Research System
3
  # ------------------------------
4
  from langchain_openai import OpenAIEmbeddings
5
  from langchain_community.vectorstores import Chroma
6
  from langchain_community.retrievers import BM25Retriever
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from rank_bm25 import BM25Okapi
9
+ from sentence_transformers import CrossEncoder
 
 
10
  from typing_extensions import TypedDict, Annotated
11
+ from typing import (
12
+ Sequence, Dict, List, Optional, Any, Tuple, Union
13
+ )
14
+
15
  import chromadb
16
  import os
 
 
17
  import hashlib
18
  import json
19
  import time
20
+
21
  from concurrent.futures import ThreadPoolExecutor, as_completed
22
  from datetime import datetime
23
+
24
+ import streamlit as st
25
  import plotly.express as px
26
  import pandas as pd
 
 
27
 
28
  # ------------------------------
29
+ # Configuration
30
  # ------------------------------
31
  class NeuroConfig:
32
+ """
33
+ Configuration class for NeuroResearch system.
34
+
35
+ Attributes:
36
+ DEEPSEEK_API_KEY (str): Optional API key for external services.
37
+ CHROMA_PATH (str): File path for Chroma's persistent storage.
38
+ CHUNK_SIZE (int): Maximum length of text chunks for splitting.
39
+ CHUNK_OVERLAP (int): Overlap between text chunks to preserve context.
40
+ MAX_CONCURRENT_REQUESTS (int): Number of concurrent threads for processing.
41
+ EMBEDDING_DIMENSIONS (int): Dimensionality of embeddings.
42
+ HYBRID_RERANK_TOP_K (int): Number of documents to retrieve and rerank.
43
+ ANALYSIS_MODES (dict): Possible analysis modes and their descriptions.
44
+ CACHE_TTL (int): Time-to-live (seconds) for cached items.
45
+ """
46
  DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
47
  CHROMA_PATH = "neuro_db"
48
  CHUNK_SIZE = 512
 
59
  CACHE_TTL = 3600 # 1 hour
60
 
61
  # ------------------------------
62
+ # Document Processor
 
 
 
 
 
 
 
 
 
63
  # ------------------------------
64
  class NeuralDocumentProcessor:
65
+ """
66
+ A document processing and retrieval utility class.
67
+
68
+ Responsibilities:
69
+ - Splitting documents into manageable chunks.
70
+ - Storing and retrieving embeddings with Chroma.
71
+ - Performing hybrid retrieval (vector + BM25) with cross-encoder reranking.
72
+ - Handling concurrency during document ingestion (optional).
73
+ """
74
+ def __init__(self) -> None:
75
+ """
76
+ Initialize the NeuralDocumentProcessor with a persistent Chroma client,
77
+ OpenAI-based embeddings, a CrossEncoder for reranking, and a text splitter.
78
+ """
79
+ # Persistent Chroma client
80
+ try:
81
+ self.client = chromadb.PersistentClient(path=NeuroConfig.CHROMA_PATH)
82
+ except Exception as e:
83
+ # Fallback to in-memory client if persistent fails
84
+ print(f"Error initializing Chroma PersistentClient: {e}")
85
+ self.client = chromadb.Client()
86
+
87
+ # Embeddings (OpenAI-based)
88
  self.embeddings = OpenAIEmbeddings(
89
  model="text-embedding-3-large",
90
  dimensions=NeuroConfig.EMBEDDING_DIMENSIONS
91
  )
92
+
93
+ # Cross-encoder for reranking
94
  self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
95
+
96
+ # Text splitter
97
+ self.text_splitter = RecursiveCharacterTextSplitter(
98
+ chunk_size=NeuroConfig.CHUNK_SIZE,
99
+ chunk_overlap=NeuroConfig.CHUNK_OVERLAP,
100
+ separators=["\n\n", "\n", "(?<=\\. )", " "],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  )
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
+ def process_documents(
104
+ self,
105
+ documents: List[str],
106
+ collection: str,
107
+ use_concurrency: bool = False
108
+ ) -> Optional[Chroma]:
109
+ """
110
+ Process a list of document strings by splitting, embedding, and storing them in Chroma.
111
+ Optionally uses concurrency for splitting documents.
112
+
113
+ Args:
114
+ documents (List[str]): The list of raw document texts.
115
+ collection (str): The Chroma collection name to store these documents in.
116
+ use_concurrency (bool, optional): If True, process documents concurrently. Defaults to False.
117
+
118
+ Returns:
119
+ Optional[Chroma]: The Chroma vectorstore for the specified collection, or None if no docs.
120
+ """
121
+ if not documents:
122
+ print("No documents provided to process_documents.")
123
+ return None
124
+
125
+ # Split documents into chunks
126
+ if use_concurrency and len(documents) > 1:
127
+ chunks = []
128
+ with ThreadPoolExecutor(max_workers=NeuroConfig.MAX_CONCURRENT_REQUESTS) as executor:
129
+ future_to_doc = {
130
+ executor.submit(self.text_splitter.create_documents, [doc]): doc
131
+ for doc in documents
132
+ }
133
+ for future in as_completed(future_to_doc):
134
+ try:
135
+ result = future.result()
136
+ chunks.extend(result)
137
+ except Exception as e:
138
+ print(f"Error splitting document: {e}")
139
+ else:
140
+ # Single-threaded splitting
141
+ chunks = []
142
+ for doc in documents:
143
+ chunks.extend(self.text_splitter.create_documents([doc]))
144
+
145
+ # Build unique IDs for each chunk
146
+ chunk_ids = [self._quantum_id(doc.page_content) for doc in chunks]
147
+
148
+ # Create Chroma from documents
149
  try:
150
+ vectorstore = Chroma.from_documents(
151
+ documents=chunks,
152
+ embedding=self.embeddings,
153
+ client=self.client,
154
+ collection_name=collection,
155
+ ids=chunk_ids
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  )
157
+ return vectorstore
 
 
 
 
 
 
158
  except Exception as e:
159
+ print(f"Error creating Chroma collection: {e}")
160
+ return None
 
 
 
 
 
 
161
 
162
+ def hybrid_retrieval(
163
+ self,
164
+ query: str,
165
+ collection: str,
166
+ return_scores: bool = False
167
+ ) -> Union[List[str], List[Tuple[str, float]]]:
168
+ """
169
+ Perform hybrid retrieval combining vector-based search with BM25,
170
+ then re-rank the combined results using a cross-encoder.
171
+
172
+ Args:
173
+ query (str): The user query for retrieving documents.
174
+ collection (str): The name of the Chroma collection to search.
175
+ return_scores (bool): If True, return a list of (document, score) tuples.
176
+ Otherwise, return a list of document strings only.
177
+
178
+ Returns:
179
+ Union[List[str], List[Tuple[str, float]]]: The top-k reranked results,
180
+ either as strings or (string, score) pairs.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  """
182
+ # Try to load the existing collection
183
+ try:
184
+ vector_store = Chroma(
185
+ client=self.client,
186
+ collection_name=collection,
187
+ embedding_function=self.embeddings
188
+ )
189
+ except Exception as e:
190
+ print(f"Error loading Chroma collection '{collection}': {e}")
191
+ return [] if not return_scores else []
192
+
193
+ # Check if the collection is empty
194
+ stored_docs = vector_store.get()
195
+ if not stored_docs or "documents" not in stored_docs or not stored_docs["documents"]:
196
+ print(f"No documents found in collection '{collection}'.")
197
+ return [] if not return_scores else []
198
+
199
+ all_docs = [doc.page_content for doc in stored_docs["documents"]]
200
+ if not all_docs:
201
+ print(f"No documents found in collection '{collection}'.")
202
+ return [] if not return_scores else []
203
+
204
+ # Vector-based retrieval
205
+ try:
206
+ vector_retriever = vector_store.as_retriever(
207
+ search_kwargs={"k": NeuroConfig.HYBRID_RERANK_TOP_K}
208
+ )
209
+ vector_results = [doc.page_content for doc in vector_retriever.invoke(query)]
210
+ except Exception as e:
211
+ print(f"Error during vector retrieval: {e}")
212
+ vector_results = []
213
+
214
+ # BM25 retrieval
215
+ tokenized_docs = [doc.split() for doc in all_docs]
216
+ bm25 = BM25Okapi(tokenized_docs)
217
+ bm25_results = bm25.get_top_n(
218
+ query.split(),
219
+ all_docs,
220
+ n=NeuroConfig.HYBRID_RERANK_TOP_K
221
  )
222
+
223
+ # Combine results and remove duplicates
224
+ combined = list(set(vector_results + bm25_results))
225
+
226
+ if not combined:
227
+ print("No documents retrieved by either BM25 or vector search.")
228
+ return [] if not return_scores else []
229
+
230
+ # Cross-encoder reranking
231
+ scores = self.cross_encoder.predict([(query, doc) for doc in combined])
232
+ reranked = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
233
+ top_results = reranked[:NeuroConfig.HYBRID_RERANK_TOP_K]
234
+
235
+ # Return based on user preference
236
+ if return_scores:
237
+ return top_results # List[Tuple[str, float]]
238
+ else:
239
+ return [doc for doc, _ in top_results]
240
+
241
+ def _quantum_id(self, content: str) -> str:
242
+ """
243
+ Create a unique ID for each text chunk by hashing its content.
244
+
245
+ Args:
246
+ content (str): The text content of the chunk.
247
+
248
+ Returns:
249
+ str: A unique hash-based identifier.
250
+ """
251
+ return f"neuro_{hashlib.sha3_256(content.encode()).hexdigest()[:24]}"
252
 
253
  # ------------------------------
254
+ # NeuroInterface (Streamlit Example)
255
  # ------------------------------
256
def NeuroInterface() -> None:
    """Render a Streamlit page for ingesting and querying documents.

    The sidebar uploads text files into a named collection; the main area
    runs a hybrid search against that collection and shows result previews.
    """
    st.title("NeuroResearch 2.1: Robust Research System")

    # Streamlit reruns the script on every interaction; keep one processor per
    # session so the models are not rebuilt each time.
    if "neuro_processor" not in st.session_state:
        st.session_state["neuro_processor"] = NeuralDocumentProcessor()
    processor = st.session_state["neuro_processor"]

    # Sidebar for uploading and processing documents
    with st.sidebar:
        st.header("Document Ingestion")
        uploaded_files = st.file_uploader(
            "Upload one or more text files",
            # Files are decoded as UTF-8 text below, so only text formats are offered.
            type=["txt", "md"],
            accept_multiple_files=True
        )
        collection_name = st.text_input("Collection Name", value="default_collection").strip()

        use_concurrency = st.checkbox("Use Concurrency for Processing?", value=False)

        if st.button("Process Documents"):
            if not uploaded_files or not collection_name:
                st.warning("Please upload at least one file and provide a collection name.")
            else:
                docs_content = []
                for uf in uploaded_files:
                    try:
                        docs_content.append(uf.read().decode("utf-8"))
                    except UnicodeDecodeError:
                        st.error(f"Could not decode {uf.name}. Make sure it's UTF-8 text.")

                if not docs_content:
                    st.error("Processing failed or returned no vectorstore.")
                else:
                    st.write("Processing documents...")
                    vectorstore = processor.process_documents(
                        documents=docs_content,
                        collection=collection_name,
                        use_concurrency=use_concurrency
                    )
                    if vectorstore:
                        st.success(f"Documents processed and stored in collection: {collection_name}")
                    else:
                        st.error("Processing failed or returned no vectorstore.")

    # Main interface for querying
    st.subheader("Query Documents")
    user_query = st.text_input("Enter your query:")
    return_scores = st.checkbox("Return Scores?")

    if not st.button("Search"):
        return

    if not user_query.strip() or not collection_name:
        st.warning("Please provide both a query and a valid collection name.")
        return

    st.write(f"Retrieving from collection: {collection_name}")
    results = processor.hybrid_retrieval(
        query=user_query,
        collection=collection_name,
        return_scores=return_scores
    )
    if not results:
        st.warning("No results found or collection may be empty.")
        return

    st.write("Top Reranked Results:")
    for idx, item in enumerate(results, start=1):
        if return_scores:
            # Each result is (doc, score)
            doc, score = item
            st.markdown(f"**Result {idx} | Score: {score:.4f}**")
        else:
            doc = item
            st.markdown(f"**Result {idx}**")
        st.write(_preview_text(doc))


def _preview_text(text: str, limit: int = 500) -> str:
    """Return *text* cut to *limit* characters, with "..." added when truncated."""
    return text[:limit] + ("..." if len(text) > limit else "")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
329
 
330
+ # ------------------------------
331
+ # Main Entry Point
332
+ # ------------------------------
333
  if __name__ == "__main__":
334
+ NeuroInterface()