Update app.py
app.py
CHANGED
@@ -1,34 +1,48 @@
  # ------------------------------
- # NeuroResearch 2.
  # ------------------------------
  from langchain_openai import OpenAIEmbeddings
  from langchain_community.vectorstores import Chroma
  from langchain_community.retrievers import BM25Retriever
- from
- from
- from
- from langgraph.prebuilt import ToolNode
- from langgraph.graph.message import add_messages
  from typing_extensions import TypedDict, Annotated
- from typing import
  import chromadb
  import os
- import streamlit as st
- import requests
  import hashlib
  import json
  import time
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from datetime import datetime
  import plotly.express as px
  import pandas as pd
- from rank_bm25 import BM25Okapi
- from sentence_transformers import CrossEncoder

  # ------------------------------
- #
  # ------------------------------
  class NeuroConfig:
      DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
      CHROMA_PATH = "neuro_db"
      CHUNK_SIZE = 512
@@ -45,391 +59,276 @@ class NeuroConfig:
      CACHE_TTL = 3600  # 1 hour

  # ------------------------------
- #
- # ------------------------------
- class ResearchState(TypedDict):
-     messages: Annotated[Sequence[AIMessage | HumanMessage | ToolMessage], add_messages]
-     context: Dict[str, Any]
-     metadata: Dict[str, Any]
-     cognitive_artifacts: Dict[str, Any]
-
- # ------------------------------
- # Neural Document Processor
  # ------------------------------
  class NeuralDocumentProcessor:
          self.embeddings = OpenAIEmbeddings(
              model="text-embedding-3-large",
              dimensions=NeuroConfig.EMBEDDING_DIMENSIONS
          )
          self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
-         )
-
-         docs = splitter.create_documents(documents)
-         return Chroma.from_documents(
-             documents=docs,
-             embedding=self.embeddings,
-             client=self.client,
-             collection_name=collection,
-             ids=[self._quantum_id(doc.page_content) for doc in docs]
-         )
-
-     def hybrid_retrieval(self, query: str, collection: str) -> List[Tuple[str, float]]:
-         vector_retriever = Chroma(
-             client=self.client,
-             collection_name=collection,
-             embedding_function=self.embeddings
-         ).as_retriever(search_kwargs={"k": NeuroConfig.HYBRID_RERANK_TOP_K})
-
-         bm25_retriever = BM25Retriever.from_documents(
-             vector_retriever.get()["documents"],
-             preprocess_func=lambda x: x.split()
          )
-
-         vector_results = vector_retriever.invoke(query)
-         bm25_results = bm25_retriever.invoke(query)
-
-         combined = list({doc.page_content: doc for doc in vector_results + bm25_results}.values())
-         scores = self.cross_encoder.predict([(query, doc.page_content) for doc in combined])
-
-         reranked = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
-         return [doc for doc, _ in reranked[:NeuroConfig.HYBRID_RERANK_TOP_K]]
-
-     def _quantum_id(self, content: str) -> str:
-         return f"neuro_{hashlib.sha3_256(content.encode()).hexdigest()[:24]}"

          try:
-                     "role": "system",
-                     "content": f"""Perform {mode} analysis. Context:
-                     {context}"""
-                 }, {
-                     "role": "user",
-                     "content": query
-                 }],
-                 "temperature": 0.3 if mode == "technical" else 0.7,
-                 "max_tokens": 2048,
-                 "top_p": 0.95,
-                 "response_format": {"type": "json_object"},
-                 "seed": 42
-             },
-             timeout=60
          )
-
-             response.raise_for_status()
-             analysis = json.loads(response.json()["choices"][0]["message"]["content"])
-             return {
-                 **analysis,
-                 "quality_score": self._evaluate_quality(analysis)
-             }
          except Exception as e:
-     def _evaluate_quality(self, analysis: Dict) -> float:
-         score = 0.0
-         score += len(analysis.get("key_points", [])) * 0.2
-         score += len(analysis.get("comparisons", [])) * 0.3
-         score += len(analysis.get("citations", [])) * 0.5
-         return min(score, 1.0)

-         workflow.add_edge("ingest", "retrieve")
-         workflow.add_edge("retrieve", "analyze")
-         workflow.add_edge("analyze", "visualize")
-         workflow.add_edge("visualize", "validate")
-         workflow.add_edge("validate", END)
-
-         self.app = workflow.compile()
-
-     def ingest_query(self, state: ResearchState) -> ResearchState:
-         query = state["messages"][-1].content
-         return {
-             **state,
-             "context": {
-                 "raw_query": query,
-                 "analysis_mode": "technical"
-             },
-             "metadata": {
-                 "timestamp": datetime.now().isoformat(),
-                 "session_id": hashlib.sha256(query.encode()).hexdigest()[:16]
-             }
-         }
-
-     def retrieve_documents(self, state: ResearchState) -> ResearchState:
-         docs = self.processor.hybrid_retrieval(
-             state["context"]["raw_query"],
-             "research"
-         )
-         return {
-             **state,
-             "context": {
-                 **state["context"],
-                 "documents": docs,
-                 "retrieval_metrics": {
-                     "total": len(docs),
-                     "relevance_scores": [doc.metadata.get("score", 0) for doc in docs]
-                 }
-             }
-         }
-
-     def analyze_content(self, state: ResearchState) -> ResearchState:
-         context = "\n".join([doc.page_content for doc in state["context"]["documents"]])
-         analysis = self.engine.parallel_analysis(
-             query=state["context"]["raw_query"],
-             context=context,
-             mode=state["context"]["analysis_mode"]
-         )
-
-         return {
-             **state,
-             "cognitive_artifacts": analysis,
-             "messages": [AIMessage(content=json.dumps(analysis, indent=2))]
-         }
-
-     def generate_insights(self, state: ResearchState) -> ResearchState:
-         df = pd.DataFrame({
-             "document": [doc.metadata.get("source", "") for doc in state["context"]["documents"]],
-             "relevance": [doc.metadata.get("score", 0) for doc in state["context"]["documents"]],
-             "year": [doc.metadata.get("year", 2023) for doc in state["context"]["documents"]]
-         })
-
-         figures = {
-             "temporal": px.line(df, x="year", y="relevance", title="Temporal Relevance"),
-             "distribution": px.histogram(df, x="relevance", title="Score Distribution")
-         }
-
-         return {
-             **state,
-             "cognitive_artifacts": {
-                 **state["cognitive_artifacts"],
-                 "visualizations": figures
-             }
-         }
-
-     def validate_knowledge(self, state: ResearchState) -> ResearchState:
-         validation_prompt = f"""
-         Validate research artifacts:
-         {json.dumps(state['cognitive_artifacts'], indent=2)}
-
-         Return JSON with:
-         - validity_score: 0-1
-         - critical_issues: List[str]
-         - strength_points: List[str]
          """
          )

  # ------------------------------
- #
  # ------------------------------
          )
-             box-shadow: 0 8px 24px rgba(127, 0, 255, 0.3);
-         }
-
-         .neuro-card {
-             background: #1A1A4E;
-             border-radius: 16px;
-             padding: 2rem;
-             margin: 1.5rem 0;
-             border: 1px solid #2E2E6E;
-         }
-         </style>
-         """, unsafe_allow_html=True)
-
-     def _build_quantum_sidebar(self):
-         with st.sidebar:
-             st.title("🌀 Neuro Nexus")
-             st.subheader("Analysis Modes")
-             selected_mode = st.selectbox(
-                 "Select Cognitive Mode",
-                 options=list(NeuroConfig.ANALYSIS_MODES.keys()),
-                 format_func=lambda x: NeuroConfig.ANALYSIS_MODES[x]
              )
-         query = st.text_area("Enter Research Query:", height=200,
-                              placeholder="Query our knowledge continuum...")
-
-         if st.button("Initiate NeuroAnalysis", type="primary"):
-             self._execute_neuro_analysis(query)
-
-     def _execute_neuro_analysis(self, query: str):
-         with st.spinner("Activating Cognitive Matrix..."):
-             result = self.workflow.app.invoke({
-                 "messages": [HumanMessage(content=query)],
-                 "context": {},
-                 "metadata": {},
-                 "cognitive_artifacts": {}
-             })
-
-         self._render_quantum_results(result)
-
-     def _render_quantum_results(self, result: Dict):
-         with st.container():
-             st.subheader("🧬 Cognitive Artifacts")
-
-             with st.expander("Core Analysis", expanded=True):
-                 st.json(result["cognitive_artifacts"].get("analysis", {}))
-
-             with st.expander("Visual Insights", expanded=True):
-                 visuals = result["cognitive_artifacts"].get("visualizations", {})
-                 col1, col2 = st.columns(2)
-                 with col1:
-                     st.plotly_chart(visuals.get("temporal"), use_container_width=True)
-                 with col2:
-                     st.plotly_chart(visuals.get("distribution"), use_container_width=True)
-
-             with st.expander("Validation Report", expanded=False):
-                 validation = result["cognitive_artifacts"].get("validation", {})
-                 st.metric("Validity Score", f"{validation.get('validity_score', 0)*100:.1f}%")
-                 st.write("**Critical Issues**")
-                 st.write(validation.get("critical_issues", []))
-                 st.write("**Strengths**")
-                 st.write(validation.get("strength_points", []))

  if __name__ == "__main__":
-     NeuroInterface()
  # ------------------------------
+ # NeuroResearch 2.1: Robust Research System
  # ------------------------------
  from langchain_openai import OpenAIEmbeddings
  from langchain_community.vectorstores import Chroma
  from langchain_community.retrievers import BM25Retriever
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from rank_bm25 import BM25Okapi
+ from sentence_transformers import CrossEncoder
  from typing_extensions import TypedDict, Annotated
+ from typing import (
+     Sequence, Dict, List, Optional, Any, Tuple, Union
+ )
+
  import chromadb
  import os
  import hashlib
  import json
  import time
+
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from datetime import datetime
+
+ import streamlit as st
  import plotly.express as px
  import pandas as pd

  # ------------------------------
+ # Configuration
  # ------------------------------
  class NeuroConfig:
+     """
+     Configuration class for NeuroResearch system.
+
+     Attributes:
+         DEEPSEEK_API_KEY (str): Optional API key for external services.
+         CHROMA_PATH (str): File path for Chroma's persistent storage.
+         CHUNK_SIZE (int): Maximum length of text chunks for splitting.
+         CHUNK_OVERLAP (int): Overlap between text chunks to preserve context.
+         MAX_CONCURRENT_REQUESTS (int): Number of concurrent threads for processing.
+         EMBEDDING_DIMENSIONS (int): Dimensionality of embeddings.
+         HYBRID_RERANK_TOP_K (int): Number of documents to retrieve and rerank.
+         ANALYSIS_MODES (dict): Possible analysis modes and their descriptions.
+         CACHE_TTL (int): Time-to-live (seconds) for cached items.
+     """
      DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
      CHROMA_PATH = "neuro_db"
      CHUNK_SIZE = 512
      CACHE_TTL = 3600  # 1 hour

  # ------------------------------
+ # Document Processor
  # ------------------------------
  class NeuralDocumentProcessor:
+     """
+     A document processing and retrieval utility class.
+
+     Responsibilities:
+     - Splitting documents into manageable chunks.
+     - Storing and retrieving embeddings with Chroma.
+     - Performing hybrid retrieval (vector + BM25) with cross-encoder reranking.
+     - Handling concurrency during document ingestion (optional).
+     """
+     def __init__(self) -> None:
+         """
+         Initialize the NeuralDocumentProcessor with a persistent Chroma client,
+         OpenAI-based embeddings, a CrossEncoder for reranking, and a text splitter.
+         """
+         # Persistent Chroma client
+         try:
+             self.client = chromadb.PersistentClient(path=NeuroConfig.CHROMA_PATH)
+         except Exception as e:
+             # Fallback to in-memory client if persistent fails
+             print(f"Error initializing Chroma PersistentClient: {e}")
+             self.client = chromadb.Client()
+
+         # Embeddings (OpenAI-based)
          self.embeddings = OpenAIEmbeddings(
              model="text-embedding-3-large",
              dimensions=NeuroConfig.EMBEDDING_DIMENSIONS
          )
+
+         # Cross-encoder for reranking
          self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
+
+         # Text splitter
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=NeuroConfig.CHUNK_SIZE,
+             chunk_overlap=NeuroConfig.CHUNK_OVERLAP,
+             separators=["\n\n", "\n", "(?<=\\. )", " "],
          )

+     def process_documents(
+         self,
+         documents: List[str],
+         collection: str,
+         use_concurrency: bool = False
+     ) -> Optional[Chroma]:
+         """
+         Process a list of document strings by splitting, embedding, and storing them in Chroma.
+         Optionally uses concurrency for splitting documents.
+
+         Args:
+             documents (List[str]): The list of raw document texts.
+             collection (str): The Chroma collection name to store these documents in.
+             use_concurrency (bool, optional): If True, process documents concurrently. Defaults to False.
+
+         Returns:
+             Optional[Chroma]: The Chroma vectorstore for the specified collection, or None if no docs.
+         """
+         if not documents:
+             print("No documents provided to process_documents.")
+             return None
+
+         # Split documents into chunks
+         if use_concurrency and len(documents) > 1:
+             chunks = []
+             with ThreadPoolExecutor(max_workers=NeuroConfig.MAX_CONCURRENT_REQUESTS) as executor:
+                 future_to_doc = {
+                     executor.submit(self.text_splitter.create_documents, [doc]): doc
+                     for doc in documents
+                 }
+                 for future in as_completed(future_to_doc):
+                     try:
+                         result = future.result()
+                         chunks.extend(result)
+                     except Exception as e:
+                         print(f"Error splitting document: {e}")
+         else:
+             # Single-threaded splitting
+             chunks = []
+             for doc in documents:
+                 chunks.extend(self.text_splitter.create_documents([doc]))
+
+         # Build unique IDs for each chunk
+         chunk_ids = [self._quantum_id(doc.page_content) for doc in chunks]
+
+         # Create Chroma from documents
          try:
+             vectorstore = Chroma.from_documents(
+                 documents=chunks,
+                 embedding=self.embeddings,
+                 client=self.client,
+                 collection_name=collection,
+                 ids=chunk_ids
              )
+             return vectorstore
          except Exception as e:
+             print(f"Error creating Chroma collection: {e}")
+             return None

+     def hybrid_retrieval(
+         self,
+         query: str,
+         collection: str,
+         return_scores: bool = False
+     ) -> Union[List[str], List[Tuple[str, float]]]:
+         """
+         Perform hybrid retrieval combining vector-based search with BM25,
+         then re-rank the combined results using a cross-encoder.
+
+         Args:
+             query (str): The user query for retrieving documents.
+             collection (str): The name of the Chroma collection to search.
+             return_scores (bool): If True, return a list of (document, score) tuples.
+                 Otherwise, return a list of document strings only.
+
+         Returns:
+             Union[List[str], List[Tuple[str, float]]]: The top-k reranked results,
+                 either as strings or (string, score) pairs.
          """
+         # Try to load the existing collection
+         try:
+             vector_store = Chroma(
+                 client=self.client,
+                 collection_name=collection,
+                 embedding_function=self.embeddings
+             )
+         except Exception as e:
+             print(f"Error loading Chroma collection '{collection}': {e}")
+             return [] if not return_scores else []
+
+         # Check if the collection is empty
+         stored_docs = vector_store.get()
+         if not stored_docs or "documents" not in stored_docs or not stored_docs["documents"]:
+             print(f"No documents found in collection '{collection}'.")
+             return [] if not return_scores else []
+
+         # Chroma's get() returns the stored texts as plain strings under "documents"
+         all_docs = list(stored_docs["documents"])
+         if not all_docs:
+             print(f"No documents found in collection '{collection}'.")
+             return [] if not return_scores else []
+
+         # Vector-based retrieval
+         try:
+             vector_retriever = vector_store.as_retriever(
+                 search_kwargs={"k": NeuroConfig.HYBRID_RERANK_TOP_K}
+             )
+             vector_results = [doc.page_content for doc in vector_retriever.invoke(query)]
+         except Exception as e:
+             print(f"Error during vector retrieval: {e}")
+             vector_results = []
+
+         # BM25 retrieval
+         tokenized_docs = [doc.split() for doc in all_docs]
+         bm25 = BM25Okapi(tokenized_docs)
+         bm25_results = bm25.get_top_n(
+             query.split(),
+             all_docs,
+             n=NeuroConfig.HYBRID_RERANK_TOP_K
          )
+
+         # Combine results and remove duplicates
+         combined = list(set(vector_results + bm25_results))
+
+         if not combined:
+             print("No documents retrieved by either BM25 or vector search.")
+             return [] if not return_scores else []
+
+         # Cross-encoder reranking
+         scores = self.cross_encoder.predict([(query, doc) for doc in combined])
+         reranked = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
+         top_results = reranked[:NeuroConfig.HYBRID_RERANK_TOP_K]
+
+         # Return based on user preference
+         if return_scores:
+             return top_results  # List[Tuple[str, float]]
+         else:
+             return [doc for doc, _ in top_results]
+
+     def _quantum_id(self, content: str) -> str:
+         """
+         Create a unique ID for each text chunk by hashing its content.
+
+         Args:
+             content (str): The text content of the chunk.
+
+         Returns:
+             str: A unique hash-based identifier.
+         """
+         return f"neuro_{hashlib.sha3_256(content.encode()).hexdigest()[:24]}"

  # ------------------------------
+ # NeuroInterface (Streamlit Example)
  # ------------------------------
+ def NeuroInterface() -> None:
+     """
+     A basic Streamlit-based interface to demonstrate usage of the NeuralDocumentProcessor.
+     This function can be adapted for Hugging Face Spaces or other frontends.
+     """
+     st.title("NeuroResearch 2.1: Robust Research System")
+
+     # Initialize Document Processor
+     processor = NeuralDocumentProcessor()
+
+     # Sidebar for uploading and processing documents
+     with st.sidebar:
+         st.header("Document Ingestion")
+         uploaded_files = st.file_uploader(
+             "Upload one or more text files",
+             type=["txt", "md", "pdf"],
+             accept_multiple_files=True
          )
+         collection_name = st.text_input("Collection Name", value="default_collection")
+
+         use_concurrency = st.checkbox("Use Concurrency for Processing?", value=False)
+
+         if st.button("Process Documents"):
+             if uploaded_files and collection_name.strip():
+                 # Read files
+                 docs_content = []
+                 for uf in uploaded_files:
+                     content = uf.read()
+                     # Assume UTF-8; adapt as needed
+                     try:
+                         docs_content.append(content.decode("utf-8"))
+                     except UnicodeDecodeError:
+                         st.error(f"Could not decode {uf.name}. Make sure it's UTF-8 text.")
+                 st.write("Processing documents...")
+                 vectorstore = processor.process_documents(
+                     documents=docs_content,
+                     collection=collection_name,
+                     use_concurrency=use_concurrency
+                 )
+                 if vectorstore:
+                     st.success(f"Documents processed and stored in collection: {collection_name}")
+                 else:
+                     st.error("Processing failed or returned no vectorstore.")
+
+     # Main interface for querying
+     st.subheader("Query Documents")
+     user_query = st.text_input("Enter your query:")
+     return_scores = st.checkbox("Return Scores?")
+
+     if st.button("Search"):
+         if not user_query.strip() or not collection_name.strip():
+             st.warning("Please provide both a query and a valid collection name.")
+         else:
+             st.write(f"Retrieving from collection: {collection_name}")
+             results = processor.hybrid_retrieval(
+                 query=user_query,
+                 collection=collection_name,
+                 return_scores=return_scores
              )
+             if results:
+                 st.write("Top Reranked Results:")
+                 if return_scores:
+                     # Each result is (doc, score)
+                     for idx, (doc, score) in enumerate(results, start=1):
+                         st.markdown(f"**Result {idx} | Score: {score:.4f}**")
+                         st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
+                 else:
+                     # Just doc texts
+                     for idx, doc in enumerate(results, start=1):
+                         st.markdown(f"**Result {idx}**")
+                         st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
+             else:
+                 st.warning("No results found or collection may be empty.")

  # ------------------------------
+ # Main Entry Point
+ # ------------------------------
  if __name__ == "__main__":
+     NeuroInterface()
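For reference, a minimal usage sketch of the new API outside Streamlit (not part of the commit): it assumes the file above is importable as app.py, that a valid OPENAI_API_KEY is set for the embeddings, and that the collection name and sample texts are placeholders.

# Hypothetical driver script, not part of app.py.
# Assumes: app.py is on the import path and OPENAI_API_KEY is exported.
from app import NeuralDocumentProcessor

processor = NeuralDocumentProcessor()

# Ingest two toy documents into an illustrative collection.
sample_docs = [
    "Dopamine neurons signal reward prediction errors.",
    "Hippocampal place cells support spatial navigation and memory.",
]
processor.process_documents(sample_docs, collection="demo", use_concurrency=False)

# Hybrid retrieval: vector search + BM25, reranked by the cross-encoder.
hits = processor.hybrid_retrieval(
    query="How does the brain represent reward prediction?",
    collection="demo",
    return_scores=True,
)
for text, score in hits:
    print(f"{score:.3f}  {text[:80]}")

This mirrors the retrieve-then-rerank pattern the new hybrid_retrieval method implements: candidates from both retrievers are deduplicated and the cross-encoder provides the final ordering.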