# ------------------------------
# NeuroResearch 2.1: Robust Research System
# ------------------------------
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
from typing import List, Optional, Tuple, Union
import chromadb
import os
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed
import streamlit as st
# ------------------------------
# Configuration
# ------------------------------
class NeuroConfig:
    """
    Configuration class for the NeuroResearch system.

    Attributes:
        DEEPSEEK_API_KEY (str): Optional API key for external services.
        CHROMA_PATH (str): File path for Chroma's persistent storage.
        CHUNK_SIZE (int): Maximum length of text chunks for splitting.
        CHUNK_OVERLAP (int): Overlap between text chunks to preserve context.
        MAX_CONCURRENT_REQUESTS (int): Number of concurrent threads for processing.
        EMBEDDING_DIMENSIONS (int): Dimensionality of embeddings.
        HYBRID_RERANK_TOP_K (int): Number of documents to retrieve and rerank.
        ANALYSIS_MODES (dict): Supported analysis modes and their descriptions.
        CACHE_TTL (int): Time-to-live (seconds) for cached items.
    """
    DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
    CHROMA_PATH = "neuro_db"
    CHUNK_SIZE = 512
    CHUNK_OVERLAP = 64
    MAX_CONCURRENT_REQUESTS = 7
    EMBEDDING_DIMENSIONS = 3072
    HYBRID_RERANK_TOP_K = 15
    ANALYSIS_MODES = {
        "technical": "Deep Technical Analysis",
        "comparative": "Cross-Paper Comparison",
        "temporal": "Temporal Trend Analysis",
        "critical": "Critical Literature Review",
    }
    CACHE_TTL = 3600  # 1 hour
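
# A rough worked example of how the chunking parameters interact (illustrative
# arithmetic only): with CHUNK_SIZE = 512 and CHUNK_OVERLAP = 64, consecutive
# chunks share up to 64 characters, so the splitter advances roughly
# 512 - 64 = 448 characters per chunk. A 10,000-character document therefore
# yields on the order of 10000 / 448 ≈ 22 chunks, fewer when separators let
# chunks end early.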
# ------------------------------
# Document Processor
# ------------------------------
class NeuralDocumentProcessor:
    """
    A document processing and retrieval utility class.

    Responsibilities:
        - Splitting documents into manageable chunks.
        - Storing and retrieving embeddings with Chroma.
        - Performing hybrid retrieval (vector + BM25) with cross-encoder reranking.
        - Handling concurrency during document ingestion (optional).
    """

    def __init__(self) -> None:
        """
        Initialize the NeuralDocumentProcessor with a persistent Chroma client,
        OpenAI-based embeddings, a CrossEncoder for reranking, and a text splitter.
        """
        # Persistent Chroma client; fall back to an in-memory client on failure
        try:
            self.client = chromadb.PersistentClient(path=NeuroConfig.CHROMA_PATH)
        except Exception as e:
            print(f"Error initializing Chroma PersistentClient: {e}")
            self.client = chromadb.Client()
        # Embeddings (OpenAI-based)
        self.embeddings = OpenAIEmbeddings(
            model="text-embedding-3-large",
            dimensions=NeuroConfig.EMBEDDING_DIMENSIONS,
        )
        # Cross-encoder for reranking
        self.cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")
        # Text splitter. The "(?<=\. )" separator is a regex lookbehind, so the
        # splitter must be told to treat separators as regular expressions;
        # without is_separator_regex=True it would be matched as a literal string.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=NeuroConfig.CHUNK_SIZE,
            chunk_overlap=NeuroConfig.CHUNK_OVERLAP,
            separators=["\n\n", "\n", "(?<=\\. )", " "],
            is_separator_regex=True,
        )
    def process_documents(
        self,
        documents: List[str],
        collection: str,
        use_concurrency: bool = False,
    ) -> Optional[Chroma]:
        """
        Process a list of document strings by splitting, embedding, and storing them in Chroma.
        Optionally uses concurrency for splitting documents.

        Args:
            documents (List[str]): The list of raw document texts.
            collection (str): The Chroma collection name to store these documents in.
            use_concurrency (bool, optional): If True, split documents concurrently. Defaults to False.

        Returns:
            Optional[Chroma]: The Chroma vectorstore for the specified collection,
            or None if there was nothing to store or storage failed.
        """
        if not documents:
            print("No documents provided to process_documents.")
            return None

        # Split documents into chunks
        chunks = []
        if use_concurrency and len(documents) > 1:
            with ThreadPoolExecutor(max_workers=NeuroConfig.MAX_CONCURRENT_REQUESTS) as executor:
                future_to_doc = {
                    executor.submit(self.text_splitter.create_documents, [doc]): doc
                    for doc in documents
                }
                for future in as_completed(future_to_doc):
                    try:
                        chunks.extend(future.result())
                    except Exception as e:
                        print(f"Error splitting document: {e}")
        else:
            # Single-threaded splitting
            for doc in documents:
                chunks.extend(self.text_splitter.create_documents([doc]))

        if not chunks:
            print("Splitting produced no chunks; nothing to store.")
            return None

        # Build content-hash IDs and drop duplicate chunks: identical chunks
        # hash to the same ID, and Chroma rejects duplicate IDs in a single add
        seen_ids = set()
        unique_chunks, chunk_ids = [], []
        for chunk in chunks:
            cid = self._quantum_id(chunk.page_content)
            if cid not in seen_ids:
                seen_ids.add(cid)
                unique_chunks.append(chunk)
                chunk_ids.append(cid)

        # Create (or extend) the Chroma collection from the chunks
        try:
            return Chroma.from_documents(
                documents=unique_chunks,
                embedding=self.embeddings,
                client=self.client,
                collection_name=collection,
                ids=chunk_ids,
            )
        except Exception as e:
            print(f"Error creating Chroma collection: {e}")
            return None
    def hybrid_retrieval(
        self,
        query: str,
        collection: str,
        return_scores: bool = False,
    ) -> Union[List[str], List[Tuple[str, float]]]:
        """
        Perform hybrid retrieval combining vector-based search with BM25,
        then re-rank the combined results using a cross-encoder.

        Args:
            query (str): The user query for retrieving documents.
            collection (str): The name of the Chroma collection to search.
            return_scores (bool): If True, return a list of (document, score) tuples.
                Otherwise, return a list of document strings only.

        Returns:
            Union[List[str], List[Tuple[str, float]]]: The top-k reranked results,
            either as strings or (string, score) pairs.
        """
        # Try to load the existing collection
        try:
            vector_store = Chroma(
                client=self.client,
                collection_name=collection,
                embedding_function=self.embeddings,
            )
        except Exception as e:
            print(f"Error loading Chroma collection '{collection}': {e}")
            return []

        # Check whether the collection holds any documents. Chroma's get()
        # returns a dict whose "documents" entry is a list of raw strings.
        stored_docs = vector_store.get()
        all_docs = stored_docs.get("documents") or []
        if not all_docs:
            print(f"No documents found in collection '{collection}'.")
            return []

        # Vector-based retrieval
        try:
            vector_retriever = vector_store.as_retriever(
                search_kwargs={"k": NeuroConfig.HYBRID_RERANK_TOP_K}
            )
            vector_results = [doc.page_content for doc in vector_retriever.invoke(query)]
        except Exception as e:
            print(f"Error during vector retrieval: {e}")
            vector_results = []

        # BM25 (lexical) retrieval over the whole collection
        tokenized_docs = [doc.split() for doc in all_docs]
        bm25 = BM25Okapi(tokenized_docs)
        bm25_results = bm25.get_top_n(
            query.split(),
            all_docs,
            n=NeuroConfig.HYBRID_RERANK_TOP_K,
        )

        # Combine results and remove duplicates
        combined = list(set(vector_results + bm25_results))
        if not combined:
            print("No documents retrieved by either BM25 or vector search.")
            return []

        # Cross-encoder reranking of (query, document) pairs
        scores = self.cross_encoder.predict([(query, doc) for doc in combined])
        reranked = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
        top_results = reranked[:NeuroConfig.HYBRID_RERANK_TOP_K]

        if return_scores:
            return [(doc, float(score)) for doc, score in top_results]
        return [doc for doc, _ in top_results]
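
    # Worked example (illustrative numbers only): if the cross-encoder scored
    # three candidates as {A: 7.1, B: -2.3, C: 4.0}, the reranked order would be
    # [A, C, B]; if HYBRID_RERANK_TOP_K were 2, only [A, C] would be returned.
    # Note that ms-marco cross-encoders emit unbounded logits, so negative
    # scores are normal and only the relative ordering matters.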
    def _quantum_id(self, content: str) -> str:
        """
        Create a deterministic, unique ID for a text chunk by hashing its content.

        Args:
            content (str): The text content of the chunk.

        Returns:
            str: A unique hash-based identifier.
        """
        return f"neuro_{hashlib.sha3_256(content.encode()).hexdigest()[:24]}"
# ------------------------------
# NeuroInterface (Streamlit Example)
# ------------------------------
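# Streamlit re-runs this script on every interaction, which would otherwise
# re-create the Chroma client and reload the cross-encoder each time. The
# cached accessor below is a small sketch of the usual fix; the helper name
# `_get_processor` is ours, not part of the original design.
@st.cache_resource
def _get_processor() -> NeuralDocumentProcessor:
    """Build the document processor once and reuse it across reruns."""
    return NeuralDocumentProcessor()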
def NeuroInterface() -> None:
    """
    A basic Streamlit interface demonstrating the NeuralDocumentProcessor.
    This function can be adapted for Hugging Face Spaces or other frontends.
    """
    st.title("NeuroResearch 2.1: Robust Research System")

    # Reuse the cached document processor (see _get_processor above)
    processor = _get_processor()

    # Sidebar for uploading and processing documents
    with st.sidebar:
        st.header("Document Ingestion")
        uploaded_files = st.file_uploader(
            "Upload one or more text files",
            type=["txt", "md", "pdf"],
            accept_multiple_files=True,
        )
        collection_name = st.text_input("Collection Name", value="default_collection")
        use_concurrency = st.checkbox("Use Concurrency for Processing?", value=False)

        if st.button("Process Documents"):
            if uploaded_files and collection_name.strip():
                # Read files as UTF-8 plain text. Binary formats such as real
                # PDFs will fail this decode and need a dedicated parser.
                docs_content = []
                for uf in uploaded_files:
                    content = uf.read()
                    try:
                        docs_content.append(content.decode("utf-8"))
                    except UnicodeDecodeError:
                        st.error(f"Could not decode {uf.name}. Make sure it's UTF-8 text.")
                st.write("Processing documents...")
                vectorstore = processor.process_documents(
                    documents=docs_content,
                    collection=collection_name,
                    use_concurrency=use_concurrency,
                )
                if vectorstore:
                    st.success(f"Documents processed and stored in collection: {collection_name}")
                else:
                    st.error("Processing failed or returned no vectorstore.")
            else:
                st.warning("Please upload at least one file and provide a collection name.")

    # Main interface for querying
    st.subheader("Query Documents")
    user_query = st.text_input("Enter your query:")
    return_scores = st.checkbox("Return Scores?")

    if st.button("Search"):
        if not user_query.strip() or not collection_name.strip():
            st.warning("Please provide both a query and a valid collection name.")
        else:
            st.write(f"Retrieving from collection: {collection_name}")
            results = processor.hybrid_retrieval(
                query=user_query,
                collection=collection_name,
                return_scores=return_scores,
            )
            if results:
                st.write("Top Reranked Results:")
                if return_scores:
                    # Each result is a (document, score) pair
                    for idx, (doc, score) in enumerate(results, start=1):
                        st.markdown(f"**Result {idx} | Score: {score:.4f}**")
                        st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
                else:
                    # Document texts only
                    for idx, doc in enumerate(results, start=1):
                        st.markdown(f"**Result {idx}**")
                        st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
            else:
                st.warning("No results found or collection may be empty.")


# ------------------------------
# Main Entry Point
# ------------------------------
if __name__ == "__main__":
    NeuroInterface()
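
# To launch the interface locally (assuming this file is saved as app.py):
#   streamlit run app.py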