# ------------------------------
# NeuroResearch 2.1: Robust Research System
# ------------------------------
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
from typing_extensions import TypedDict, Annotated
from typing import (
    Sequence, Dict, List, Optional, Any, Tuple, Union
)
import chromadb
import os
import hashlib
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
import streamlit as st
import plotly.express as px
import pandas as pd


# ------------------------------
# Configuration
# ------------------------------
class NeuroConfig:
    """
    Configuration class for the NeuroResearch system.

    Attributes:
        DEEPSEEK_API_KEY (str): Optional API key for external services.
        CHROMA_PATH (str): File path for Chroma's persistent storage.
        CHUNK_SIZE (int): Maximum length of text chunks for splitting.
        CHUNK_OVERLAP (int): Overlap between text chunks to preserve context.
        MAX_CONCURRENT_REQUESTS (int): Number of concurrent threads for processing.
        EMBEDDING_DIMENSIONS (int): Dimensionality of embeddings.
        HYBRID_RERANK_TOP_K (int): Number of documents to retrieve and rerank.
        ANALYSIS_MODES (dict): Possible analysis modes and their descriptions.
        CACHE_TTL (int): Time-to-live (seconds) for cached items.
    """
    DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
    CHROMA_PATH = "neuro_db"
    CHUNK_SIZE = 512
    CHUNK_OVERLAP = 64
    MAX_CONCURRENT_REQUESTS = 7
    EMBEDDING_DIMENSIONS = 3072
    HYBRID_RERANK_TOP_K = 15
    ANALYSIS_MODES = {
        "technical": "Deep Technical Analysis",
        "comparative": "Cross-Paper Comparison",
        "temporal": "Temporal Trend Analysis",
        "critical": "Critical Literature Review",
    }
    CACHE_TTL = 3600  # 1 hour
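
# Note: the embedding model used below is served by the OpenAI API, so OPENAI_API_KEY
# must be available in the environment in addition to the optional DEEPSEEK_API_KEY
# read above. A minimal launch sketch (the filename app.py and the shell commands are
# illustrative assumptions, not part of the original setup):
#
#   export OPENAI_API_KEY="sk-..."   # required by OpenAIEmbeddings
#   export DEEPSEEK_API_KEY="..."    # optional, read by NeuroConfig
#   streamlit run app.py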


# ------------------------------
# Document Processor
# ------------------------------
class NeuralDocumentProcessor:
    """
    A document processing and retrieval utility class.

    Responsibilities:
        - Splitting documents into manageable chunks.
        - Storing and retrieving embeddings with Chroma.
        - Performing hybrid retrieval (vector + BM25) with cross-encoder reranking.
        - Handling concurrency during document ingestion (optional).
    """

    def __init__(self) -> None:
        """
        Initialize the NeuralDocumentProcessor with a persistent Chroma client,
        OpenAI-based embeddings, a CrossEncoder for reranking, and a text splitter.
        """
        # Persistent Chroma client
        try:
            self.client = chromadb.PersistentClient(path=NeuroConfig.CHROMA_PATH)
        except Exception as e:
            # Fall back to an in-memory client if persistent storage fails
            print(f"Error initializing Chroma PersistentClient: {e}")
            self.client = chromadb.Client()
        # Embeddings (OpenAI-based)
        self.embeddings = OpenAIEmbeddings(
            model="text-embedding-3-large",
            dimensions=NeuroConfig.EMBEDDING_DIMENSIONS
        )
        # Cross-encoder for reranking
        self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
        # Text splitter; the sentence-boundary separator is a regex, so it must be
        # flagged as such or it would be treated as a literal string and never match
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=NeuroConfig.CHUNK_SIZE,
            chunk_overlap=NeuroConfig.CHUNK_OVERLAP,
            separators=["\n\n", "\n", "(?<=\\. )", " "],
            is_separator_regex=True,
        )

    def process_documents(
        self,
        documents: List[str],
        collection: str,
        use_concurrency: bool = False
    ) -> Optional[Chroma]:
        """
        Process a list of document strings by splitting, embedding, and storing them in Chroma.
        Optionally uses concurrency for splitting documents.

        Args:
            documents (List[str]): The list of raw document texts.
            collection (str): The Chroma collection name to store these documents in.
            use_concurrency (bool, optional): If True, process documents concurrently. Defaults to False.

        Returns:
            Optional[Chroma]: The Chroma vectorstore for the specified collection, or None if no docs.
        """
        if not documents:
            print("No documents provided to process_documents.")
            return None
        # Split documents into chunks
        if use_concurrency and len(documents) > 1:
            chunks = []
            with ThreadPoolExecutor(max_workers=NeuroConfig.MAX_CONCURRENT_REQUESTS) as executor:
                future_to_doc = {
                    executor.submit(self.text_splitter.create_documents, [doc]): doc
                    for doc in documents
                }
                for future in as_completed(future_to_doc):
                    try:
                        result = future.result()
                        chunks.extend(result)
                    except Exception as e:
                        print(f"Error splitting document: {e}")
        else:
            # Single-threaded splitting
            chunks = []
            for doc in documents:
                chunks.extend(self.text_splitter.create_documents([doc]))
        # Build content-hash IDs for each chunk and drop duplicate chunks, since
        # duplicate IDs within a single insert can be rejected by Chroma
        unique_chunks = {self._quantum_id(chunk.page_content): chunk for chunk in chunks}
        chunk_ids = list(unique_chunks.keys())
        chunks = list(unique_chunks.values())
        # Create Chroma from documents
        try:
            vectorstore = Chroma.from_documents(
                documents=chunks,
                embedding=self.embeddings,
                client=self.client,
                collection_name=collection,
                ids=chunk_ids
            )
            return vectorstore
        except Exception as e:
            print(f"Error creating Chroma collection: {e}")
            return None
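
    # Retrieval flow implemented below: dense results from the Chroma vector store
    # and sparse results from BM25Okapi over all stored chunks are unioned, each
    # (query, chunk) pair is scored by the cross-encoder, and only the
    # HYBRID_RERANK_TOP_K highest-scoring chunks are returned.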
    def hybrid_retrieval(
        self,
        query: str,
        collection: str,
        return_scores: bool = False
    ) -> Union[List[str], List[Tuple[str, float]]]:
        """
        Perform hybrid retrieval combining vector-based search with BM25,
        then re-rank the combined results using a cross-encoder.

        Args:
            query (str): The user query for retrieving documents.
            collection (str): The name of the Chroma collection to search.
            return_scores (bool): If True, return a list of (document, score) tuples.
                Otherwise, return a list of document strings only.

        Returns:
            Union[List[str], List[Tuple[str, float]]]: The top-k reranked results,
            either as strings or (string, score) pairs.
        """
        # Try to load the existing collection
        try:
            vector_store = Chroma(
                client=self.client,
                collection_name=collection,
                embedding_function=self.embeddings
            )
        except Exception as e:
            print(f"Error loading Chroma collection '{collection}': {e}")
            return []
        # Check whether the collection holds any documents; Chroma's get() returns
        # the stored chunk texts as plain strings, not Document objects
        stored_docs = vector_store.get()
        all_docs = (stored_docs or {}).get("documents") or []
        if not all_docs:
            print(f"No documents found in collection '{collection}'.")
            return []
        # Vector-based retrieval
        try:
            vector_retriever = vector_store.as_retriever(
                search_kwargs={"k": NeuroConfig.HYBRID_RERANK_TOP_K}
            )
            vector_results = [doc.page_content for doc in vector_retriever.invoke(query)]
        except Exception as e:
            print(f"Error during vector retrieval: {e}")
            vector_results = []
        # BM25 retrieval
        tokenized_docs = [doc.split() for doc in all_docs]
        bm25 = BM25Okapi(tokenized_docs)
        bm25_results = bm25.get_top_n(
            query.split(),
            all_docs,
            n=NeuroConfig.HYBRID_RERANK_TOP_K
        )
        # Combine results and remove duplicates
        combined = list(set(vector_results + bm25_results))
        if not combined:
            print("No documents retrieved by either BM25 or vector search.")
            return []
        # Cross-encoder reranking
        scores = self.cross_encoder.predict([(query, doc) for doc in combined])
        reranked = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
        top_results = reranked[:NeuroConfig.HYBRID_RERANK_TOP_K]
        # Return based on user preference
        if return_scores:
            return top_results  # List[Tuple[str, float]]
        return [doc for doc, _ in top_results]

    def _quantum_id(self, content: str) -> str:
        """
        Create a unique ID for each text chunk by hashing its content.

        Args:
            content (str): The text content of the chunk.

        Returns:
            str: A unique hash-based identifier.
        """
        return f"neuro_{hashlib.sha3_256(content.encode()).hexdigest()[:24]}"


# ------------------------------
# NeuroInterface (Streamlit Example)
# ------------------------------
def NeuroInterface() -> None:
    """
    A basic Streamlit-based interface to demonstrate usage of the NeuralDocumentProcessor.
    This function can be adapted for Hugging Face Spaces or other frontends.
    """
    st.title("NeuroResearch 2.1: Robust Research System")
    # Initialize Document Processor
    processor = NeuralDocumentProcessor()
    # Sidebar for uploading and processing documents
    with st.sidebar:
        st.header("Document Ingestion")
        uploaded_files = st.file_uploader(
            "Upload one or more text files",
            type=["txt", "md", "pdf"],
            accept_multiple_files=True
        )
        collection_name = st.text_input("Collection Name", value="default_collection")
        use_concurrency = st.checkbox("Use Concurrency for Processing?", value=False)
        if st.button("Process Documents"):
            if uploaded_files and collection_name.strip():
                # Read files; assumes UTF-8 text. Binary formats such as PDF will
                # usually fail this decode and need a dedicated parser instead.
                docs_content = []
                for uf in uploaded_files:
                    content = uf.read()
                    try:
                        docs_content.append(content.decode("utf-8"))
                    except UnicodeDecodeError:
                        st.error(f"Could not decode {uf.name}. Make sure it's UTF-8 text.")
                st.write("Processing documents...")
                vectorstore = processor.process_documents(
                    documents=docs_content,
                    collection=collection_name,
                    use_concurrency=use_concurrency
                )
                if vectorstore:
                    st.success(f"Documents processed and stored in collection: {collection_name}")
                else:
                    st.error("Processing failed or returned no vectorstore.")
            else:
                st.warning("Please upload at least one file and provide a collection name.")
    # Main interface for querying
    st.subheader("Query Documents")
    user_query = st.text_input("Enter your query:")
    return_scores = st.checkbox("Return Scores?")
    if st.button("Search"):
        if not user_query.strip() or not collection_name.strip():
            st.warning("Please provide both a query and a valid collection name.")
        else:
            st.write(f"Retrieving from collection: {collection_name}")
            results = processor.hybrid_retrieval(
                query=user_query,
                collection=collection_name,
                return_scores=return_scores
            )
            if results:
                st.write("Top Reranked Results:")
                if return_scores:
                    # Each result is (doc, score)
                    for idx, (doc, score) in enumerate(results, start=1):
                        st.markdown(f"**Result {idx} | Score: {score:.4f}**")
                        st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
                else:
                    # Just doc texts
                    for idx, doc in enumerate(results, start=1):
                        st.markdown(f"**Result {idx}**")
                        st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
            else:
                st.warning("No results found or collection may be empty.")


# ------------------------------
# Main Entry Point
# ------------------------------
if __name__ == "__main__":
    NeuroInterface()