# ------------------------------
# NeuroResearch 2.1: Robust Research System
# ------------------------------
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
from typing_extensions import TypedDict, Annotated
from typing import (
    Sequence,
    Dict,
    List,
    Optional,
    Any,
    Tuple,
    Union
)
import chromadb
import os
import hashlib
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
import streamlit as st
import plotly.express as px
import pandas as pd

# ------------------------------
# Configuration
# ------------------------------
class NeuroConfig:
    """
    Configuration class for the NeuroResearch system.

    Attributes:
        DEEPSEEK_API_KEY (str): Optional API key for external services.
        CHROMA_PATH (str): File path for Chroma's persistent storage.
        CHUNK_SIZE (int): Maximum length of text chunks for splitting.
        CHUNK_OVERLAP (int): Overlap between text chunks to preserve context.
        MAX_CONCURRENT_REQUESTS (int): Number of concurrent threads for processing.
        EMBEDDING_DIMENSIONS (int): Dimensionality of embeddings.
        HYBRID_RERANK_TOP_K (int): Number of documents to retrieve and rerank.
        ANALYSIS_MODES (dict): Possible analysis modes and their descriptions.
        CACHE_TTL (int): Time-to-live (seconds) for cached items.
    """
    DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
    CHROMA_PATH = "neuro_db"
    CHUNK_SIZE = 512
    CHUNK_OVERLAP = 64
    MAX_CONCURRENT_REQUESTS = 7
    EMBEDDING_DIMENSIONS = 3072
    HYBRID_RERANK_TOP_K = 15
    ANALYSIS_MODES = {
        "technical": "Deep Technical Analysis",
        "comparative": "Cross-Paper Comparison",
        "temporal": "Temporal Trend Analysis",
        "critical": "Critical Literature Review"
    }
    CACHE_TTL = 3600  # 1 hour
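
# ------------------------------
# Chunking illustration (optional)
# ------------------------------
# A minimal sketch, not used by the pipeline below: it shows how the
# CHUNK_SIZE / CHUNK_OVERLAP values above drive RecursiveCharacterTextSplitter.
# The function name and sample text are illustrative placeholders.
def _illustrate_chunking(sample_text: str = "word " * 400) -> List[str]:
    """Split a sample string using the configured chunk size and overlap."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=NeuroConfig.CHUNK_SIZE,
        chunk_overlap=NeuroConfig.CHUNK_OVERLAP,
    )
    chunks = splitter.split_text(sample_text)
    # Each chunk is at most ~CHUNK_SIZE characters; consecutive chunks share up
    # to CHUNK_OVERLAP characters so context is not lost at the cut points.
    return chunks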

# ------------------------------
# Document Processor
# ------------------------------
class NeuralDocumentProcessor:
    """
    A document processing and retrieval utility class.

    Responsibilities:
        - Splitting documents into manageable chunks.
        - Storing and retrieving embeddings with Chroma.
        - Performing hybrid retrieval (vector + BM25) with cross-encoder reranking.
        - Handling concurrency during document ingestion (optional).
    """

    def __init__(self) -> None:
        """
        Initialize the NeuralDocumentProcessor with a persistent Chroma client,
        OpenAI-based embeddings, a CrossEncoder for reranking, and a text splitter.
        """
        # Persistent Chroma client
        try:
            self.client = chromadb.PersistentClient(path=NeuroConfig.CHROMA_PATH)
        except Exception as e:
            # Fall back to an in-memory client if persistent storage fails
            print(f"Error initializing Chroma PersistentClient: {e}")
            self.client = chromadb.Client()

        # Embeddings (OpenAI-based)
        self.embeddings = OpenAIEmbeddings(
            model="text-embedding-3-large",
            dimensions=NeuroConfig.EMBEDDING_DIMENSIONS
        )

        # Cross-encoder for reranking
        self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

        # Text splitter. The lookbehind pattern "(?<=\. )" only works if the
        # separators are interpreted as regexes, so is_separator_regex is
        # enabled; otherwise the pattern would be matched literally and the
        # splitter would never break on sentence boundaries.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=NeuroConfig.CHUNK_SIZE,
            chunk_overlap=NeuroConfig.CHUNK_OVERLAP,
            separators=["\n\n", "\n", "(?<=\\. )", " "],
            is_separator_regex=True,
        )

    def process_documents(
        self,
        documents: List[str],
        collection: str,
        use_concurrency: bool = False
    ) -> Optional[Chroma]:
        """
        Process a list of document strings by splitting, embedding, and storing
        them in Chroma. Optionally uses concurrency for splitting documents.

        Args:
            documents (List[str]): The list of raw document texts.
            collection (str): The Chroma collection name to store these documents in.
            use_concurrency (bool, optional): If True, split documents concurrently.
                Defaults to False.

        Returns:
            Optional[Chroma]: The Chroma vectorstore for the specified collection,
            or None if no documents were provided or ingestion failed.
        """
        if not documents:
            print("No documents provided to process_documents.")
            return None

        # Split documents into chunks
        if use_concurrency and len(documents) > 1:
            chunks = []
            with ThreadPoolExecutor(max_workers=NeuroConfig.MAX_CONCURRENT_REQUESTS) as executor:
                future_to_doc = {
                    executor.submit(self.text_splitter.create_documents, [doc]): doc
                    for doc in documents
                }
                for future in as_completed(future_to_doc):
                    try:
                        result = future.result()
                        chunks.extend(result)
                    except Exception as e:
                        print(f"Error splitting document: {e}")
        else:
            # Single-threaded splitting
            chunks = []
            for doc in documents:
                chunks.extend(self.text_splitter.create_documents([doc]))

        # Build unique IDs for each chunk (content-identical chunks share an ID)
        chunk_ids = [self._quantum_id(doc.page_content) for doc in chunks]

        # Create Chroma from documents
        try:
            vectorstore = Chroma.from_documents(
                documents=chunks,
                embedding=self.embeddings,
                client=self.client,
                collection_name=collection,
                ids=chunk_ids
            )
            return vectorstore
        except Exception as e:
            print(f"Error creating Chroma collection: {e}")
            return None

    def hybrid_retrieval(
        self,
        query: str,
        collection: str,
        return_scores: bool = False
    ) -> Union[List[str], List[Tuple[str, float]]]:
        """
        Perform hybrid retrieval combining vector-based search with BM25,
        then re-rank the combined results using a cross-encoder.

        Args:
            query (str): The user query for retrieving documents.
            collection (str): The name of the Chroma collection to search.
            return_scores (bool): If True, return a list of (document, score) tuples.
                Otherwise, return a list of document strings only.

        Returns:
            Union[List[str], List[Tuple[str, float]]]: The top-k reranked results,
            either as strings or (string, score) pairs.
        """
        # Try to load the existing collection
        try:
            vector_store = Chroma(
                client=self.client,
                collection_name=collection,
                embedding_function=self.embeddings
            )
        except Exception as e:
            print(f"Error loading Chroma collection '{collection}': {e}")
            return []

        # Check if the collection is empty. Chroma's get() returns a dict whose
        # "documents" entry is a list of raw strings, not Document objects.
        stored_docs = vector_store.get()
        if not stored_docs or "documents" not in stored_docs or not stored_docs["documents"]:
            print(f"No documents found in collection '{collection}'.")
            return []

        all_docs = stored_docs["documents"]

        # Vector-based retrieval
        try:
            vector_retriever = vector_store.as_retriever(
                search_kwargs={"k": NeuroConfig.HYBRID_RERANK_TOP_K}
            )
            vector_results = [doc.page_content for doc in vector_retriever.invoke(query)]
        except Exception as e:
            print(f"Error during vector retrieval: {e}")
            vector_results = []

        # BM25 retrieval over the raw stored texts
        tokenized_docs = [doc.split() for doc in all_docs]
        bm25 = BM25Okapi(tokenized_docs)
        bm25_results = bm25.get_top_n(
            query.split(),
            all_docs,
            n=NeuroConfig.HYBRID_RERANK_TOP_K
        )

        # Combine results and remove duplicates
        combined = list(set(vector_results + bm25_results))
        if not combined:
            print("No documents retrieved by either BM25 or vector search.")
            return []

        # Cross-encoder reranking
        scores = self.cross_encoder.predict([(query, doc) for doc in combined])
        reranked = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
        top_results = reranked[:NeuroConfig.HYBRID_RERANK_TOP_K]

        # Return based on user preference
        if return_scores:
            return top_results  # List[Tuple[str, float]]
        return [doc for doc, _ in top_results]

    def _quantum_id(self, content: str) -> str:
        """
        Create a unique ID for each text chunk by hashing its content.

        Args:
            content (str): The text content of the chunk.

        Returns:
            str: A unique hash-based identifier.
        """
        return f"neuro_{hashlib.sha3_256(content.encode()).hexdigest()[:24]}"
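
# ------------------------------
# Programmatic usage sketch (optional)
# ------------------------------
# A minimal sketch of how the processor might be driven without the Streamlit
# UI, e.g. from a script or notebook. It is not called anywhere in this module;
# the function name, collection name, and sample texts are placeholders, and it
# assumes OPENAI_API_KEY is set so the embeddings client can authenticate.
def _demo_hybrid_search() -> None:
    processor = NeuralDocumentProcessor()
    processor.process_documents(
        documents=[
            "Transformers rely on self-attention to model long-range dependencies.",
            "BM25 is a bag-of-words ranking function based on term frequency.",
        ],
        collection="demo_collection",
    )
    # Retrieve with scores: each result is a (chunk_text, cross-encoder score) pair.
    for text, score in processor.hybrid_retrieval(
        query="How does attention handle long-range context?",
        collection="demo_collection",
        return_scores=True,
    ):
        print(f"{score:.3f}  {text[:80]}")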
""" # Try to load the existing collection try: vector_store = Chroma( client=self.client, collection_name=collection, embedding_function=self.embeddings ) except Exception as e: print(f"Error loading Chroma collection '{collection}': {e}") return [] if not return_scores else [] # Check if the collection is empty stored_docs = vector_store.get() if not stored_docs or "documents" not in stored_docs or not stored_docs["documents"]: print(f"No documents found in collection '{collection}'.") return [] if not return_scores else [] all_docs = [doc.page_content for doc in stored_docs["documents"]] if not all_docs: print(f"No documents found in collection '{collection}'.") return [] if not return_scores else [] # Vector-based retrieval try: vector_retriever = vector_store.as_retriever( search_kwargs={"k": NeuroConfig.HYBRID_RERANK_TOP_K} ) vector_results = [doc.page_content for doc in vector_retriever.invoke(query)] except Exception as e: print(f"Error during vector retrieval: {e}") vector_results = [] # BM25 retrieval tokenized_docs = [doc.split() for doc in all_docs] bm25 = BM25Okapi(tokenized_docs) bm25_results = bm25.get_top_n( query.split(), all_docs, n=NeuroConfig.HYBRID_RERANK_TOP_K ) # Combine results and remove duplicates combined = list(set(vector_results + bm25_results)) if not combined: print("No documents retrieved by either BM25 or vector search.") return [] if not return_scores else [] # Cross-encoder reranking scores = self.cross_encoder.predict([(query, doc) for doc in combined]) reranked = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True) top_results = reranked[:NeuroConfig.HYBRID_RERANK_TOP_K] # Return based on user preference if return_scores: return top_results # List[Tuple[str, float]] else: return [doc for doc, _ in top_results] def _quantum_id(self, content: str) -> str: """ Create a unique ID for each text chunk by hashing its content. Args: content (str): The text content of the chunk. Returns: str: A unique hash-based identifier. """ return f"neuro_{hashlib.sha3_256(content.encode()).hexdigest()[:24]}" # ------------------------------ # NeuroInterface (Streamlit Example) # ------------------------------ def NeuroInterface() -> None: """ A basic Streamlit-based interface to demonstrate usage of the NeuralDocumentProcessor. This function can be adapted for Hugging Face Spaces or other frontends. """ st.title("NeuroResearch 2.1: Robust Research System") # Initialize Document Processor processor = NeuralDocumentProcessor() # Sidebar for uploading and processing documents with st.sidebar: st.header("Document Ingestion") uploaded_files = st.file_uploader( "Upload one or more text files", type=["txt", "md", "pdf"], accept_multiple_files=True ) collection_name = st.text_input("Collection Name", value="default_collection") use_concurrency = st.checkbox("Use Concurrency for Processing?", value=False) if st.button("Process Documents"): if uploaded_files and collection_name.strip(): # Read files docs_content = [] for uf in uploaded_files: content = uf.read() # Assume UTF-8; adapt as needed try: docs_content.append(content.decode("utf-8")) except UnicodeDecodeError: st.error(f"Could not decode {uf.name}. 
Make sure it's UTF-8 text.") st.write("Processing documents...") vectorstore = processor.process_documents( documents=docs_content, collection=collection_name, use_concurrency=use_concurrency ) if vectorstore: st.success(f"Documents processed and stored in collection: {collection_name}") else: st.error("Processing failed or returned no vectorstore.") # Main interface for querying st.subheader("Query Documents") user_query = st.text_input("Enter your query:") return_scores = st.checkbox("Return Scores?") if st.button("Search"): if not user_query.strip() or not collection_name.strip(): st.warning("Please provide both a query and a valid collection name.") else: st.write(f"Retrieving from collection: {collection_name}") results = processor.hybrid_retrieval( query=user_query, collection=collection_name, return_scores=return_scores ) if results: st.write("Top Reranked Results:") if return_scores: # Each result is (doc, score) for idx, (doc, score) in enumerate(results, start=1): st.markdown(f"**Result {idx} | Score: {score:.4f}**") st.write(doc[:500] + ("..." if len(doc) > 500 else "")) else: # Just doc texts for idx, doc in enumerate(results, start=1): st.markdown(f"**Result {idx}**") st.write(doc[:500] + ("..." if len(doc) > 500 else "")) else: st.warning("No results found or collection may be empty.") # ------------------------------ # Main Entry Point # ------------------------------ if __name__ == "__main__": NeuroInterface()