# ------------------------------
# NeuroResearch 2.1: Robust Research System
# ------------------------------
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
from typing import List, Optional, Tuple, Union
import chromadb
import os
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed
import streamlit as st
# ------------------------------
# Configuration
# ------------------------------
class NeuroConfig:
"""
Configuration class for NeuroResearch system.
Attributes:
DEEPSEEK_API_KEY (str): Optional API key for external services.
CHROMA_PATH (str): File path for Chroma's persistent storage.
CHUNK_SIZE (int): Maximum length of text chunks for splitting.
CHUNK_OVERLAP (int): Overlap between text chunks to preserve context.
MAX_CONCURRENT_REQUESTS (int): Number of concurrent threads for processing.
EMBEDDING_DIMENSIONS (int): Dimensionality of embeddings.
HYBRID_RERANK_TOP_K (int): Number of documents to retrieve and rerank.
ANALYSIS_MODES (dict): Possible analysis modes and their descriptions.
CACHE_TTL (int): Time-to-live (seconds) for cached items.
"""
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
CHROMA_PATH = "neuro_db"
CHUNK_SIZE = 512
CHUNK_OVERLAP = 64
MAX_CONCURRENT_REQUESTS = 7
EMBEDDING_DIMENSIONS = 3072
HYBRID_RERANK_TOP_K = 15
ANALYSIS_MODES = {
"technical": "Deep Technical Analysis",
"comparative": "Cross-Paper Comparison",
"temporal": "Temporal Trend Analysis",
"critical": "Critical Literature Review"
}
CACHE_TTL = 3600 # 1 hour
# ------------------------------
# Document Processor
# ------------------------------
class NeuralDocumentProcessor:
"""
A document processing and retrieval utility class.
Responsibilities:
- Splitting documents into manageable chunks.
- Storing and retrieving embeddings with Chroma.
- Performing hybrid retrieval (vector + BM25) with cross-encoder reranking.
- Handling concurrency during document ingestion (optional).
"""
def __init__(self) -> None:
"""
Initialize the NeuralDocumentProcessor with a persistent Chroma client,
OpenAI-based embeddings, a CrossEncoder for reranking, and a text splitter.
"""
# Persistent Chroma client
try:
self.client = chromadb.PersistentClient(path=NeuroConfig.CHROMA_PATH)
except Exception as e:
# Fallback to in-memory client if persistent fails
print(f"Error initializing Chroma PersistentClient: {e}")
self.client = chromadb.Client()
# Embeddings (OpenAI-based)
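        # text-embedding-3-large natively produces 3072-dimensional vectors; the
        # dimensions argument requests that size explicitly and would truncate
        # the embedding if a smaller value were configured.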
self.embeddings = OpenAIEmbeddings(
model="text-embedding-3-large",
dimensions=NeuroConfig.EMBEDDING_DIMENSIONS
)
# Cross-encoder for reranking
self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
# Text splitter
self.text_splitter = RecursiveCharacterTextSplitter(
chunk_size=NeuroConfig.CHUNK_SIZE,
chunk_overlap=NeuroConfig.CHUNK_OVERLAP,
            separators=["\n\n", "\n", "(?<=\\. )", " "],
            is_separator_regex=True,  # needed so the sentence-boundary lookbehind is treated as a regex
        )
def process_documents(
self,
documents: List[str],
collection: str,
use_concurrency: bool = False
) -> Optional[Chroma]:
"""
Process a list of document strings by splitting, embedding, and storing them in Chroma.
Optionally uses concurrency for splitting documents.
Args:
documents (List[str]): The list of raw document texts.
collection (str): The Chroma collection name to store these documents in.
use_concurrency (bool, optional): If True, process documents concurrently. Defaults to False.
Returns:
Optional[Chroma]: The Chroma vectorstore for the specified collection, or None if no docs.
"""
if not documents:
print("No documents provided to process_documents.")
return None
# Split documents into chunks
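        # With use_concurrency=True each document is split in its own worker
        # thread (bounded by MAX_CONCURRENT_REQUESTS). Splitting is pure-Python
        # work, so the speedup is modest, but a failure in one document does
        # not abort the others.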
if use_concurrency and len(documents) > 1:
chunks = []
with ThreadPoolExecutor(max_workers=NeuroConfig.MAX_CONCURRENT_REQUESTS) as executor:
future_to_doc = {
executor.submit(self.text_splitter.create_documents, [doc]): doc
for doc in documents
}
for future in as_completed(future_to_doc):
try:
result = future.result()
chunks.extend(result)
except Exception as e:
print(f"Error splitting document: {e}")
else:
# Single-threaded splitting
chunks = []
for doc in documents:
chunks.extend(self.text_splitter.create_documents([doc]))
# Build unique IDs for each chunk
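        # IDs are deterministic content hashes (see _quantum_id), so identical
        # chunks always map to the same ID across runs.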
chunk_ids = [self._quantum_id(doc.page_content) for doc in chunks]
# Create Chroma from documents
try:
vectorstore = Chroma.from_documents(
documents=chunks,
embedding=self.embeddings,
client=self.client,
collection_name=collection,
ids=chunk_ids
)
return vectorstore
except Exception as e:
print(f"Error creating Chroma collection: {e}")
return None
def hybrid_retrieval(
self,
query: str,
collection: str,
return_scores: bool = False
) -> Union[List[str], List[Tuple[str, float]]]:
"""
Perform hybrid retrieval combining vector-based search with BM25,
then re-rank the combined results using a cross-encoder.
Args:
query (str): The user query for retrieving documents.
collection (str): The name of the Chroma collection to search.
return_scores (bool): If True, return a list of (document, score) tuples.
Otherwise, return a list of document strings only.
Returns:
Union[List[str], List[Tuple[str, float]]]: The top-k reranked results,
either as strings or (string, score) pairs.
"""
# Try to load the existing collection
try:
vector_store = Chroma(
client=self.client,
collection_name=collection,
embedding_function=self.embeddings
)
        except Exception as e:
            print(f"Error loading Chroma collection '{collection}': {e}")
            return []
        # Check if the collection is empty
        stored_docs = vector_store.get()
        if not stored_docs or not stored_docs.get("documents"):
            print(f"No documents found in collection '{collection}'.")
            return []
        # Chroma's get() returns plain strings, not Document objects
        all_docs = stored_docs["documents"]
# Vector-based retrieval
try:
vector_retriever = vector_store.as_retriever(
search_kwargs={"k": NeuroConfig.HYBRID_RERANK_TOP_K}
)
vector_results = [doc.page_content for doc in vector_retriever.invoke(query)]
except Exception as e:
print(f"Error during vector retrieval: {e}")
vector_results = []
# BM25 retrieval
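        # BM25 scores whitespace-tokenized chunks by term frequency and inverse
        # document frequency, giving an exact-keyword signal that complements
        # the semantic vector search above.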
tokenized_docs = [doc.split() for doc in all_docs]
bm25 = BM25Okapi(tokenized_docs)
bm25_results = bm25.get_top_n(
query.split(),
all_docs,
n=NeuroConfig.HYBRID_RERANK_TOP_K
)
# Combine results and remove duplicates
combined = list(set(vector_results + bm25_results))
if not combined:
print("No documents retrieved by either BM25 or vector search.")
            return []
# Cross-encoder reranking
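        # The cross-encoder reads each (query, document) pair jointly, which is
        # slower than the retrieval steps above but yields a more reliable
        # relevance ordering for the final top-k results.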
scores = self.cross_encoder.predict([(query, doc) for doc in combined])
reranked = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
top_results = reranked[:NeuroConfig.HYBRID_RERANK_TOP_K]
# Return based on user preference
if return_scores:
return top_results # List[Tuple[str, float]]
else:
return [doc for doc, _ in top_results]
def _quantum_id(self, content: str) -> str:
"""
Create a unique ID for each text chunk by hashing its content.
Args:
content (str): The text content of the chunk.
Returns:
str: A unique hash-based identifier.
"""
return f"neuro_{hashlib.sha3_256(content.encode()).hexdigest()[:24]}"
# ------------------------------
# NeuroInterface (Streamlit Example)
# ------------------------------
def NeuroInterface() -> None:
"""
A basic Streamlit-based interface to demonstrate usage of the NeuralDocumentProcessor.
This function can be adapted for Hugging Face Spaces or other frontends.
"""
st.title("NeuroResearch 2.1: Robust Research System")
# Initialize Document Processor
processor = NeuralDocumentProcessor()
# Sidebar for uploading and processing documents
with st.sidebar:
st.header("Document Ingestion")
        uploaded_files = st.file_uploader(
            "Upload one or more plain-text files",
            type=["txt", "md"],  # PDFs are not parsed here; only UTF-8 text is supported
            accept_multiple_files=True
        )
collection_name = st.text_input("Collection Name", value="default_collection")
use_concurrency = st.checkbox("Use Concurrency for Processing?", value=False)
if st.button("Process Documents"):
if uploaded_files and collection_name.strip():
# Read files
docs_content = []
for uf in uploaded_files:
content = uf.read()
# Assume UTF-8; adapt as needed
try:
docs_content.append(content.decode("utf-8"))
except UnicodeDecodeError:
st.error(f"Could not decode {uf.name}. Make sure it's UTF-8 text.")
st.write("Processing documents...")
vectorstore = processor.process_documents(
documents=docs_content,
collection=collection_name,
use_concurrency=use_concurrency
)
                if vectorstore is not None:
                    st.success(f"Documents processed and stored in collection: {collection_name}")
                else:
                    st.error("Processing failed or returned no vectorstore.")
            else:
                st.warning("Please upload at least one file and provide a collection name.")
# Main interface for querying
st.subheader("Query Documents")
user_query = st.text_input("Enter your query:")
return_scores = st.checkbox("Return Scores?")
if st.button("Search"):
if not user_query.strip() or not collection_name.strip():
st.warning("Please provide both a query and a valid collection name.")
else:
st.write(f"Retrieving from collection: {collection_name}")
results = processor.hybrid_retrieval(
query=user_query,
collection=collection_name,
return_scores=return_scores
)
if results:
st.write("Top Reranked Results:")
if return_scores:
# Each result is (doc, score)
for idx, (doc, score) in enumerate(results, start=1):
st.markdown(f"**Result {idx} | Score: {score:.4f}**")
st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
else:
# Just doc texts
for idx, doc in enumerate(results, start=1):
st.markdown(f"**Result {idx}**")
st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
else:
st.warning("No results found or collection may be empty.")
# ------------------------------
# Main Entry Point
# ------------------------------
if __name__ == "__main__":
NeuroInterface()
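# ------------------------------
# Programmatic Usage (reference)
# ------------------------------
# A minimal sketch of driving the processor without the Streamlit UI. It
# assumes OPENAI_API_KEY is set in the environment and uses a hypothetical
# collection name "demo_papers":
#
#   processor = NeuralDocumentProcessor()
#   processor.process_documents(
#       documents=["First paper text ...", "Second paper text ..."],
#       collection="demo_papers",
#   )
#   for doc, score in processor.hybrid_retrieval(
#       query="transformer attention mechanisms",
#       collection="demo_papers",
#       return_scores=True,
#   ):
#       print(f"{score:.3f}  {doc[:80]}")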