# ------------------------------
# NeuroResearch 2.1: Robust Research System
# ------------------------------
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
from typing import List, Optional, Tuple, Union

import chromadb
import hashlib
import os

from concurrent.futures import ThreadPoolExecutor, as_completed

import streamlit as st

# ------------------------------
# Configuration
# ------------------------------
class NeuroConfig:
    """
    Configuration class for NeuroResearch system.

    Attributes:
        DEEPSEEK_API_KEY (str): Optional API key for external services.
        CHROMA_PATH (str): File path for Chroma's persistent storage.
        CHUNK_SIZE (int): Maximum length of text chunks for splitting.
        CHUNK_OVERLAP (int): Overlap between text chunks to preserve context.
        MAX_CONCURRENT_REQUESTS (int): Number of concurrent threads for processing.
        EMBEDDING_DIMENSIONS (int): Dimensionality of embeddings.
        HYBRID_RERANK_TOP_K (int): Number of documents to retrieve and rerank.
        ANALYSIS_MODES (dict): Possible analysis modes and their descriptions.
        CACHE_TTL (int): Time-to-live (seconds) for cached items.
    """
    DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
    CHROMA_PATH = "neuro_db"
    CHUNK_SIZE = 512
    CHUNK_OVERLAP = 64
    MAX_CONCURRENT_REQUESTS = 7
    EMBEDDING_DIMENSIONS = 3072
    HYBRID_RERANK_TOP_K = 15
    ANALYSIS_MODES = {
        "technical": "Deep Technical Analysis",
        "comparative": "Cross-Paper Comparison",
        "temporal": "Temporal Trend Analysis",
        "critical": "Critical Literature Review"
    }
    CACHE_TTL = 3600  # 1 hour
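
# ------------------------------
# Example: chunk settings in action (illustrative sketch)
# ------------------------------
# A minimal, hypothetical helper (not part of the original system) showing
# how CHUNK_SIZE and CHUNK_OVERLAP drive the splitter. The overlap repeats
# up to CHUNK_OVERLAP characters of trailing context at the start of the
# next chunk so sentences cut at a boundary stay interpretable.
def _demo_chunking(sample_text: str) -> None:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=NeuroConfig.CHUNK_SIZE,
        chunk_overlap=NeuroConfig.CHUNK_OVERLAP,
    )
    for i, chunk in enumerate(splitter.split_text(sample_text)):
        print(f"chunk {i}: {len(chunk)} chars")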

# ------------------------------
# Document Processor
# ------------------------------
class NeuralDocumentProcessor:
    """
    A document processing and retrieval utility class.

    Responsibilities:
      - Splitting documents into manageable chunks.
      - Storing and retrieving embeddings with Chroma.
      - Performing hybrid retrieval (vector + BM25) with cross-encoder reranking.
      - Handling concurrency during document ingestion (optional).
    """
    def __init__(self) -> None:
        """
        Initialize the NeuralDocumentProcessor with a persistent Chroma client,
        OpenAI-based embeddings, a CrossEncoder for reranking, and a text splitter.
        """
        # Persistent Chroma client
        try:
            self.client = chromadb.PersistentClient(path=NeuroConfig.CHROMA_PATH)
        except Exception as e:
            # Fallback to in-memory client if persistent fails
            print(f"Error initializing Chroma PersistentClient: {e}")
            self.client = chromadb.Client()

        # Embeddings (OpenAI-based)
        self.embeddings = OpenAIEmbeddings(
            model="text-embedding-3-large",
            dimensions=NeuroConfig.EMBEDDING_DIMENSIONS
        )

        # Cross-encoder for reranking
        self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

        # Text splitter; the sentence-boundary lookbehind only works when
        # separators are treated as regular expressions
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=NeuroConfig.CHUNK_SIZE,
            chunk_overlap=NeuroConfig.CHUNK_OVERLAP,
            separators=["\n\n", "\n", r"(?<=\. )", " "],
            is_separator_regex=True,
        )

    def process_documents(
        self, 
        documents: List[str], 
        collection: str, 
        use_concurrency: bool = False
    ) -> Optional[Chroma]:
        """
        Process a list of document strings by splitting, embedding, and storing them in Chroma.
        Optionally uses concurrency for splitting documents.

        Args:
            documents (List[str]): The list of raw document texts.
            collection (str): The Chroma collection name to store these documents in.
            use_concurrency (bool, optional): If True, process documents concurrently. Defaults to False.

        Returns:
            Optional[Chroma]: The Chroma vectorstore for the specified collection,
            or None if no documents were provided or storage failed.
        """
        if not documents:
            print("No documents provided to process_documents.")
            return None

        # Split documents into chunks; the thread pool mainly helps when many
        # documents arrive at once, since the splitting itself is CPU-bound
        if use_concurrency and len(documents) > 1:
            chunks = []
            with ThreadPoolExecutor(max_workers=NeuroConfig.MAX_CONCURRENT_REQUESTS) as executor:
                future_to_doc = {
                    executor.submit(self.text_splitter.create_documents, [doc]): doc 
                    for doc in documents
                }
                for future in as_completed(future_to_doc):
                    try:
                        result = future.result()
                        chunks.extend(result)
                    except Exception as e:
                        print(f"Error splitting document: {e}")
        else:
            # Single-threaded splitting
            chunks = []
            for doc in documents:
                chunks.extend(self.text_splitter.create_documents([doc]))

        # Build content-hash IDs and drop duplicate chunks up front, since
        # Chroma rejects duplicate IDs within a single add call
        unique_chunks = {self._quantum_id(doc.page_content): doc for doc in chunks}
        chunk_ids = list(unique_chunks.keys())
        chunks = list(unique_chunks.values())

        # Create Chroma from documents
        try:
            vectorstore = Chroma.from_documents(
                documents=chunks,
                embedding=self.embeddings,
                client=self.client,
                collection_name=collection,
                ids=chunk_ids
            )
            return vectorstore
        except Exception as e:
            print(f"Error creating Chroma collection: {e}")
            return None

    def hybrid_retrieval(
        self, 
        query: str, 
        collection: str, 
        return_scores: bool = False
    ) -> Union[List[str], List[Tuple[str, float]]]:
        """
        Perform hybrid retrieval combining vector-based search with BM25,
        then re-rank the combined results using a cross-encoder.

        Args:
            query (str): The user query for retrieving documents.
            collection (str): The name of the Chroma collection to search.
            return_scores (bool): If True, return a list of (document, score) tuples.
                                  Otherwise, return a list of document strings only.

        Returns:
            Union[List[str], List[Tuple[str, float]]]: The top-k reranked results, 
            either as strings or (string, score) pairs.
        """
        # Try to load the existing collection
        try:
            vector_store = Chroma(
                client=self.client,
                collection_name=collection,
                embedding_function=self.embeddings
            )
        except Exception as e:
            print(f"Error loading Chroma collection '{collection}': {e}")
            return []

        # Check if the collection is empty. Note: Chroma's get() returns raw
        # strings under the "documents" key, not Document objects.
        stored_docs = vector_store.get()
        all_docs = (stored_docs or {}).get("documents") or []
        if not all_docs:
            print(f"No documents found in collection '{collection}'.")
            return []

        # Vector-based retrieval
        try:
            vector_retriever = vector_store.as_retriever(
                search_kwargs={"k": NeuroConfig.HYBRID_RERANK_TOP_K}
            )
            vector_results = [doc.page_content for doc in vector_retriever.invoke(query)]
        except Exception as e:
            print(f"Error during vector retrieval: {e}")
            vector_results = []

        # BM25 lexical retrieval over a naive whitespace tokenization
        tokenized_docs = [doc.split() for doc in all_docs]
        bm25 = BM25Okapi(tokenized_docs)
        bm25_results = bm25.get_top_n(
            query.split(), 
            all_docs, 
            n=NeuroConfig.HYBRID_RERANK_TOP_K
        )

        # Combine the two candidate pools and drop duplicates; ordering is
        # irrelevant here because the cross-encoder reranks everything below
        combined = list(set(vector_results + bm25_results))

        if not combined:
            print("No documents retrieved by either BM25 or vector search.")
            return []

        # Cross-encoder reranking
        scores = self.cross_encoder.predict([(query, doc) for doc in combined])
        reranked = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
        top_results = reranked[:NeuroConfig.HYBRID_RERANK_TOP_K]

        # Return based on user preference
        if return_scores:
            return top_results  # List[Tuple[str, float]]
        else:
            return [doc for doc, _ in top_results]

    def _quantum_id(self, content: str) -> str:
        """
        Create a unique ID for each text chunk by hashing its content.

        Args:
            content (str): The text content of the chunk.

        Returns:
            str: A unique hash-based identifier.
        """
        return f"neuro_{hashlib.sha3_256(content.encode()).hexdigest()[:24]}"

# ------------------------------
# NeuroInterface (Streamlit Example)
# ------------------------------
def NeuroInterface() -> None:
    """
    A basic Streamlit-based interface to demonstrate usage of the NeuralDocumentProcessor.
    This function can be adapted for Hugging Face Spaces or other frontends.
    """
    st.title("NeuroResearch 2.1: Robust Research System")

    # Initialize the document processor once and cache it across Streamlit
    # reruns; otherwise the cross-encoder model and Chroma client would be
    # reloaded on every widget interaction.
    @st.cache_resource
    def _get_processor() -> NeuralDocumentProcessor:
        return NeuralDocumentProcessor()

    processor = _get_processor()

    # Sidebar for uploading and processing documents
    with st.sidebar:
        st.header("Document Ingestion")
        uploaded_files = st.file_uploader(
            "Upload one or more text files",
            type=["txt", "md"],  # PDFs would need text extraction before UTF-8 decoding
            accept_multiple_files=True
        )
        collection_name = st.text_input("Collection Name", value="default_collection")

        use_concurrency = st.checkbox("Use Concurrency for Processing?", value=False)

        if st.button("Process Documents"):
            if uploaded_files and collection_name.strip():
                # Read files
                docs_content = []
                for uf in uploaded_files:
                    content = uf.read()
                    # Assume UTF-8; adapt as needed
                    try:
                        docs_content.append(content.decode("utf-8"))
                    except UnicodeDecodeError:
                        st.error(f"Could not decode {uf.name}. Make sure it's UTF-8 text.")
                st.write("Processing documents...")
                vectorstore = processor.process_documents(
                    documents=docs_content,
                    collection=collection_name,
                    use_concurrency=use_concurrency
                )
                if vectorstore:
                    st.success(f"Documents processed and stored in collection: {collection_name}")
                else:
                    st.error("Processing failed or returned no vectorstore.")

    # Main interface for querying
    st.subheader("Query Documents")
    user_query = st.text_input("Enter your query:")
    return_scores = st.checkbox("Return Scores?")

    if st.button("Search"):
        if not user_query.strip() or not collection_name.strip():
            st.warning("Please provide both a query and a valid collection name.")
        else:
            st.write(f"Retrieving from collection: {collection_name}")
            results = processor.hybrid_retrieval(
                query=user_query,
                collection=collection_name,
                return_scores=return_scores
            )
            if results:
                st.write("Top Reranked Results:")
                if return_scores:
                    # Each result is (doc, score)
                    for idx, (doc, score) in enumerate(results, start=1):
                        st.markdown(f"**Result {idx} | Score: {score:.4f}**")
                        st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
                else:
                    # Just doc texts
                    for idx, doc in enumerate(results, start=1):
                        st.markdown(f"**Result {idx}**")
                        st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
            else:
                st.warning("No results found or collection may be empty.")

# ------------------------------
# Main Entry Point
# ------------------------------
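# Launch with:  streamlit run <path_to_this_file>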
if __name__ == "__main__":
    NeuroInterface()