# ------------------------------
# NeuroResearch 2.1: Robust Research System
# ------------------------------
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
from typing import List, Optional, Tuple, Union

import chromadb
import hashlib
import os

from concurrent.futures import ThreadPoolExecutor, as_completed

import streamlit as st

# ------------------------------
# Configuration
# ------------------------------
class NeuroConfig:
    """
    Configuration class for NeuroResearch system.

    Attributes:
        DEEPSEEK_API_KEY (str): Optional API key for external services.
        CHROMA_PATH (str): File path for Chroma's persistent storage.
        CHUNK_SIZE (int): Maximum length of text chunks for splitting.
        CHUNK_OVERLAP (int): Overlap between text chunks to preserve context.
        MAX_CONCURRENT_REQUESTS (int): Number of concurrent threads for processing.
        EMBEDDING_DIMENSIONS (int): Dimensionality of embeddings.
        HYBRID_RERANK_TOP_K (int): Number of documents to retrieve and rerank.
        ANALYSIS_MODES (dict): Possible analysis modes and their descriptions.
        CACHE_TTL (int): Time-to-live (seconds) for cached items.
    """
    DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
    CHROMA_PATH = "neuro_db"
    CHUNK_SIZE = 512
    CHUNK_OVERLAP = 64
    MAX_CONCURRENT_REQUESTS = 7
    EMBEDDING_DIMENSIONS = 3072
    HYBRID_RERANK_TOP_K = 15
    ANALYSIS_MODES = {
        "technical": "Deep Technical Analysis",
        "comparative": "Cross-Paper Comparison",
        "temporal": "Temporal Trend Analysis",
        "critical": "Critical Literature Review"
    }
    CACHE_TTL = 3600  # 1 hour
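
# ------------------------------
# Example: chunk settings in action (illustrative sketch)
# ------------------------------
# A minimal, hypothetical helper (not part of the original system) showing
# how CHUNK_SIZE and CHUNK_OVERLAP drive the splitter. The overlap repeats
# up to CHUNK_OVERLAP characters of trailing context at the start of the
# next chunk so sentences cut at a boundary stay interpretable.
def _demo_chunking(sample_text: str) -> None:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=NeuroConfig.CHUNK_SIZE,
        chunk_overlap=NeuroConfig.CHUNK_OVERLAP,
    )
    for i, chunk in enumerate(splitter.split_text(sample_text)):
        print(f"chunk {i}: {len(chunk)} chars")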

# ------------------------------
# Document Processor
# ------------------------------
class NeuralDocumentProcessor:
    """
    A document processing and retrieval utility class.

    Responsibilities:
      - Splitting documents into manageable chunks.
      - Storing and retrieving embeddings with Chroma.
      - Performing hybrid retrieval (vector + BM25) with cross-encoder reranking.
      - Handling concurrency during document ingestion (optional).
    """
    def __init__(self) -> None:
        """
        Initialize the NeuralDocumentProcessor with a persistent Chroma client,
        OpenAI-based embeddings, a CrossEncoder for reranking, and a text splitter.
        """
        # Persistent Chroma client
        try:
            self.client = chromadb.PersistentClient(path=NeuroConfig.CHROMA_PATH)
        except Exception as e:
            # Fallback to in-memory client if persistent fails
            print(f"Error initializing Chroma PersistentClient: {e}")
            self.client = chromadb.Client()

        # Embeddings (OpenAI-based)
        self.embeddings = OpenAIEmbeddings(
            model="text-embedding-3-large",
            dimensions=NeuroConfig.EMBEDDING_DIMENSIONS
        )

        # Cross-encoder for reranking
        self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

        # Text splitter; the sentence-boundary lookbehind only works when
        # separators are treated as regular expressions
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=NeuroConfig.CHUNK_SIZE,
            chunk_overlap=NeuroConfig.CHUNK_OVERLAP,
            separators=["\n\n", "\n", r"(?<=\. )", " "],
            is_separator_regex=True,
        )

    def process_documents(
        self, 
        documents: List[str], 
        collection: str, 
        use_concurrency: bool = False
    ) -> Optional[Chroma]:
        """
        Process a list of document strings by splitting, embedding, and storing them in Chroma.
        Optionally uses concurrency for splitting documents.

        Args:
            documents (List[str]): The list of raw document texts.
            collection (str): The Chroma collection name to store these documents in.
            use_concurrency (bool, optional): If True, process documents concurrently. Defaults to False.

        Returns:
            Optional[Chroma]: The Chroma vectorstore for the specified collection,
            or None if no documents were provided or storage failed.
        """
        if not documents:
            print("No documents provided to process_documents.")
            return None

        # Split documents into chunks; the thread pool mainly helps when many
        # documents arrive at once, since the splitting itself is CPU-bound
        if use_concurrency and len(documents) > 1:
            chunks = []
            with ThreadPoolExecutor(max_workers=NeuroConfig.MAX_CONCURRENT_REQUESTS) as executor:
                future_to_doc = {
                    executor.submit(self.text_splitter.create_documents, [doc]): doc 
                    for doc in documents
                }
                for future in as_completed(future_to_doc):
                    try:
                        result = future.result()
                        chunks.extend(result)
                    except Exception as e:
                        print(f"Error splitting document: {e}")
        else:
            # Single-threaded splitting
            chunks = []
            for doc in documents:
                chunks.extend(self.text_splitter.create_documents([doc]))

        # Build content-hash IDs and drop duplicate chunks up front, since
        # Chroma rejects duplicate IDs within a single add call
        unique_chunks = {self._quantum_id(doc.page_content): doc for doc in chunks}
        chunk_ids = list(unique_chunks.keys())
        chunks = list(unique_chunks.values())

        # Create Chroma from documents
        try:
            vectorstore = Chroma.from_documents(
                documents=chunks,
                embedding=self.embeddings,
                client=self.client,
                collection_name=collection,
                ids=chunk_ids
            )
            return vectorstore
        except Exception as e:
            print(f"Error creating Chroma collection: {e}")
            return None

    def hybrid_retrieval(
        self, 
        query: str, 
        collection: str, 
        return_scores: bool = False
    ) -> Union[List[str], List[Tuple[str, float]]]:
        """
        Perform hybrid retrieval combining vector-based search with BM25,
        then re-rank the combined results using a cross-encoder.

        Args:
            query (str): The user query for retrieving documents.
            collection (str): The name of the Chroma collection to search.
            return_scores (bool): If True, return a list of (document, score) tuples.
                                  Otherwise, return a list of document strings only.

        Returns:
            Union[List[str], List[Tuple[str, float]]]: The top-k reranked results, 
            either as strings or (string, score) pairs.
        """
        # Try to load the existing collection
        try:
            vector_store = Chroma(
                client=self.client,
                collection_name=collection,
                embedding_function=self.embeddings
            )
        except Exception as e:
            print(f"Error loading Chroma collection '{collection}': {e}")
            return []

        # Check if the collection is empty. Note: Chroma's get() returns raw
        # strings under the "documents" key, not Document objects.
        stored_docs = vector_store.get()
        all_docs = (stored_docs or {}).get("documents") or []
        if not all_docs:
            print(f"No documents found in collection '{collection}'.")
            return []

        # Vector-based retrieval
        try:
            vector_retriever = vector_store.as_retriever(
                search_kwargs={"k": NeuroConfig.HYBRID_RERANK_TOP_K}
            )
            vector_results = [doc.page_content for doc in vector_retriever.invoke(query)]
        except Exception as e:
            print(f"Error during vector retrieval: {e}")
            vector_results = []

        # BM25 lexical retrieval over a naive whitespace tokenization
        tokenized_docs = [doc.split() for doc in all_docs]
        bm25 = BM25Okapi(tokenized_docs)
        bm25_results = bm25.get_top_n(
            query.split(), 
            all_docs, 
            n=NeuroConfig.HYBRID_RERANK_TOP_K
        )

        # Combine the two candidate pools and drop duplicates; ordering is
        # irrelevant here because the cross-encoder reranks everything below
        combined = list(set(vector_results + bm25_results))

        if not combined:
            print("No documents retrieved by either BM25 or vector search.")
            return []

        # Cross-encoder reranking
        scores = self.cross_encoder.predict([(query, doc) for doc in combined])
        reranked = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
        top_results = reranked[:NeuroConfig.HYBRID_RERANK_TOP_K]

        # Return based on user preference
        if return_scores:
            return top_results  # List[Tuple[str, float]]
        else:
            return [doc for doc, _ in top_results]

    def _quantum_id(self, content: str) -> str:
        """
        Create a unique ID for each text chunk by hashing its content.

        Args:
            content (str): The text content of the chunk.

        Returns:
            str: A unique hash-based identifier.
        """
        return f"neuro_{hashlib.sha3_256(content.encode()).hexdigest()[:24]}"

# ------------------------------
# NeuroInterface (Streamlit Example)
# ------------------------------
def NeuroInterface() -> None:
    """
    A basic Streamlit-based interface to demonstrate usage of the NeuralDocumentProcessor.
    This function can be adapted for Hugging Face Spaces or other frontends.
    """
    st.title("NeuroResearch 2.1: Robust Research System")

    # Initialize the document processor once and cache it across Streamlit
    # reruns; otherwise the cross-encoder model and Chroma client would be
    # reloaded on every widget interaction.
    @st.cache_resource
    def _get_processor() -> NeuralDocumentProcessor:
        return NeuralDocumentProcessor()

    processor = _get_processor()

    # Sidebar for uploading and processing documents
    with st.sidebar:
        st.header("Document Ingestion")
        uploaded_files = st.file_uploader(
            "Upload one or more text files",
            type=["txt", "md"],  # PDFs would need text extraction before UTF-8 decoding
            accept_multiple_files=True
        )
        collection_name = st.text_input("Collection Name", value="default_collection")

        use_concurrency = st.checkbox("Use Concurrency for Processing?", value=False)

        if st.button("Process Documents"):
            if uploaded_files and collection_name.strip():
                # Read files
                docs_content = []
                for uf in uploaded_files:
                    content = uf.read()
                    # Assume UTF-8; adapt as needed
                    try:
                        docs_content.append(content.decode("utf-8"))
                    except UnicodeDecodeError:
                        st.error(f"Could not decode {uf.name}. Make sure it's UTF-8 text.")
                st.write("Processing documents...")
                vectorstore = processor.process_documents(
                    documents=docs_content,
                    collection=collection_name,
                    use_concurrency=use_concurrency
                )
                if vectorstore:
                    st.success(f"Documents processed and stored in collection: {collection_name}")
                else:
                    st.error("Processing failed or returned no vectorstore.")

    # Main interface for querying
    st.subheader("Query Documents")
    user_query = st.text_input("Enter your query:")
    return_scores = st.checkbox("Return Scores?")

    if st.button("Search"):
        if not user_query.strip() or not collection_name.strip():
            st.warning("Please provide both a query and a valid collection name.")
        else:
            st.write(f"Retrieving from collection: {collection_name}")
            results = processor.hybrid_retrieval(
                query=user_query,
                collection=collection_name,
                return_scores=return_scores
            )
            if results:
                st.write("Top Reranked Results:")
                if return_scores:
                    # Each result is (doc, score)
                    for idx, (doc, score) in enumerate(results, start=1):
                        st.markdown(f"**Result {idx} | Score: {score:.4f}**")
                        st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
                else:
                    # Just doc texts
                    for idx, doc in enumerate(results, start=1):
                        st.markdown(f"**Result {idx}**")
                        st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
            else:
                st.warning("No results found or collection may be empty.")

# ------------------------------
# Main Entry Point
# ------------------------------
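# Launch with:  streamlit run <path_to_this_file>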
if __name__ == "__main__":
    NeuroInterface()