# ------------------------------
# NeuroResearch 2.1: Robust Research System
# ------------------------------
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
from typing import List, Optional, Tuple, Union
import chromadb
import os
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed
import streamlit as st
# ------------------------------
# Configuration
# ------------------------------
class NeuroConfig:
"""
Configuration class for NeuroResearch system.
Attributes:
DEEPSEEK_API_KEY (str): Optional API key for external services.
CHROMA_PATH (str): File path for Chroma's persistent storage.
CHUNK_SIZE (int): Maximum length of text chunks for splitting.
CHUNK_OVERLAP (int): Overlap between text chunks to preserve context.
MAX_CONCURRENT_REQUESTS (int): Number of concurrent threads for processing.
EMBEDDING_DIMENSIONS (int): Dimensionality of embeddings.
HYBRID_RERANK_TOP_K (int): Number of documents to retrieve and rerank.
ANALYSIS_MODES (dict): Possible analysis modes and their descriptions.
CACHE_TTL (int): Time-to-live (seconds) for cached items.
"""
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
CHROMA_PATH = "neuro_db"
CHUNK_SIZE = 512
CHUNK_OVERLAP = 64
MAX_CONCURRENT_REQUESTS = 7
EMBEDDING_DIMENSIONS = 3072
HYBRID_RERANK_TOP_K = 15
ANALYSIS_MODES = {
"technical": "Deep Technical Analysis",
"comparative": "Cross-Paper Comparison",
"temporal": "Temporal Trend Analysis",
"critical": "Critical Literature Review"
}
CACHE_TTL = 3600 # 1 hour
# ------------------------------
# Document Processor
# ------------------------------
class NeuralDocumentProcessor:
"""
A document processing and retrieval utility class.
Responsibilities:
- Splitting documents into manageable chunks.
- Storing and retrieving embeddings with Chroma.
- Performing hybrid retrieval (vector + BM25) with cross-encoder reranking.
- Handling concurrency during document ingestion (optional).
"""
def __init__(self) -> None:
"""
Initialize the NeuralDocumentProcessor with a persistent Chroma client,
OpenAI-based embeddings, a CrossEncoder for reranking, and a text splitter.
"""
# Persistent Chroma client
try:
self.client = chromadb.PersistentClient(path=NeuroConfig.CHROMA_PATH)
except Exception as e:
# Fallback to in-memory client if persistent fails
print(f"Error initializing Chroma PersistentClient: {e}")
self.client = chromadb.Client()
# Embeddings (OpenAI-based)
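        # text-embedding-3-large natively produces 3072-dimensional vectors; the
        # dimensions argument requests that size explicitly and would truncate
        # the embedding if a smaller value were configured.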
self.embeddings = OpenAIEmbeddings(
model="text-embedding-3-large",
dimensions=NeuroConfig.EMBEDDING_DIMENSIONS
)
# Cross-encoder for reranking
self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
# Text splitter
self.text_splitter = RecursiveCharacterTextSplitter(
chunk_size=NeuroConfig.CHUNK_SIZE,
chunk_overlap=NeuroConfig.CHUNK_OVERLAP,
            separators=["\n\n", "\n", "(?<=\\. )", " "],
            is_separator_regex=True,  # needed so the sentence-boundary lookbehind is treated as a regex
        )
def process_documents(
self,
documents: List[str],
collection: str,
use_concurrency: bool = False
) -> Optional[Chroma]:
"""
Process a list of document strings by splitting, embedding, and storing them in Chroma.
Optionally uses concurrency for splitting documents.
Args:
documents (List[str]): The list of raw document texts.
collection (str): The Chroma collection name to store these documents in.
use_concurrency (bool, optional): If True, process documents concurrently. Defaults to False.
Returns:
Optional[Chroma]: The Chroma vectorstore for the specified collection, or None if no docs.
"""
if not documents:
print("No documents provided to process_documents.")
return None
# Split documents into chunks
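        # With use_concurrency=True each document is split in its own worker
        # thread (bounded by MAX_CONCURRENT_REQUESTS). Splitting is pure-Python
        # work, so the speedup is modest, but a failure in one document does
        # not abort the others.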
if use_concurrency and len(documents) > 1:
chunks = []
with ThreadPoolExecutor(max_workers=NeuroConfig.MAX_CONCURRENT_REQUESTS) as executor:
future_to_doc = {
executor.submit(self.text_splitter.create_documents, [doc]): doc
for doc in documents
}
for future in as_completed(future_to_doc):
try:
result = future.result()
chunks.extend(result)
except Exception as e:
print(f"Error splitting document: {e}")
else:
# Single-threaded splitting
chunks = []
for doc in documents:
chunks.extend(self.text_splitter.create_documents([doc]))
# Build unique IDs for each chunk
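        # IDs are deterministic content hashes (see _quantum_id), so identical
        # chunks always map to the same ID across runs.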
chunk_ids = [self._quantum_id(doc.page_content) for doc in chunks]
# Create Chroma from documents
try:
vectorstore = Chroma.from_documents(
documents=chunks,
embedding=self.embeddings,
client=self.client,
collection_name=collection,
ids=chunk_ids
)
return vectorstore
except Exception as e:
print(f"Error creating Chroma collection: {e}")
return None
def hybrid_retrieval(
self,
query: str,
collection: str,
return_scores: bool = False
) -> Union[List[str], List[Tuple[str, float]]]:
"""
Perform hybrid retrieval combining vector-based search with BM25,
then re-rank the combined results using a cross-encoder.
Args:
query (str): The user query for retrieving documents.
collection (str): The name of the Chroma collection to search.
return_scores (bool): If True, return a list of (document, score) tuples.
Otherwise, return a list of document strings only.
Returns:
Union[List[str], List[Tuple[str, float]]]: The top-k reranked results,
either as strings or (string, score) pairs.
"""
# Try to load the existing collection
try:
vector_store = Chroma(
client=self.client,
collection_name=collection,
embedding_function=self.embeddings
)
        except Exception as e:
            print(f"Error loading Chroma collection '{collection}': {e}")
            return []
        # Check if the collection is empty
        stored_docs = vector_store.get()
        if not stored_docs or not stored_docs.get("documents"):
            print(f"No documents found in collection '{collection}'.")
            return []
        # Chroma's get() returns plain strings, not Document objects
        all_docs = stored_docs["documents"]
# Vector-based retrieval
try:
vector_retriever = vector_store.as_retriever(
search_kwargs={"k": NeuroConfig.HYBRID_RERANK_TOP_K}
)
vector_results = [doc.page_content for doc in vector_retriever.invoke(query)]
except Exception as e:
print(f"Error during vector retrieval: {e}")
vector_results = []
# BM25 retrieval
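        # BM25 scores whitespace-tokenized chunks by term frequency and inverse
        # document frequency, giving an exact-keyword signal that complements
        # the semantic vector search above.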
tokenized_docs = [doc.split() for doc in all_docs]
bm25 = BM25Okapi(tokenized_docs)
bm25_results = bm25.get_top_n(
query.split(),
all_docs,
n=NeuroConfig.HYBRID_RERANK_TOP_K
)
# Combine results and remove duplicates
combined = list(set(vector_results + bm25_results))
if not combined:
print("No documents retrieved by either BM25 or vector search.")
            return []
# Cross-encoder reranking
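        # The cross-encoder reads each (query, document) pair jointly, which is
        # slower than the retrieval steps above but yields a more reliable
        # relevance ordering for the final top-k results.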
scores = self.cross_encoder.predict([(query, doc) for doc in combined])
reranked = sorted(zip(combined, scores), key=lambda x: x[1], reverse=True)
top_results = reranked[:NeuroConfig.HYBRID_RERANK_TOP_K]
# Return based on user preference
if return_scores:
return top_results # List[Tuple[str, float]]
else:
return [doc for doc, _ in top_results]
def _quantum_id(self, content: str) -> str:
"""
Create a unique ID for each text chunk by hashing its content.
Args:
content (str): The text content of the chunk.
Returns:
str: A unique hash-based identifier.
"""
return f"neuro_{hashlib.sha3_256(content.encode()).hexdigest()[:24]}"
# ------------------------------
# NeuroInterface (Streamlit Example)
# ------------------------------
def NeuroInterface() -> None:
"""
A basic Streamlit-based interface to demonstrate usage of the NeuralDocumentProcessor.
This function can be adapted for Hugging Face Spaces or other frontends.
"""
st.title("NeuroResearch 2.1: Robust Research System")
# Initialize Document Processor
processor = NeuralDocumentProcessor()
# Sidebar for uploading and processing documents
with st.sidebar:
st.header("Document Ingestion")
        uploaded_files = st.file_uploader(
            "Upload one or more plain-text files",
            type=["txt", "md"],  # PDFs are not parsed here; only UTF-8 text is supported
            accept_multiple_files=True
        )
collection_name = st.text_input("Collection Name", value="default_collection")
use_concurrency = st.checkbox("Use Concurrency for Processing?", value=False)
if st.button("Process Documents"):
if uploaded_files and collection_name.strip():
# Read files
docs_content = []
for uf in uploaded_files:
content = uf.read()
# Assume UTF-8; adapt as needed
try:
docs_content.append(content.decode("utf-8"))
except UnicodeDecodeError:
st.error(f"Could not decode {uf.name}. Make sure it's UTF-8 text.")
st.write("Processing documents...")
vectorstore = processor.process_documents(
documents=docs_content,
collection=collection_name,
use_concurrency=use_concurrency
)
                if vectorstore is not None:
                    st.success(f"Documents processed and stored in collection: {collection_name}")
                else:
                    st.error("Processing failed or returned no vectorstore.")
            else:
                st.warning("Please upload at least one file and provide a collection name.")
# Main interface for querying
st.subheader("Query Documents")
user_query = st.text_input("Enter your query:")
return_scores = st.checkbox("Return Scores?")
if st.button("Search"):
if not user_query.strip() or not collection_name.strip():
st.warning("Please provide both a query and a valid collection name.")
else:
st.write(f"Retrieving from collection: {collection_name}")
results = processor.hybrid_retrieval(
query=user_query,
collection=collection_name,
return_scores=return_scores
)
if results:
st.write("Top Reranked Results:")
if return_scores:
# Each result is (doc, score)
for idx, (doc, score) in enumerate(results, start=1):
st.markdown(f"**Result {idx} | Score: {score:.4f}**")
st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
else:
# Just doc texts
for idx, doc in enumerate(results, start=1):
st.markdown(f"**Result {idx}**")
st.write(doc[:500] + ("..." if len(doc) > 500 else ""))
else:
st.warning("No results found or collection may be empty.")
# ------------------------------
# Main Entry Point
# ------------------------------
if __name__ == "__main__":
NeuroInterface()
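# ------------------------------
# Programmatic Usage (reference)
# ------------------------------
# A minimal sketch of driving the processor without the Streamlit UI. It
# assumes OPENAI_API_KEY is set in the environment and uses a hypothetical
# collection name "demo_papers":
#
#   processor = NeuralDocumentProcessor()
#   processor.process_documents(
#       documents=["First paper text ...", "Second paper text ..."],
#       collection="demo_papers",
#   )
#   for doc, score in processor.hybrid_retrieval(
#       query="transformer attention mechanisms",
#       collection="demo_papers",
#       return_scores=True,
#   ):
#       print(f"{score:.3f}  {doc[:80]}")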