# ------------------------------
# Imports & Dependencies (Enhanced)
# ------------------------------
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langgraph.graph import END, StateGraph
from langgraph.prebuilt import ToolNode
from langgraph.graph.message import add_messages
from typing_extensions import TypedDict, Annotated
from typing import Sequence, List, Dict, Any
import chromadb
import re
import os
import ast  # safe parsing of retrieved-document strings (replaces eval below)
import streamlit as st
import requests
import time
import hashlib
from langchain.tools.retriever import create_retriever_tool
from datetime import datetime
# ------------------------------
# Enhanced Configuration
# ------------------------------
class AppConfig:
    def __init__(self):
        self.DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
        self.CHROMA_PATH = "chroma_db"
        self.MAX_RETRIES = 3
        self.RETRY_DELAY = 1.5
        self.DOCUMENT_CHUNK_SIZE = 300  # Increased from 100
        self.DOCUMENT_OVERLAP = 50      # Added overlap for context preservation
        self.SEARCH_K = 5               # Number of documents to retrieve
        self.SEARCH_TYPE = "mmr"        # Maximal Marginal Relevance
        self.validate_config()

    def validate_config(self):
        if not self.DEEPSEEK_API_KEY:
            st.error("""
            **Critical Configuration Missing**

            🔑 DeepSeek API key not found in environment variables.
            Please configure through Hugging Face Space secrets:
            1. Go to Space Settings → Repository secrets
            2. Add secret: Name=DEEPSEEK_API_KEY, Value=your_api_key
            3. Rebuild Space
            """)
            st.stop()

config = AppConfig()
# ------------------------------
# Enhanced ChromaDB Setup
# ------------------------------
class ChromaManager:
    def __init__(self):
        os.makedirs(config.CHROMA_PATH, exist_ok=True)
        self.client = chromadb.PersistentClient(path=config.CHROMA_PATH)
        self.embeddings = OpenAIEmbeddings(
            model="text-embedding-3-large",
            # dimensions=1024  # Optional for large-scale deployments
        )

    def create_collection(self, documents: List[str], collection_name: str) -> Chroma:
        """Enhanced document processing with optimized chunking"""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.DOCUMENT_CHUNK_SIZE,
            chunk_overlap=config.DOCUMENT_OVERLAP,
            separators=["\n\n", "\n", "。", " "]
        )
        docs = text_splitter.create_documents(documents)
        return Chroma.from_documents(
            documents=docs,
            embedding=self.embeddings,
            client=self.client,
            collection_name=collection_name
        )
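
# ------------------------------
# Sample Knowledge Bases (placeholder)
# ------------------------------
# Assumption: the original Space defines `research_texts` and
# `development_texts` elsewhere; the stand-ins below are purely illustrative
# so this excerpt runs end-to-end.
research_texts = [
    "Research Report: New AI model improves image recognition accuracy to 98%",
    "Quantum computing experiments demonstrate promising optimization results",
    "Memory-augmented transformer architectures advance language modeling",
]
development_texts = [
    "Project A: UI redesign 75% complete, on track for next release",
    "Project B: API integration and database migration in progress",
]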
# Initialize Chroma with improved parameters
chroma_manager = ChromaManager()
research_collection = chroma_manager.create_collection(research_texts, "research_collection")
dev_collection = chroma_manager.create_collection(development_texts, "development_collection")

# ------------------------------
# Enhanced Retriever Configuration
# ------------------------------
research_retriever = research_collection.as_retriever(
    search_type=config.SEARCH_TYPE,
    search_kwargs={"k": config.SEARCH_K, "fetch_k": config.SEARCH_K * 2}
)
development_retriever = dev_collection.as_retriever(
    search_type=config.SEARCH_TYPE,
    search_kwargs={"k": config.SEARCH_K, "fetch_k": config.SEARCH_K * 2}
)
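
# Quick sanity check (illustrative): retrievers are LangChain Runnables, so a
# plain .invoke() call returns the top-k matching Documents, e.g.:
#   hits = research_retriever.invoke("quantum machine learning breakthroughs")
#   print([h.page_content for h in hits])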
# ------------------------------
# Enhanced Document Processing
# ------------------------------
class DocumentProcessor:
    @staticmethod
    def deduplicate_documents(docs: List[Any]) -> List[Any]:
        """Advanced deduplication using content hashing"""
        seen = set()
        unique_docs = []
        for doc in docs:
            # Accept LangChain Documents or plain strings
            content = getattr(doc, "page_content", doc)
            content_hash = hashlib.md5(content.encode()).hexdigest()
            if content_hash not in seen:
                unique_docs.append(doc)
                seen.add(content_hash)
        return unique_docs

    @staticmethod
    def extract_key_points(docs: List[Any]) -> str:
        """Semantic analysis of retrieved documents"""
        key_points = []
        categories = {
            "quantum": ["quantum", "qpu", "qubit"],
            "vision": ["image", "recognition", "vision"],
            "nlp": ["transformer", "language", "llm"]
        }
        for doc in docs:
            content = getattr(doc, "page_content", doc).lower()
            # Categorization logic
            if any(kw in content for kw in categories["quantum"]):
                key_points.append("- Quantum computing integration showing promising results")
            if any(kw in content for kw in categories["vision"]):
                key_points.append("- Computer vision models achieving state-of-the-art accuracy")
            if any(kw in content for kw in categories["nlp"]):
                key_points.append("- NLP architectures evolving with memory-augmented transformers")
        # Deduplicate while preserving insertion order (a bare set() would scramble it)
        return "\n".join(dict.fromkeys(key_points))
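
# Example (illustrative): both helpers accept LangChain Documents or plain
# strings, so retrieval output can be piped straight through:
#   unique = DocumentProcessor.deduplicate_documents(research_retriever.invoke("qubits"))
#   print(DocumentProcessor.extract_key_points(unique))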
# ------------------------------
# Enhanced Agent Workflow (Additions)
# ------------------------------
class EnhancedAgent:
    def __init__(self):
        self.session_stats = {
            "processing_times": [],
            "doc_counts": [],
            "error_count": 0
        }

    def api_request_with_retry(self, endpoint: str, payload: Dict) -> Dict:
        """Robust API handling with exponential backoff"""
        headers = {
            "Authorization": f"Bearer {config.DEEPSEEK_API_KEY}",
            "Content-Type": "application/json"
        }
        for attempt in range(config.MAX_RETRIES):
            try:
                response = requests.post(
                    endpoint,
                    headers=headers,
                    json=payload,
                    timeout=30
                )
                response.raise_for_status()
                return response.json()
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 429:
                    # Exponential backoff: 1.5s, 2.25s, 3.38s, ...
                    delay = config.RETRY_DELAY ** (attempt + 1)
                    time.sleep(delay)
                    continue
                raise
        raise RuntimeError(f"API request failed after {config.MAX_RETRIES} attempts")
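
# ------------------------------
# Workflow Assembly (sketch)
# ------------------------------
# The UI below expects a compiled LangGraph `app` emitting 'agent' and
# 'generate' events, plus a `process_question` helper; neither appears in this
# excerpt. The wiring below is a minimal, illustrative sketch under that
# assumption: the node bodies are placeholders, not the original agent logic,
# and the DeepSeek endpoint URL is assumed from its OpenAI-compatible API.
class AgentState(TypedDict):
    messages: Annotated[Sequence[Any], add_messages]

agent_runner = EnhancedAgent()

def agent_node(state: AgentState) -> Dict:
    """Placeholder 'agent' node: retrieves documents and reports them as a
    Python literal list so the UI can parse it with ast.literal_eval."""
    query = state["messages"][-1].content
    hits = research_retriever.invoke(query)
    return {"messages": [AIMessage(content=f"Results: {[d.page_content for d in hits]}")]}

def generate_node(state: AgentState) -> Dict:
    """Placeholder 'generate' node: summarizes retrieval output via the
    DeepSeek chat-completions endpoint (URL is an assumption)."""
    payload = {
        "model": "deepseek-chat",
        "messages": [{
            "role": "user",
            "content": f"Summarize these findings:\n{state['messages'][-1].content}"
        }]
    }
    result = agent_runner.api_request_with_retry(
        "https://api.deepseek.com/v1/chat/completions", payload)
    return {"messages": [AIMessage(content=result["choices"][0]["message"]["content"])]}

workflow = StateGraph(AgentState)
workflow.add_node("agent", agent_node)
workflow.add_node("generate", generate_node)
workflow.set_entry_point("agent")
workflow.add_edge("agent", "generate")
workflow.add_edge("generate", END)
app = workflow.compile()

def process_question(query: str, graph, graph_config: Dict) -> List[Dict]:
    """Stream workflow events for a question; each event maps a node name
    ('agent' or 'generate') to its emitted state update."""
    return list(graph.stream({"messages": [HumanMessage(content=query)]}, graph_config))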
# ------------------------------
# Enhanced Streamlit UI (Dark Professional Theme)
# ------------------------------
class UITheme:
    primary_color = "#2E86C1"
    secondary_color = "#28B463"
    background_color = "#1A1A1A"
    text_color = "#EAECEE"

    @classmethod
    def apply(cls):
        st.markdown(f"""
        <style>
        .stApp {{
            background-color: {cls.background_color};
            color: {cls.text_color};
        }}
        .stTextArea textarea {{
            background-color: #2D2D2D !important;
            color: {cls.text_color} !important;
            border: 1px solid {cls.primary_color};
        }}
        .stButton > button {{
            background-color: {cls.primary_color};
            color: white;
            border: none;
            padding: 12px 28px;
            border-radius: 6px;
            transition: all 0.3s ease;
            font-weight: 500;
        }}
        .stButton > button:hover {{
            background-color: {cls.secondary_color};
            transform: translateY(-1px);
            box-shadow: 0 4px 12px rgba(0,0,0,0.2);
        }}
        .data-box {{
            background-color: #2D2D2D;
            border-left: 4px solid {cls.primary_color};
            padding: 18px;
            margin: 14px 0;
            border-radius: 8px;
            box-shadow: 0 2px 8px rgba(0,0,0,0.15);
        }}
        .st-expander {{
            background-color: #2D2D2D;
            border: 1px solid #3D3D3D;
            border-radius: 6px;
            margin: 12px 0;
        }}
        .stAlert {{
            background-color: #423a2d !important;
            border: 1px solid #E67E22 !important;
        }}
        </style>
        """, unsafe_allow_html=True)
# ------------------------------
# Enhanced Main Application
# ------------------------------
def main():
    # set_page_config must be the first Streamlit call, before any st.* output
    st.set_page_config(
        page_title="AI Research Assistant Pro",
        layout="wide",
        initial_sidebar_state="expanded",
        menu_items={
            'Get Help': 'https://example.com/docs',
            'Report a bug': 'https://example.com/issues',
            'About': "v2.1 | Enhanced Research Assistant"
        }
    )
    UITheme.apply()

    with st.sidebar:
        st.header("📚 Knowledge Bases")
        with st.expander("Research Database", expanded=True):
            for text in research_texts:
                st.markdown(f'<div class="data-box research-box">{text}</div>',
                            unsafe_allow_html=True)
        with st.expander("Development Database"):
            for text in development_texts:
                st.markdown(f'<div class="data-box dev-box">{text}</div>',
                            unsafe_allow_html=True)

    st.title("🔬 AI Research Assistant Pro")
    st.markdown("---")

    # Enhanced query input with examples
    query = st.text_area(
        "Research Query Input",
        height=120,
        placeholder="Enter your research question...\nExample: What are recent breakthroughs in quantum machine learning?",
        help="Be specific about domains (e.g., computer vision, NLP) for better results"
    )

    col1, col2 = st.columns([1, 2])
    with col1:
        if st.button("🔍 Analyze Documents", use_container_width=True):
            if not query:
                st.warning("⚠️ Please enter a research question")
                return

            with st.status("Processing Workflow...", expanded=True) as status:
                try:
                    start_time = time.time()

                    # Document Retrieval Phase
                    status.update(label="🔍 Retrieving Relevant Documents", state="running")
                    events = process_question(query, app, {"configurable": {"thread_id": "1"}})

                    # Processing Phase
                    status.update(label="📊 Analyzing Content", state="running")
                    processed_data = []
                    for event in events:
                        if 'agent' in event:
                            content = event['agent']['messages'][0].content
                            if "Results:" in content:
                                docs_str = content.split("Results: ")[1]
                                # literal_eval only parses plain Python literals,
                                # unlike eval(), which would execute arbitrary code
                                docs = ast.literal_eval(docs_str)
                                unique_docs = DocumentProcessor.deduplicate_documents(docs)
                                key_points = DocumentProcessor.extract_key_points(unique_docs)
                                processed_data.append(key_points)

                                with st.expander("📄 Retrieved Documents", expanded=False):
                                    st.info(f"Found {len(unique_docs)} unique documents")
                                    st.write(docs_str)
                        elif 'generate' in event:
                            final_answer = event['generate']['messages'][0].content
                            status.update(label="✅ Analysis Complete", state="complete")
                            st.markdown("## 📝 Research Summary")
                            st.markdown(final_answer)

                            # Performance metrics
                            proc_time = time.time() - start_time
                            st.caption(f"⏱️ Processed in {proc_time:.2f}s | {len(processed_data)} document clusters")
                except Exception as e:
                    status.update(label="❌ Processing Failed", state="error")
                    st.error(f"""
                    **Critical Error**
                    {str(e)}

                    Recommended Actions:
                    - Verify API key configuration
                    - Check service status
                    - Simplify query complexity
                    """)
                    # Log error with timestamp
                    error_log = f"{datetime.now()} | {str(e)}\n"
                    with open("error_log.txt", "a") as f:
                        f.write(error_log)

    with col2:
        st.markdown("""
        ## 📘 Usage Guide

        **1. Query Formulation**
        - Be domain-specific (e.g., "quantum NLP")
        - Include timeframes (e.g., "2023-2024 advances")

        **2. Results Interpretation**
        - Expand document sections for sources
        - Key points highlight technical breakthroughs
        - Summary shows commercial implications

        **3. Advanced Features**
        - `CTRL+Enter` for quick reruns
        - Click documents for raw context
        - Export results via screenshot
        """)

if __name__ == "__main__":
    main()
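
# To run locally (assuming this file is saved as app.py and the dependencies
# imported above are installed):
#   export DEEPSEEK_API_KEY=your_api_key
#   export OPENAI_API_KEY=your_openai_key   # required by OpenAIEmbeddings
#   streamlit run app.py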