mgbam committed on
Commit
09a0b53
·
verified ·
1 Parent(s): 6952be5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +325 -346
app.py CHANGED
@@ -1,39 +1,28 @@
1
- # -----------------------------------------------------
2
- # Imports & Initial Configuration
3
- # -----------------------------------------------------
4
- import streamlit as st
5
-
6
- # IMPORTANT: Must be the first Streamlit command
7
- st.set_page_config(page_title="NeuroResearch AI", layout="wide", initial_sidebar_state="expanded")
8
-
9
  from langchain_openai import OpenAIEmbeddings
10
  from langchain_community.vectorstores import Chroma
11
- from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
12
  from langchain.text_splitter import RecursiveCharacterTextSplitter
13
  from langgraph.graph import END, StateGraph
14
- from langgraph.prebuilt import ToolNode
15
- from langgraph.graph.message import add_messages
16
  from typing_extensions import TypedDict, Annotated
17
  from typing import Sequence, Dict, List, Optional, Any
18
  import chromadb
19
  import os
 
20
  import requests
21
  import hashlib
 
22
  import time
23
  from concurrent.futures import ThreadPoolExecutor, as_completed
24
  from datetime import datetime
 
 
25
 
26
- # -----------------------------------------------------
27
- # State Schema Definition
28
- # -----------------------------------------------------
29
- class AgentState(TypedDict):
30
- messages: Annotated[Sequence[AIMessage | HumanMessage | ToolMessage], add_messages]
31
- context: Dict[str, Any]
32
- metadata: Dict[str, Any]
33
-
34
- # -----------------------------------------------------
35
- # Configuration
36
- # -----------------------------------------------------
37
  class ResearchConfig:
38
  DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
39
  CHROMA_PATH = "chroma_db"
@@ -41,139 +30,171 @@ class ResearchConfig:
41
  CHUNK_OVERLAP = 64
42
  MAX_CONCURRENT_REQUESTS = 5
43
  EMBEDDING_DIMENSIONS = 1536
44
- DOCUMENT_MAP = {
45
- "Research Report: Results of a New AI Model Improving Image Recognition Accuracy to 98%":
46
- "CV-Transformer Hybrid Architecture",
47
- "Academic Paper Summary: Why Transformers Became the Mainstream Architecture in Natural Language Processing":
48
- "Transformer Architecture Analysis",
49
- "Latest Trends in Machine Learning Methods Using Quantum Computing":
50
- "Quantum ML Frontiers"
51
- }
52
- ANALYSIS_TEMPLATE = """Analyze these technical documents with scientific rigor:
53
  {context}
54
 
55
  Respond with:
56
- 1. Key Technical Contributions (bullet points)
57
- 2. Novel Methodologies
58
- 3. Empirical Results (with metrics)
59
- 4. Potential Applications
60
- 5. Limitations & Future Directions
61
-
62
- Format: Markdown with LaTeX mathematical notation where applicable
63
- """
64
-
65
- # Validate API key configuration
66
- if not ResearchConfig.DEEPSEEK_API_KEY:
67
- st.error("""**Research Portal Configuration Required**
68
- 1. Obtain DeepSeek API key: [platform.deepseek.com](https://platform.deepseek.com/)
69
- 2. Configure secret: `DEEPSEEK_API_KEY` in Space settings
70
- 3. Rebuild deployment""")
71
- st.stop()
72
-
73
- # -----------------------------------------------------
74
- # Quantum Document Processing
75
- # -----------------------------------------------------
76
- class QuantumDocumentManager:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  def __init__(self):
78
  self.client = chromadb.PersistentClient(path=ResearchConfig.CHROMA_PATH)
79
  self.embeddings = OpenAIEmbeddings(
80
  model="text-embedding-3-large",
81
  dimensions=ResearchConfig.EMBEDDING_DIMENSIONS
82
  )
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- def create_collection(self, documents: List[str], collection_name: str) -> Chroma:
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  splitter = RecursiveCharacterTextSplitter(
86
  chunk_size=ResearchConfig.CHUNK_SIZE,
87
  chunk_overlap=ResearchConfig.CHUNK_OVERLAP,
88
- separators=["\n\n", "\n", "|||"]
89
  )
90
- docs = splitter.create_documents(documents)
91
- # Debug lines about chunk creation removed
92
- return Chroma.from_documents(
93
- documents=docs,
94
- embedding=self.embeddings,
95
- client=self.client,
96
- collection_name=collection_name,
97
- ids=[self._document_id(doc.page_content) for doc in docs]
98
- )
99
-
100
- def _document_id(self, content: str) -> str:
101
- """Create a unique ID for each document chunk."""
102
- return f"{hashlib.sha256(content.encode()).hexdigest()[:16]}-{int(time.time())}"
103
-
104
- # Initialize document collections
105
- qdm = QuantumDocumentManager()
106
- research_docs = qdm.create_collection([
107
- "Research Report: Results of a New AI Model Improving Image Recognition Accuracy to 98%",
108
- "Academic Paper Summary: Why Transformers Became the Mainstream Architecture in Natural Language Processing",
109
- "Latest Trends in Machine Learning Methods Using Quantum Computing"
110
- ], "research")
111
-
112
- development_docs = qdm.create_collection([
113
- "Project A: UI Design Completed, API Integration in Progress",
114
- "Project B: Testing New Feature X, Bug Fixes Needed",
115
- "Product Y: In the Performance Optimization Stage Before Release"
116
- ], "development")
117
-
118
- # -----------------------------------------------------
119
- # Advanced Retrieval System
120
- # -----------------------------------------------------
121
  class ResearchRetriever:
122
  def __init__(self):
123
- self.retrievers = {
124
- "research": research_docs.as_retriever(
125
- search_type="mmr",
126
- search_kwargs={
127
- 'k': 4,
128
- 'fetch_k': 20,
129
- 'lambda_mult': 0.85
130
- }
131
- ),
132
- "development": development_docs.as_retriever(
133
- search_type="similarity",
134
- search_kwargs={'k': 3}
135
- )
136
- }
137
 
138
- def retrieve(self, query: str, domain: str) -> List[Any]:
139
- """Retrieve documents from the specified domain."""
140
  try:
141
- return self.retrievers[domain].invoke(query)
142
- except KeyError:
143
- st.error(f"[ERROR] Retrieval domain '{domain}' not found.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  return []
145
 
146
- retriever = ResearchRetriever()
147
-
148
- # -----------------------------------------------------
149
- # Cognitive Processing Unit
150
- # -----------------------------------------------------
151
- class CognitiveProcessor:
152
  def __init__(self):
153
  self.executor = ThreadPoolExecutor(max_workers=ResearchConfig.MAX_CONCURRENT_REQUESTS)
154
- self.session_id = hashlib.sha256(datetime.now().isoformat().encode()).hexdigest()[:12]
155
-
156
- def process_query(self, prompt: str) -> Dict:
157
- """Send the prompt to the DeepSeek API using triple redundancy for robustness."""
158
- futures = []
159
- for _ in range(3):
160
- futures.append(self.executor.submit(self._execute_api_request, prompt))
161
-
162
- results = []
163
- for future in as_completed(futures):
164
- try:
165
- results.append(future.result())
166
- except Exception as e:
167
- st.error(f"Processing Error: {str(e)}")
168
-
169
- return self._consensus_check(results)
170
-
171
- def _execute_api_request(self, prompt: str) -> Dict:
172
- """Make a single request to the DeepSeek API."""
173
  headers = {
174
  "Authorization": f"Bearer {ResearchConfig.DEEPSEEK_API_KEY}",
175
- "Content-Type": "application/json",
176
- "X-Research-Session": self.session_id
177
  }
178
 
179
  try:
@@ -182,293 +203,251 @@ class CognitiveProcessor:
182
  headers=headers,
183
  json={
184
  "model": "deepseek-chat",
185
- "messages": [{
186
- "role": "user",
187
- "content": f"Respond as Senior AI Researcher:\n{prompt}"
188
- }],
189
  "temperature": 0.7,
190
- "max_tokens": 1500,
191
- "top_p": 0.9
192
  },
193
- timeout=45
194
  )
195
  response.raise_for_status()
196
  return response.json()
197
- except requests.exceptions.RequestException as e:
198
- return {"error": str(e)}
199
-
200
- def _consensus_check(self, results: List[Dict]) -> Dict:
201
- """Pick the best result by comparing content length among successful responses."""
202
  valid = [r for r in results if "error" not in r]
203
  if not valid:
204
- return {"error": "All API requests failed"}
205
- return max(valid, key=lambda x: len(x.get('choices', [{}])[0].get('message', {}).get('content', '')))
 
 
 
206
 
207
- # -----------------------------------------------------
208
- # Research Workflow Engine
209
- # -----------------------------------------------------
210
  class ResearchWorkflow:
211
  def __init__(self):
212
- self.processor = CognitiveProcessor()
213
- self.workflow = StateGraph(AgentState)
214
- self._build_workflow()
215
-
216
- def _build_workflow(self):
217
- # Register nodes in the state graph
218
- self.workflow.add_node("ingest", self.ingest_query)
219
- self.workflow.add_node("retrieve", self.retrieve_documents)
220
- self.workflow.add_node("analyze", self.analyze_content)
221
- self.workflow.add_node("validate", self.validate_output)
222
- self.workflow.add_node("refine", self.refine_results)
223
-
224
- # Define workflow transitions
225
  self.workflow.set_entry_point("ingest")
226
  self.workflow.add_edge("ingest", "retrieve")
227
  self.workflow.add_edge("retrieve", "analyze")
228
  self.workflow.add_conditional_edges(
229
  "analyze",
230
- self._quality_check,
231
  {"valid": "validate", "invalid": "refine"}
232
  )
233
  self.workflow.add_edge("validate", END)
234
  self.workflow.add_edge("refine", "retrieve")
235
 
236
- # Compile the final state machine
237
- self.app = self.workflow.compile()
238
-
239
- def ingest_query(self, state: AgentState) -> Dict:
240
- """Extract the user query and store it in the state."""
241
  try:
242
- query = state["messages"][-1].content
 
243
  return {
244
- "messages": [AIMessage(content="Query ingested successfully")],
245
- "context": {"raw_query": query},
246
- "metadata": {"timestamp": datetime.now().isoformat()}
 
 
 
 
 
 
 
247
  }
248
  except Exception as e:
249
- return self._error_state(f"Ingestion Error: {str(e)}")
250
 
251
- def retrieve_documents(self, state: AgentState) -> Dict:
252
- """Retrieve relevant documents from the 'research' domain."""
253
  try:
254
- # Fallback check for 'raw_query'
255
- if "raw_query" not in state["context"]:
256
- return self._error_state("No 'raw_query' found in context. Make sure the ingest step has run.")
257
-
258
- query = state["context"]["raw_query"]
259
- docs = retriever.retrieve(query, "research")
260
  return {
261
  "messages": [AIMessage(content=f"Retrieved {len(docs)} documents")],
262
  "context": {
 
263
  "documents": docs,
264
  "retrieval_time": time.time()
265
- }
 
266
  }
267
  except Exception as e:
268
- return self._error_state(f"Retrieval Error: {str(e)}")
269
 
270
- def analyze_content(self, state: AgentState) -> Dict:
271
- """Concatenate document contents and analyze them using the CognitiveProcessor."""
 
 
 
272
  try:
273
- if "documents" not in state["context"] or not state["context"]["documents"]:
274
- return self._error_state("No documents retrieved; please check your query or retrieval process.")
 
 
 
 
275
 
276
- docs = "\n\n".join([
277
- d.page_content for d in state["context"]["documents"]
278
- if hasattr(d, "page_content") and d.page_content
279
- ])
280
- prompt = ResearchConfig.ANALYSIS_TEMPLATE.format(context=docs)
281
- response = self.processor.process_query(prompt)
282
 
283
- if "error" in response:
284
- return self._error_state(response["error"])
285
 
286
  return {
287
- "messages": [AIMessage(content=response['choices'][0]['message']['content'])],
288
- "context": {"analysis": response}
 
289
  }
290
  except Exception as e:
291
- return self._error_state(f"Analysis Error: {str(e)}")
292
-
293
- def validate_output(self, state: AgentState) -> Dict:
294
- """Validate the technical correctness of the analysis output."""
295
- analysis = state["messages"][-1].content
296
- validation_prompt = f"""Validate research analysis:
297
- {analysis}
298
-
299
- Check for:
300
- 1. Technical accuracy
301
- 2. Citation support
302
- 3. Logical consistency
303
- 4. Methodological soundness
304
-
305
- Respond with 'VALID' or 'INVALID'"""
306
-
307
- response = self.processor.process_query(validation_prompt)
308
- return {
309
- "messages": [AIMessage(content=analysis + f"\n\nValidation: {response.get('choices', [{}])[0].get('message', {}).get('content', '')}")]
310
- }
311
 
312
- def refine_results(self, state: AgentState) -> Dict:
313
- """Refine the analysis based on the validation feedback."""
314
- refinement_prompt = f"""Refine this analysis:
315
- {state["messages"][-1].content}
316
 
317
- Improve:
318
- 1. Technical precision
319
- 2. Empirical grounding
320
- 3. Theoretical coherence"""
321
-
322
- response = self.processor.process_query(refinement_prompt)
323
- return {
324
- "messages": [AIMessage(content=response.get('choices', [{}])[0].get('message', {}).get('content', ''))],
325
- "context": state["context"]
326
- }
327
 
328
- def _quality_check(self, state: AgentState) -> str:
329
- """Check if the validation step indicates a 'VALID' or 'INVALID' output."""
330
- content = state["messages"][-1].content
331
- return "valid" if "VALID" in content else "invalid"
332
 
333
- def _error_state(self, message: str) -> Dict:
334
- """Return an error message and mark the state as erroneous."""
335
- st.error(f"[ERROR] {message}")
336
  return {
337
- "messages": [AIMessage(content=f"❌ {message}")],
338
- "context": {"error": True},
339
- "metadata": {"status": "error"}
 
 
 
340
  }
341
 
342
- # -----------------------------------------------------
343
- # Research Interface
344
- # -----------------------------------------------------
345
  class ResearchInterface:
346
  def __init__(self):
347
- self.workflow = ResearchWorkflow()
348
- # Page config already set at the top.
349
- self._inject_styles()
 
 
 
 
 
 
 
350
  self._build_sidebar()
351
- self._build_main_interface()
352
 
353
- def _inject_styles(self):
354
- """Inject custom CSS for a sleek interface."""
355
  st.markdown("""
356
  <style>
357
- :root {
358
- --primary: #2ecc71;
359
- --secondary: #3498db;
360
- --background: #0a0a0a;
361
- --text: #ecf0f1;
362
- }
363
-
364
  .stApp {
365
- background: var(--background);
366
- color: var(--text);
367
- font-family: 'Roboto', sans-serif;
368
  }
369
-
370
  .stTextArea textarea {
371
- background: #1a1a1a !important;
372
- color: var(--text) !important;
373
- border: 2px solid var(--secondary);
374
- border-radius: 8px;
375
- padding: 1rem;
376
  }
377
-
378
  .stButton>button {
379
- background: linear-gradient(135deg, var(--primary), var(--secondary));
380
- border: none;
381
- border-radius: 8px;
382
- padding: 1rem 2rem;
383
- transition: all 0.3s;
384
- }
385
-
386
- .stButton>button:hover {
387
- transform: translateY(-2px);
388
- box-shadow: 0 4px 12px rgba(46, 204, 113, 0.3);
389
  }
390
-
391
- .stExpander {
392
- background: #1a1a1a;
393
- border: 1px solid #2a2a2a;
394
- border-radius: 8px;
395
  margin: 1rem 0;
396
  }
397
  </style>
398
  """, unsafe_allow_html=True)
399
 
400
  def _build_sidebar(self):
401
- """Construct the left sidebar with document info and metrics."""
402
  with st.sidebar:
403
- st.title("πŸ” Research Database")
404
- st.subheader("Technical Papers")
405
- for title, short in ResearchConfig.DOCUMENT_MAP.items():
406
- with st.expander(short):
407
- st.markdown(f"```\n{title}\n```")
408
-
409
- st.subheader("Analysis Metrics")
410
- st.metric("Vector Collections", 2)
411
- st.metric("Embedding Dimensions", ResearchConfig.EMBEDDING_DIMENSIONS)
412
-
413
- def _build_main_interface(self):
414
- """Construct the main interface for query input and result display."""
415
- st.title("🧠 NeuroResearch AI")
416
- query = st.text_area("Research Query:", height=200,
417
- placeholder="Enter technical research question...")
418
 
419
- if st.button("Execute Analysis", type="primary"):
420
- self._execute_analysis(query)
421
 
422
- def _execute_analysis(self, query: str):
423
- """Execute the entire research workflow and render the results."""
424
  try:
425
- with st.spinner("Initializing Quantum Analysis..."):
426
- results = self.workflow.app.stream(
427
- {"messages": [HumanMessage(content=query)], "context": {}, "metadata": {}}
428
- )
429
- for event in results:
430
- self._render_event(event)
431
- st.success("βœ… Analysis Completed Successfully")
 
 
 
 
 
 
 
 
 
 
432
  except Exception as e:
433
  st.error(f"""**Analysis Failed**
434
- {str(e)}
435
- Potential issues:
436
- - Complex query structure
437
- - Document correlation failure
438
- - Temporal processing constraints""")
439
-
440
- def _render_event(self, event: Dict):
441
- """Render each node's output in the UI as it streams through the workflow."""
442
- if 'ingest' in event:
443
- with st.container():
444
- st.success("βœ… Query Ingested")
445
- elif 'retrieve' in event:
446
- with st.container():
447
- docs = event['retrieve']['context']['documents']
448
- st.info(f"πŸ“š Retrieved {len(docs)} documents")
449
- with st.expander("View Retrieved Documents", expanded=False):
450
- for i, doc in enumerate(docs, 1):
451
- st.markdown(f"**Document {i}**")
452
- st.code(doc.page_content, language='text')
453
- elif 'analyze' in event:
454
- with st.container():
455
- content = event['analyze']['messages'][0].content
456
- with st.expander("Technical Analysis Report", expanded=True):
457
- st.markdown(content)
458
- elif 'validate' in event:
459
- with st.container():
460
- content = event['validate']['messages'][0].content
461
- if "VALID" in content:
462
- st.success("βœ… Validation Passed")
463
- with st.expander("View Validated Analysis", expanded=True):
464
- st.markdown(content.split("Validation:")[0])
465
  else:
466
- st.warning("⚠️ Validation Issues Detected")
467
- with st.expander("View Validation Details", expanded=True):
468
- st.markdown(content)
 
 
 
 
 
 
 
 
 
469
 
470
- # -----------------------------------------------------
471
- # Main Execution
472
- # -----------------------------------------------------
473
  if __name__ == "__main__":
474
- ResearchInterface()
 
1
+ # ------------------------------
2
+ # Imports & Dependencies
3
+ # ------------------------------
 
 
 
 
 
4
import hashlib
import json
import os
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from typing import Sequence, Dict, List, Optional, Any

import chromadb
import requests
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.messages import HumanMessage, AIMessage, BaseMessage
from langchain_openai import OpenAIEmbeddings
from langgraph.graph import END, StateGraph
from langgraph.graph.message import add_messages
from pydantic import BaseModel, ValidationError
from typing_extensions import TypedDict, Annotated
22
 
23
+ # ------------------------------
24
+ # Configuration & Constants
25
+ # ------------------------------
 
 
 
 
 
 
 
 
26
  class ResearchConfig:
27
  DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
28
  CHROMA_PATH = "chroma_db"
 
30
  CHUNK_OVERLAP = 64
31
  MAX_CONCURRENT_REQUESTS = 5
32
  EMBEDDING_DIMENSIONS = 1536
33
+ ANALYSIS_TEMPLATE = """**Technical Analysis Request**
 
 
 
 
 
 
 
 
34
  {context}
35
 
36
  Respond with:
37
+ 1. Key Technical Innovations (markdown table)
38
+ 2. Methodological Breakdown (bullet points)
39
+ 3. Quantitative Results (LaTeX equations)
40
+ 4. Critical Evaluation
41
+ 5. Research Impact Assessment
42
+
43
+ Include proper academic citations where applicable."""
44
+
45
+ # ------------------------------
46
+ # Document Schema & Content
47
+ # ------------------------------
48
+ DOCUMENT_CONTENT = {
49
+ "CV-Transformer Hybrid": {
50
+ "content": """## Hybrid Architecture for Computer Vision
51
+ **Authors**: DeepVision Research Team
52
+ **Abstract**: Novel combination of convolutional layers with transformer attention mechanisms.
53
+
54
+ ### Key Innovations:
55
+ - Cross-attention feature fusion
56
+ - Adaptive spatial pooling
57
+ - Multi-scale gradient propagation
58
+
59
+ $$\\mathcal{L}_{total} = \\alpha\\mathcal{L}_{CE} + \\beta\\mathcal{L}_{SSIM}$$""",
60
+ "metadata": {
61
+ "year": 2024,
62
+ "domain": "computer_vision",
63
+ "citations": 142
64
+ }
65
+ },
66
+ "Quantum ML Advances": {
67
+ "content": """## Quantum Machine Learning Breakthroughs
68
+ **Authors**: Quantum AI Lab
69
+
70
+ ### Achievements:
71
+ - Quantum-enhanced SGD (40% faster convergence)
72
+ - 5-qubit QNN achieving 98% accuracy
73
+ - Hybrid quantum-classical GANs
74
+
75
+ $$\\mathcal{H} = -\\sum_{i<j} J_{ij}\\sigma_i^z\\sigma_j^z - \\Gamma\\sum_i\\sigma_i^x$$""",
76
+ "metadata": {
77
+ "year": 2023,
78
+ "domain": "quantum_ml",
79
+ "citations": 89
80
+ }
81
+ }
82
+ }
83
+
84
class DocumentSchema(BaseModel):
    """Validated record for one document (or one retrieved chunk).

    Instances are built both when seeding the vector store and when
    wrapping retriever results, so all three fields are required.
    """

    # Text body of the document or chunk.
    content: str
    # Free-form descriptive fields (e.g. year, domain, citations).
    metadata: dict
    # Short identifier; callers derive it from a SHA-256 of the title.
    doc_id: str
88
+
89
+ # ------------------------------
90
+ # State Management
91
+ # ------------------------------
92
class ResearchState(TypedDict):
    """State dictionary handed from node to node by the research workflow.

    NOTE: ``add_messages`` comes from ``langgraph.graph.message``; the
    module must import it, otherwise this class body raises ``NameError``
    as soon as the file is loaded.
    """

    # Conversation history; ``add_messages`` is attached as the reducer so
    # node outputs are merged into the list rather than replacing it.
    messages: Annotated[List[BaseMessage], add_messages]
    # Working data for the current run (query, documents, errors, ...).
    context: Annotated[Dict[str, Any], "research_context"]
    # Run-level bookkeeping such as session id and timestamp.
    metadata: Annotated[Dict[str, str], "system_metadata"]
96
+
97
+ # ------------------------------
98
+ # Document Processing
99
+ # ------------------------------
100
class DocumentManager:
    """Create and hold the Chroma collections backing document retrieval."""

    def __init__(self):
        """Open the persistent Chroma client and configure the embedder."""
        self.client = chromadb.PersistentClient(path=ResearchConfig.CHROMA_PATH)
        self.embeddings = OpenAIEmbeddings(
            model="text-embedding-3-large",
            dimensions=ResearchConfig.EMBEDDING_DIMENSIONS,
        )
        # Defined up front so that callers can test for ``None`` when
        # ``initialize_collections`` fails instead of hitting AttributeError.
        self.research_col: Optional[Chroma] = None
        self.dev_col: Optional[Chroma] = None

    def initialize_collections(self):
        """Build the "research" and "development" collections.

        Failures are reported through ``st.error`` and the traceback is
        printed; the affected attributes then stay ``None``.
        """
        try:
            self.research_col = self._create_collection("research")
            self.dev_col = self._create_collection("development")
        except Exception as e:
            st.error(f"Collection initialization failed: {str(e)}")
            traceback.print_exc()

    def _create_collection(self, name: str) -> Chroma:
        """Validate, chunk and embed ``DOCUMENT_CONTENT`` into collection *name*.

        Args:
            name: Chroma collection name.

        Returns:
            The populated ``Chroma`` vector store.

        Raises:
            RuntimeError: If chunking or the vector-store call fails; the
                original exception is chained as the cause.
        """
        documents, metadatas = [], []

        for title, data in DOCUMENT_CONTENT.items():
            try:
                doc = DocumentSchema(
                    content=data["content"],
                    metadata=data["metadata"],
                    doc_id=hashlib.sha256(title.encode()).hexdigest()[:16],
                )
            except (KeyError, ValidationError) as e:
                st.error(f"Invalid document format: {title} - {str(e)}")
                continue
            documents.append(doc.content)
            # Carry the id in the metadata so every chunk (and every
            # retrieval result) can be traced back to its source document.
            metadatas.append({**doc.metadata, "doc_id": doc.doc_id})

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=ResearchConfig.CHUNK_SIZE,
            chunk_overlap=ResearchConfig.CHUNK_OVERLAP,
            # The last separator is a bullet character (U+2022); it is
            # written as an escape because the literal was mis-encoded.
            separators=["\n## ", "\n### ", "\n\n", "\n\N{BULLET} "],
        )

        try:
            chunks = splitter.create_documents(documents, metadatas=metadatas)
            return Chroma.from_documents(
                chunks,
                self.embeddings,
                client=self.client,
                collection_name=name,
                ids=self._chunk_ids(chunks),
            )
        except Exception as e:
            raise RuntimeError(f"Failed creating {name} collection: {str(e)}") from e

    @staticmethod
    def _chunk_ids(chunks) -> List[str]:
        """Return one stable id per chunk.

        The vector store needs exactly one id per stored chunk, while a
        single source document may be split into several chunks.  The first
        chunk of a document keeps the plain document id (so single-chunk
        documents keep the ids used before); later chunks get ``-<n>``
        appended.
        """
        seen: Dict[str, int] = {}
        ids = []
        for chunk in chunks:
            base = chunk.metadata["doc_id"]
            index = seen.get(base, 0)
            seen[base] = index + 1
            ids.append(base if index == 0 else f"{base}-{index}")
        return ids
150
+
151
+ # ------------------------------
152
+ # Retrieval System
153
+ # ------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
class ResearchRetriever:
    """Look up relevant document chunks in the Chroma collections."""

    def __init__(self):
        """Create the document manager and build its collections."""
        self.dm = DocumentManager()
        self.dm.initialize_collections()

    def retrieve(self, query: str, domain: str) -> List[DocumentSchema]:
        """Return up to four chunks relevant to *query*.

        Args:
            query: Free-text search query.
            domain: ``"research"`` selects the research collection; any
                other value selects the development collection.

        Returns:
            Matching chunks wrapped as ``DocumentSchema``.  An empty list is
            returned for a blank query, when the collection is unavailable,
            or when retrieval fails (the failure is shown via ``st.error``).
        """
        # Nothing useful can be embedded from an empty query.
        if not query or not query.strip():
            return []

        try:
            attribute = "research_col" if domain == "research" else "dev_col"
            # getattr: the attribute may be missing when collection
            # initialization failed earlier.
            collection = getattr(self.dm, attribute, None)
            if collection is None:
                return []

            results = collection.as_retriever(
                search_type="mmr",
                search_kwargs={'k': 4, 'fetch_k': 20},
            ).invoke(query)

            return [
                DocumentSchema(
                    content=doc.page_content,
                    metadata=doc.metadata,
                    doc_id=doc.metadata.get("doc_id", ""),
                )
                for doc in results
                if doc.page_content
            ]

        except Exception as e:
            st.error(f"Retrieval failure: {str(e)}")
            traceback.print_exc()
            return []
180
 
181
+ # ------------------------------
182
+ # Analysis Processor
183
+ # ------------------------------
184
+ class AnalysisEngine:
 
 
185
def __init__(self):
    """Set up the worker pool and a short per-engine session hash."""
    pool_size = ResearchConfig.MAX_CONCURRENT_REQUESTS
    self.executor = ThreadPoolExecutor(max_workers=pool_size)
    # First 12 hex digits of the SHA-256 of the current timestamp.
    stamp = str(time.time()).encode()
    self.session_hash = hashlib.sha256(stamp).hexdigest()[:12]
188
+
189
def analyze(self, prompt: str, attempts: int = 3) -> Dict:
    """Send *prompt* several times in parallel and return the best answer.

    Args:
        prompt: Text forwarded to ``self._api_request``.
        attempts: Number of parallel requests (default 3, the previously
            hard-coded value); values below 1 are treated as 1.

    Returns:
        Whatever ``self._validate_results`` selects from the collected
        responses.
    """
    attempts = max(1, attempts)
    futures = [
        self.executor.submit(self._api_request, prompt)
        for _ in range(attempts)
    ]

    results = []
    for future in as_completed(futures):
        try:
            results.append(future.result())
        except Exception as e:
            # One failed worker must not discard the other responses;
            # record it in the same error shape the request helper uses.
            results.append({"error": str(e), "status_code": 500})

    return self._validate_results(results)
192
+
193
+ def _api_request(self, prompt: str) -> Dict:
 
 
 
 
 
 
 
 
 
 
 
 
194
  headers = {
195
  "Authorization": f"Bearer {ResearchConfig.DEEPSEEK_API_KEY}",
196
+ "X-Session-ID": self.session_hash,
197
+ "Content-Type": "application/json"
198
  }
199
 
200
  try:
 
203
  headers=headers,
204
  json={
205
  "model": "deepseek-chat",
206
+ "messages": [{"role": "user", "content": prompt}],
 
 
 
207
  "temperature": 0.7,
208
+ "max_tokens": 2000
 
209
  },
210
+ timeout=30
211
  )
212
  response.raise_for_status()
213
  return response.json()
214
+ except Exception as e:
215
+ return {"error": str(e), "status_code": 500}
216
+
217
+ def _validate_results(self, results: List[Dict]) -> Dict:
 
218
  valid = [r for r in results if "error" not in r]
219
  if not valid:
220
+ return {"error": "All analysis attempts failed", "results": results}
221
+
222
+ # Corrected line with proper parenthesis closure
223
+ best = max(valid, key=lambda x: len(x.get('choices', [{}])[0].get('message', {}).get('content', '')))
224
+ return best
225
 
226
+ # ------------------------------
227
+ # Workflow Implementation
228
+ # ------------------------------
229
class ResearchWorkflow:
    """LangGraph pipeline: ingest -> retrieve -> analyze -> validate/refine.

    The uncompiled graph is exposed as ``self.workflow``; callers compile it
    themselves.
    """

    # Upper bound on refine -> retrieve -> analyze cycles.  The refine step
    # does not alter the query, so without a bound a report that misses the
    # quality keywords would cycle until LangGraph's recursion limit aborts
    # the run.
    MAX_REFINEMENTS = 2

    def __init__(self):
        """Create the retriever, the analysis engine and the state graph."""
        self.retriever = ResearchRetriever()
        self.engine = AnalysisEngine()
        self.workflow = StateGraph(ResearchState)
        self._build_graph()

    def _build_graph(self):
        """Register the nodes and wire the transitions between them."""
        graph = self.workflow
        graph.add_node("ingest", self._ingest)
        graph.add_node("retrieve", self._retrieve)
        graph.add_node("analyze", self._analyze)
        graph.add_node("validate", self._validate)
        graph.add_node("refine", self._refine)

        graph.set_entry_point("ingest")
        graph.add_edge("ingest", "retrieve")
        graph.add_edge("retrieve", "analyze")
        graph.add_conditional_edges(
            "analyze",
            self._quality_gate,
            # "stop" ends the run when errors were recorded or the
            # refinement budget is used up.
            {"valid": "validate", "invalid": "refine", "stop": END},
        )
        graph.add_edge("validate", END)
        graph.add_edge("refine", "retrieve")

    def _ingest(self, state: ResearchState) -> ResearchState:
        """Extract the latest human message as the query and reset context."""
        try:
            query = next(
                (msg.content for msg in reversed(state["messages"])
                 if isinstance(msg, HumanMessage)),
                None,
            )
            if query is None:
                raise ValueError("no user message found")
            return {
                "messages": [AIMessage(content="Query ingested")],
                "context": {
                    "query": query,
                    "documents": [],
                    "errors": [],
                },
                "metadata": {
                    "session_id": hashlib.sha256(str(time.time()).encode()).hexdigest()[:8],
                    "timestamp": datetime.now().isoformat(),
                },
            }
        except Exception as e:
            return self._handle_error(f"Ingest failed: {str(e)}", state)

    def _retrieve(self, state: ResearchState) -> ResearchState:
        """Fetch documents for the stored query from the research domain."""
        try:
            docs = self.retriever.retrieve(state["context"]["query"], "research")
            return {
                "messages": [AIMessage(content=f"Retrieved {len(docs)} documents")],
                "context": {
                    **state["context"],
                    "documents": docs,
                    "retrieval_time": time.time(),
                },
                "metadata": state["metadata"],
            }
        except Exception as e:
            return self._handle_error(f"Retrieval error: {str(e)}", state)

    def _analyze(self, state: ResearchState) -> ResearchState:
        """Run the analysis prompt over the retrieved documents.

        Records an error when no documents are available, when the engine
        reports an error, or when the answer is shorter than 200 characters
        or contains no letters.
        """
        docs = state["context"].get("documents", [])
        if not docs:
            return self._handle_error("No documents for analysis", state)

        try:
            context = "\n\n".join(d.content for d in docs)
            prompt = ResearchConfig.ANALYSIS_TEMPLATE.format(context=context)
            result = self.engine.analyze(prompt)

            if "error" in result:
                raise RuntimeError(result["error"])

            content = result['choices'][0]['message']['content']

            if len(content) < 200 or not any(c.isalpha() for c in content):
                raise ValueError("Insufficient analysis content")

            return {
                "messages": [AIMessage(content=content)],
                "context": state["context"],
                "metadata": state["metadata"],
            }
        except Exception as e:
            return self._handle_error(f"Analysis failed: {str(e)}", state)

    def _validate(self, state: ResearchState) -> ResearchState:
        """Pass the state through unchanged (placeholder validation step)."""
        return state

    def _refine(self, state: ResearchState) -> ResearchState:
        """Count one refinement pass; the rest of the state is unchanged."""
        context = state["context"]
        return {
            **state,
            "context": {
                **context,
                "refinements": context.get("refinements", 0) + 1,
            },
        }

    def _quality_gate(self, state: ResearchState) -> str:
        """Choose the route taken after the analyze node.

        Returns:
            ``"stop"`` when errors were recorded or ``MAX_REFINEMENTS``
            passes were already made, ``"valid"`` when the last message
            contains every required keyword, otherwise ``"invalid"``.
        """
        context = state.get("context") or {}
        # Retrying cannot repair a recorded failure: refine changes nothing
        # that retrieve or analyze depend on.
        if context.get("errors"):
            return "stop"

        content = state["messages"][-1].content if state["messages"] else ""
        required = ["Innovations", "Results", "Evaluation"]
        if all(kw in content for kw in required):
            return "valid"

        if context.get("refinements", 0) >= self.MAX_REFINEMENTS:
            return "stop"
        return "invalid"

    def _handle_error(self, message: str, state: ResearchState) -> ResearchState:
        """Return a state update that records *message* as an error.

        Tolerates a state whose ``context`` or ``metadata`` is missing or
        has no ``errors`` list yet.
        """
        context = state.get("context") or {}
        return {
            # U+1F6A8 (rotating light); written as an escape because the
            # literal emoji was mis-encoded.
            "messages": [AIMessage(content=f"\N{POLICE CARS REVOLVING LIGHT} Error: {message}")],
            "context": {
                **context,
                "errors": list(context.get("errors") or []) + [message],
            },
            "metadata": state.get("metadata") or {},
        }
334
 
335
+ # ------------------------------
336
+ # User Interface
337
+ # ------------------------------
338
class ResearchInterface:
    """Streamlit front end for the research workflow."""

    def __init__(self):
        """Configure the page, compile the workflow and render the UI."""
        # Page configuration has to be the first Streamlit command, and
        # building the workflow can already emit st.error messages, so the
        # page is configured before anything else is constructed.
        st.set_page_config(
            page_title="Research Assistant",
            layout="wide",
            initial_sidebar_state="expanded",
        )
        self.workflow = ResearchWorkflow().workflow.compile()
        self._setup_interface()

    def _setup_interface(self):
        """Render styles, sidebar and main area, in that order."""
        self._apply_styles()
        self._build_sidebar()
        self._build_main()

    def _apply_styles(self):
        """Inject the custom CSS theme."""
        st.markdown("""
        <style>
        .stApp {
            background: #0a192f;
            color: #64ffda;
        }
        .stTextArea textarea {
            background: #172a45 !important;
            color: #a8b2d1 !important;
        }
        .stButton>button {
            background: #233554;
            border: 1px solid #64ffda;
        }
        .error-box {
            border: 1px solid #ff4444;
            border-radius: 5px;
            padding: 1rem;
            margin: 1rem 0;
        }
        </style>
        """, unsafe_allow_html=True)

    def _build_sidebar(self):
        """List every entry of DOCUMENT_CONTENT with a 300-character preview."""
        with st.sidebar:
            # Emoji are written as escapes because the literals were
            # mis-encoded.
            st.title("\N{LEFT-POINTING MAGNIFYING GLASS} Document Database")
            for title, data in DOCUMENT_CONTENT.items():
                with st.expander(title[:25] + "..."):
                    st.markdown(f"```\n{data['content'][:300]}...\n```")

    def _build_main(self):
        """Render the query box and the button that starts the analysis."""
        st.title("\N{BRAIN} Research Analysis System")
        query = st.text_area("Enter your research query:", height=150)

        if st.button("Start Analysis", type="primary"):
            self._run_analysis(query)

    def _run_analysis(self, query: str):
        """Run the workflow once for *query*, showing progress and results."""
        if not query or not query.strip():
            st.warning("Please enter a research query before starting the analysis.")
            return

        try:
            with st.spinner("\N{LEFT-POINTING MAGNIFYING GLASS} Analyzing documents..."):
                state = {
                    "messages": [HumanMessage(content=query)],
                    "context": {
                        "query": "",
                        "documents": [],
                        "errors": [],
                    },
                    "metadata": {},
                }

                # Fold the streamed node outputs into the final state rather
                # than calling invoke() afterwards, which would execute the
                # whole workflow (and all its API requests) a second time.
                final_state = dict(state)
                for event in self.workflow.stream(state):
                    self._display_progress(event)
                    for update in event.values():
                        if isinstance(update, dict):
                            final_state.update(update)

                self._show_results(final_state)

        except Exception as e:
            st.error(f"""**Analysis Failed**
{str(e)}
Common solutions:
- Simplify your query
- Check document database status
- Verify API connectivity""")

    def _display_progress(self, event):
        """Show stage name, document count and status for one stream event.

        Args:
            event: Mapping of node name to that node's output, as yielded
                by the compiled workflow's ``stream``.
        """
        stage_name, current_state = next(iter(event.items()))
        context = (current_state or {}).get("context") or {}

        with st.container():
            st.markdown("---")
            cols = st.columns([1, 2, 1])

            with cols[0]:
                st.subheader("Processing Stage")
                st.code(stage_name.title())

            with cols[1]:
                st.subheader("Documents")
                st.metric("Retrieved", len(context.get("documents", [])))

            with cols[2]:
                st.subheader("Status")
                if context.get("errors"):
                    st.error("Errors detected")
                else:
                    st.success("Normal operation")

    def _show_results(self, state: ResearchState):
        """Render either the collected errors or the final report."""
        errors = state["context"].get("errors")
        if errors:
            st.error("Analysis completed with errors")
            with st.expander("Error Details"):
                for error in errors:
                    st.markdown(f"- {error}")
        else:
            st.success("Analysis completed successfully \N{WHITE HEAVY CHECK MARK}")
            with st.expander("Full Report"):
                st.markdown(state["messages"][-1].content)
451
 
 
 
 
452
  if __name__ == "__main__":
453
+ ResearchInterface()