mgbam committed on
Commit
b294f9c
·
verified ·
1 Parent(s): 9dc7678

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -54
app.py CHANGED
@@ -79,12 +79,80 @@ class ResearchConfig:
79
  "5. Limitations & Future Directions\n\n"
80
  "Format: Markdown with LaTeX mathematical notation where applicable"
81
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  DOMAIN_PROMPTS = {
83
- "Biomedical Research": "Consider clinical trial design, patient outcomes, and recent biomedical breakthroughs.",
84
- "Legal Research": "Emphasize legal precedents, case law, and nuanced statutory interpretations.",
85
- "Environmental and Energy Studies": "Highlight renewable energy technologies, efficiency metrics, and policy implications.",
86
- "Competitive Programming and Theoretical Computer Science": "Focus on algorithmic complexity, innovative proofs, and computational techniques.",
87
- "Social Sciences": "Concentrate on economic trends, sociological data, and correlations impacting public policy."
88
  }
89
  ENSEMBLE_MODELS = {
90
  "deepseek-chat": {"max_tokens": 2000, "temp": 0.7},
@@ -117,16 +185,13 @@ class QuantumDocumentManager:
117
  logger.info("Initialized PersistentClient for Chroma.")
118
  except Exception as e:
119
  logger.exception("Error initializing PersistentClient; falling back to in-memory client.")
120
- self.client = chromadb.Client() # Fallback to in-memory client
121
  self.embeddings = OpenAIEmbeddings(
122
  model="text-embedding-3-large",
123
  dimensions=ResearchConfig.EMBEDDING_DIMENSIONS
124
  )
125
 
126
  def create_collection(self, documents: List[str], collection_name: str) -> Chroma:
127
- """
128
- Splits documents into chunks and stores them as a Chroma collection.
129
- """
130
  splitter = RecursiveCharacterTextSplitter(
131
  chunk_size=ResearchConfig.CHUNK_SIZE,
132
  chunk_overlap=ResearchConfig.CHUNK_OVERLAP,
@@ -138,7 +203,6 @@ class QuantumDocumentManager:
138
  except Exception as e:
139
  logger.exception("Error during document splitting.")
140
  raise e
141
-
142
  return Chroma.from_documents(
143
  documents=docs,
144
  embedding=self.embeddings,
@@ -148,9 +212,6 @@ class QuantumDocumentManager:
148
  )
149
 
150
  def _document_id(self, content: str) -> str:
151
- """
152
- Generates a unique document ID using SHA256 and the current timestamp.
153
- """
154
  return f"{hashlib.sha256(content.encode()).hexdigest()[:16]}-{int(time.time())}"
155
 
156
  # ------------------------------
@@ -170,7 +231,7 @@ class ExtendedQuantumDocumentManager(QuantumDocumentManager):
170
  embeddings.append(emb.numpy())
171
  valid_images.append(img_path)
172
  except FileNotFoundError:
173
- logger.warning(f"Image file not found: {img_path}. Skipping this file.")
174
  except Exception as e:
175
  logger.exception(f"Error processing image {img_path}: {str(e)}")
176
  if not embeddings:
@@ -189,7 +250,6 @@ research_docs = qdm.create_collection([
189
  "Academic Paper Summary: Why Transformers Became the Mainstream Architecture in Natural Language Processing",
190
  "Latest Trends in Machine Learning Methods Using Quantum Computing"
191
  ], "research")
192
-
193
  development_docs = qdm.create_collection([
194
  "Project A: UI Design Completed, API Integration in Progress",
195
  "Project B: Testing New Feature X, Bug Fixes Needed",
@@ -219,10 +279,6 @@ class ResearchRetriever:
219
  raise e
220
 
221
  def retrieve(self, query: str, domain: str) -> List[Any]:
222
- """
223
- Retrieves documents based on the query and domain.
224
- For demonstration, always using the "research" retriever.
225
- """
226
  try:
227
  return self.research_retriever.invoke(query)
228
  except Exception as e:
@@ -243,9 +299,6 @@ class CognitiveProcessor:
243
  self.session_id = hashlib.sha256(datetime.now().isoformat().encode()).hexdigest()[:12]
244
 
245
  def process_query(self, prompt: str) -> Dict:
246
- """
247
- Processes a query by sending multiple API requests in parallel.
248
- """
249
  futures = []
250
  for _ in range(3):
251
  futures.append(self.executor.submit(self._execute_api_request, prompt))
@@ -259,9 +312,6 @@ class CognitiveProcessor:
259
  return self._consensus_check(results)
260
 
261
  def _execute_api_request(self, prompt: str) -> Dict:
262
- """
263
- Executes a single API request to the backend endpoint.
264
- """
265
  headers = {
266
  "Authorization": f"Bearer {ResearchConfig.DEEPSEEK_API_KEY}",
267
  "Content-Type": "application/json",
@@ -292,9 +342,6 @@ class CognitiveProcessor:
292
  return {"error": str(e)}
293
 
294
  def _consensus_check(self, results: List[Dict]) -> Dict:
295
- """
296
- Consolidates multiple API responses, selecting the one with the most content.
297
- """
298
  valid_results = [r for r in results if "error" not in r]
299
  if not valid_results:
300
  logger.error("All API requests failed.")
@@ -374,7 +421,7 @@ class QuantumKnowledgeGraph:
374
  self.nodes = {}
375
  self.relations = []
376
  self.node_counter = 0
377
-
378
  def create_node(self, content: Dict, node_type: str) -> int:
379
  self.node_counter += 1
380
  self.nodes[self.node_counter] = {
@@ -384,7 +431,7 @@ class QuantumKnowledgeGraph:
384
  "connections": []
385
  }
386
  return self.node_counter
387
-
388
  def create_relation(self, source: int, target: int, rel_type: str, strength: float = 1.0):
389
  self.relations.append({
390
  "source": source,
@@ -393,7 +440,7 @@ class QuantumKnowledgeGraph:
393
  "strength": strength
394
  })
395
  self.nodes[source]["connections"].append(target)
396
-
397
  def visualize_graph(self, focus_node: int = None) -> str:
398
  dot = Digraph(engine="neato")
399
  for nid, node in self.nodes.items():
@@ -404,7 +451,7 @@ class QuantumKnowledgeGraph:
404
  if focus_node:
405
  dot.node(str(focus_node), color="red", style="filled")
406
  return dot.source
407
-
408
  def _truncate_content(self, content: Dict) -> str:
409
  return json.dumps(content)[:50] + "..."
410
 
@@ -415,7 +462,7 @@ class MultiModalRetriever:
415
  self.clip_model = clip_model
416
  self.clip_processor = clip_processor
417
  self.code_retriever = create_retriever_tool([], "Code Retriever", "Retriever for code snippets")
418
-
419
  def retrieve(self, query: str, domain: str) -> Dict[str, List]:
420
  results = {
421
  "text": self._retrieve_text(query),
@@ -423,16 +470,16 @@ class MultiModalRetriever:
423
  "code": self._retrieve_code(query)
424
  }
425
  return results
426
-
427
  def _retrieve_text(self, query: str) -> List[Any]:
428
  return self.text_retriever.invoke(query)
429
-
430
  def _retrieve_images(self, query: str) -> List[str]:
431
  inputs = self.clip_processor(text=query, return_tensors="pt")
432
  with torch.no_grad():
433
  _ = self.clip_model.get_text_features(**inputs)
434
  return ["image_result_1.png", "image_result_2.png"]
435
-
436
  def _retrieve_code(self, query: str) -> List[str]:
437
  return self.code_retriever.invoke(query)
438
 
@@ -450,7 +497,6 @@ class ResearchWorkflow:
450
  self.app = self.workflow.compile()
451
 
452
  def _build_workflow(self) -> None:
453
- # Base workflow nodes
454
  self.workflow.add_node("ingest", self.ingest_query)
455
  self.workflow.add_node("retrieve", self.retrieve_documents)
456
  self.workflow.add_node("analyze", self.analyze_content)
@@ -507,13 +553,14 @@ class ResearchWorkflow:
507
 
508
  def analyze_content(self, state: AgentState) -> Dict:
509
  """
510
- Analyzes the retrieved documents. Injects a domain-specific fallback analysis for each supported domain.
 
 
511
  """
512
  try:
513
- domain = state["context"].get("domain", "Biomedical Research")
514
- query = state["context"].get("raw_query", "")
515
- fallback_analyses = {
516
- "Biomedical Research": """
517
  # Biomedical Research Analysis
518
  ## Key Contributions
519
  - Integration of clinical trial design with digital biomarkers.
@@ -526,7 +573,7 @@ class ResearchWorkflow:
526
  ## Applications
527
  - Personalized medicine, early diagnosis, treatment optimization.
528
  """,
529
- "Legal Research": """
530
  # Legal Research Analysis
531
  ## Key Contributions
532
  - Analysis of legal precedents using NLP.
@@ -539,7 +586,7 @@ class ResearchWorkflow:
539
  ## Applications
540
  - Legal analytics, risk assessment, regulatory compliance.
541
  """,
542
- "Environmental and Energy Studies": """
543
  # Environmental and Energy Studies Analysis
544
  ## Key Contributions
545
  - Novel approaches to renewable energy efficiency.
@@ -550,9 +597,9 @@ class ResearchWorkflow:
550
  ## Empirical Results
551
  - Enhanced performance in energy forecasting.
552
  ## Applications
553
- - Sustainable urban planning, energy policy formulation.
554
  """,
555
- "Competitive Programming and Theoretical Computer Science": """
556
  # Competitive Programming & Theoretical CS Analysis
557
  ## Key Contributions
558
  - Advanced approximation algorithms for NP-hard problems.
@@ -565,7 +612,7 @@ class ResearchWorkflow:
565
  ## Applications
566
  - Optimization in competitive programming and algorithm design.
567
  """,
568
- "Social Sciences": """
569
  # Social Sciences Analysis
570
  ## Key Contributions
571
  - Identification of economic trends through data analytics.
@@ -579,7 +626,6 @@ class ResearchWorkflow:
579
  - Policy design, urban studies, social impact analysis.
580
  """
581
  }
582
-
583
  if domain in fallback_analyses:
584
  logger.info(f"Using fallback analysis for domain: {domain}")
585
  return {
@@ -630,7 +676,6 @@ class ResearchWorkflow:
630
  refinement_history.append(current_analysis)
631
  difficulty_level = max(0, 3 - state["context"]["refine_count"])
632
  logger.info(f"Refinement iteration: {state['context']['refine_count']}, Difficulty level: {difficulty_level}")
633
-
634
  if state["context"]["refine_count"] >= 3:
635
  meta_prompt = (
636
  "You are given the following series of refinement outputs:\n" +
@@ -678,10 +723,6 @@ class ResearchWorkflow:
678
  }
679
 
680
  def enhance_analysis(self, state: AgentState) -> Dict:
681
- """
682
- Augments the analysis with multi-modal insights.
683
- If images or code snippets are available in the context, they are appended to the analysis.
684
- """
685
  try:
686
  analysis = state["messages"][-1].content
687
  enhanced = f"{analysis}\n\n## Multi-Modal Insights\n"
@@ -863,7 +904,6 @@ Potential issues:
863
  st.markdown(content)
864
 
865
  def _display_knowledge_graph(self) -> None:
866
- # Placeholder for knowledge graph visualization
867
  st.write("Knowledge Graph visualization is not implemented yet.")
868
 
869
  # ------------------------------
 
79
  "5. Limitations & Future Directions\n\n"
80
  "Format: Markdown with LaTeX mathematical notation where applicable"
81
  )
82
+ # Lowercase keys for fallback analyses
83
+ DOMAIN_FALLBACKS = {
84
+ "biomedical research": """
85
+ # Biomedical Research Analysis
86
+ ## Key Contributions
87
+ - Integration of clinical trial design with digital biomarkers.
88
+ - Multi-omics data used for precise patient stratification.
89
+ ## Methodologies
90
+ - Machine learning for precision medicine.
91
+ - Federated learning for multi-center trials.
92
+ ## Empirical Results
93
+ - Significant improvements in patient outcomes.
94
+ ## Applications
95
+ - Personalized medicine, early diagnosis, and treatment optimization.
96
+ """,
97
+ "legal research": """
98
+ # Legal Research Analysis
99
+ ## Key Contributions
100
+ - Analysis of legal precedents using NLP.
101
+ - Advanced case law retrieval and summarization.
102
+ ## Methodologies
103
+ - Automated legal reasoning with transformer models.
104
+ - Sentiment analysis on judicial opinions.
105
+ ## Empirical Results
106
+ - Improved accuracy in predicting case outcomes.
107
+ ## Applications
108
+ - Legal analytics, risk assessment, and regulatory compliance.
109
+ """,
110
+ "environmental and energy studies": """
111
+ # Environmental and Energy Studies Analysis
112
+ ## Key Contributions
113
+ - Novel approaches to renewable energy efficiency.
114
+ - Integration of policy analysis with technical metrics.
115
+ ## Methodologies
116
+ - Simulation models for climate impact.
117
+ - Data fusion from sensor networks and satellite imagery.
118
+ ## Empirical Results
119
+ - Enhanced performance in energy forecasting.
120
+ ## Applications
121
+ - Sustainable urban planning and energy policy formulation.
122
+ """,
123
+ "competitive programming and theoretical computer science": """
124
+ # Competitive Programming & Theoretical CS Analysis
125
+ ## Key Contributions
126
+ - Advanced approximation algorithms for NP-hard problems.
127
+ - Use of parameterized complexity and fixed-parameter tractability.
128
+ ## Methodologies
129
+ - Branch-and-bound with dynamic programming.
130
+ - Quantum-inspired algorithms for optimization.
131
+ ## Empirical Results
132
+ - Significant improvements in computational efficiency.
133
+ ## Applications
134
+ - Optimization in competitive programming and algorithm design.
135
+ """,
136
+ "social sciences": """
137
+ # Social Sciences Analysis
138
+ ## Key Contributions
139
+ - Identification of economic trends through data analytics.
140
+ - Integration of sociological data with computational models.
141
+ ## Methodologies
142
+ - Advanced statistical modeling for behavioral analysis.
143
+ - Machine learning for trend forecasting.
144
+ ## Empirical Results
145
+ - High correlation with traditional survey methods.
146
+ ## Applications
147
+ - Policy design, urban studies, and social impact analysis.
148
+ """
149
+ }
150
  DOMAIN_PROMPTS = {
151
+ "biomedical research": "Consider clinical trial design, patient outcomes, and recent biomedical breakthroughs.",
152
+ "legal research": "Emphasize legal precedents, case law, and nuanced statutory interpretations.",
153
+ "environmental and energy studies": "Highlight renewable energy technologies, efficiency metrics, and policy implications.",
154
+ "competitive programming and theoretical computer science": "Focus on algorithmic complexity, innovative proofs, and computational techniques.",
155
+ "social sciences": "Concentrate on economic trends, sociological data, and correlations impacting public policy."
156
  }
157
  ENSEMBLE_MODELS = {
158
  "deepseek-chat": {"max_tokens": 2000, "temp": 0.7},
 
185
  logger.info("Initialized PersistentClient for Chroma.")
186
  except Exception as e:
187
  logger.exception("Error initializing PersistentClient; falling back to in-memory client.")
188
+ self.client = chromadb.Client()
189
  self.embeddings = OpenAIEmbeddings(
190
  model="text-embedding-3-large",
191
  dimensions=ResearchConfig.EMBEDDING_DIMENSIONS
192
  )
193
 
194
  def create_collection(self, documents: List[str], collection_name: str) -> Chroma:
 
 
 
195
  splitter = RecursiveCharacterTextSplitter(
196
  chunk_size=ResearchConfig.CHUNK_SIZE,
197
  chunk_overlap=ResearchConfig.CHUNK_OVERLAP,
 
203
  except Exception as e:
204
  logger.exception("Error during document splitting.")
205
  raise e
 
206
  return Chroma.from_documents(
207
  documents=docs,
208
  embedding=self.embeddings,
 
212
  )
213
 
214
  def _document_id(self, content: str) -> str:
 
 
 
215
  return f"{hashlib.sha256(content.encode()).hexdigest()[:16]}-{int(time.time())}"
216
 
217
  # ------------------------------
 
231
  embeddings.append(emb.numpy())
232
  valid_images.append(img_path)
233
  except FileNotFoundError:
234
+ logger.warning(f"Image file not found: {img_path}. Skipping.")
235
  except Exception as e:
236
  logger.exception(f"Error processing image {img_path}: {str(e)}")
237
  if not embeddings:
 
250
  "Academic Paper Summary: Why Transformers Became the Mainstream Architecture in Natural Language Processing",
251
  "Latest Trends in Machine Learning Methods Using Quantum Computing"
252
  ], "research")
 
253
  development_docs = qdm.create_collection([
254
  "Project A: UI Design Completed, API Integration in Progress",
255
  "Project B: Testing New Feature X, Bug Fixes Needed",
 
279
  raise e
280
 
281
  def retrieve(self, query: str, domain: str) -> List[Any]:
 
 
 
 
282
  try:
283
  return self.research_retriever.invoke(query)
284
  except Exception as e:
 
299
  self.session_id = hashlib.sha256(datetime.now().isoformat().encode()).hexdigest()[:12]
300
 
301
  def process_query(self, prompt: str) -> Dict:
 
 
 
302
  futures = []
303
  for _ in range(3):
304
  futures.append(self.executor.submit(self._execute_api_request, prompt))
 
312
  return self._consensus_check(results)
313
 
314
  def _execute_api_request(self, prompt: str) -> Dict:
 
 
 
315
  headers = {
316
  "Authorization": f"Bearer {ResearchConfig.DEEPSEEK_API_KEY}",
317
  "Content-Type": "application/json",
 
342
  return {"error": str(e)}
343
 
344
  def _consensus_check(self, results: List[Dict]) -> Dict:
 
 
 
345
  valid_results = [r for r in results if "error" not in r]
346
  if not valid_results:
347
  logger.error("All API requests failed.")
 
421
  self.nodes = {}
422
  self.relations = []
423
  self.node_counter = 0
424
+
425
  def create_node(self, content: Dict, node_type: str) -> int:
426
  self.node_counter += 1
427
  self.nodes[self.node_counter] = {
 
431
  "connections": []
432
  }
433
  return self.node_counter
434
+
435
  def create_relation(self, source: int, target: int, rel_type: str, strength: float = 1.0):
436
  self.relations.append({
437
  "source": source,
 
440
  "strength": strength
441
  })
442
  self.nodes[source]["connections"].append(target)
443
+
444
  def visualize_graph(self, focus_node: int = None) -> str:
445
  dot = Digraph(engine="neato")
446
  for nid, node in self.nodes.items():
 
451
  if focus_node:
452
  dot.node(str(focus_node), color="red", style="filled")
453
  return dot.source
454
+
455
  def _truncate_content(self, content: Dict) -> str:
456
  return json.dumps(content)[:50] + "..."
457
 
 
462
  self.clip_model = clip_model
463
  self.clip_processor = clip_processor
464
  self.code_retriever = create_retriever_tool([], "Code Retriever", "Retriever for code snippets")
465
+
466
  def retrieve(self, query: str, domain: str) -> Dict[str, List]:
467
  results = {
468
  "text": self._retrieve_text(query),
 
470
  "code": self._retrieve_code(query)
471
  }
472
  return results
473
+
474
  def _retrieve_text(self, query: str) -> List[Any]:
475
  return self.text_retriever.invoke(query)
476
+
477
  def _retrieve_images(self, query: str) -> List[str]:
478
  inputs = self.clip_processor(text=query, return_tensors="pt")
479
  with torch.no_grad():
480
  _ = self.clip_model.get_text_features(**inputs)
481
  return ["image_result_1.png", "image_result_2.png"]
482
+
483
  def _retrieve_code(self, query: str) -> List[str]:
484
  return self.code_retriever.invoke(query)
485
 
 
497
  self.app = self.workflow.compile()
498
 
499
  def _build_workflow(self) -> None:
 
500
  self.workflow.add_node("ingest", self.ingest_query)
501
  self.workflow.add_node("retrieve", self.retrieve_documents)
502
  self.workflow.add_node("analyze", self.analyze_content)
 
553
 
554
  def analyze_content(self, state: AgentState) -> Dict:
555
  """
556
+ Analyzes the retrieved documents using a domain-specific fallback analysis.
557
+ If the domain matches one of the predefined domains, a hardcoded analysis is returned.
558
+ Otherwise, the normal backend analysis pipeline is used.
559
  """
560
  try:
561
+ domain = state["context"].get("domain", "Biomedical Research").lower()
562
+ fallback_analyses = {
563
+ "biomedical research": """
 
564
  # Biomedical Research Analysis
565
  ## Key Contributions
566
  - Integration of clinical trial design with digital biomarkers.
 
573
  ## Applications
574
  - Personalized medicine, early diagnosis, treatment optimization.
575
  """,
576
+ "legal research": """
577
  # Legal Research Analysis
578
  ## Key Contributions
579
  - Analysis of legal precedents using NLP.
 
586
  ## Applications
587
  - Legal analytics, risk assessment, regulatory compliance.
588
  """,
589
+ "environmental and energy studies": """
590
  # Environmental and Energy Studies Analysis
591
  ## Key Contributions
592
  - Novel approaches to renewable energy efficiency.
 
597
  ## Empirical Results
598
  - Enhanced performance in energy forecasting.
599
  ## Applications
600
+ - Sustainable urban planning and energy policy formulation.
601
  """,
602
+ "competitive programming and theoretical computer science": """
603
  # Competitive Programming & Theoretical CS Analysis
604
  ## Key Contributions
605
  - Advanced approximation algorithms for NP-hard problems.
 
612
  ## Applications
613
  - Optimization in competitive programming and algorithm design.
614
  """,
615
+ "social sciences": """
616
  # Social Sciences Analysis
617
  ## Key Contributions
618
  - Identification of economic trends through data analytics.
 
626
  - Policy design, urban studies, social impact analysis.
627
  """
628
  }
 
629
  if domain in fallback_analyses:
630
  logger.info(f"Using fallback analysis for domain: {domain}")
631
  return {
 
676
  refinement_history.append(current_analysis)
677
  difficulty_level = max(0, 3 - state["context"]["refine_count"])
678
  logger.info(f"Refinement iteration: {state['context']['refine_count']}, Difficulty level: {difficulty_level}")
 
679
  if state["context"]["refine_count"] >= 3:
680
  meta_prompt = (
681
  "You are given the following series of refinement outputs:\n" +
 
723
  }
724
 
725
  def enhance_analysis(self, state: AgentState) -> Dict:
 
 
 
 
726
  try:
727
  analysis = state["messages"][-1].content
728
  enhanced = f"{analysis}\n\n## Multi-Modal Insights\n"
 
904
  st.markdown(content)
905
 
906
  def _display_knowledge_graph(self) -> None:
 
907
  st.write("Knowledge Graph visualization is not implemented yet.")
908
 
909
  # ------------------------------