Reality123b committed on
Commit
5e99554
·
verified ·
1 Parent(s): 9823b36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -34
app.py CHANGED
@@ -17,8 +17,9 @@ import arxiv
17
  import scholarly
18
  import pymed
19
  import wikipedia
20
- #from newspaper import Article # Removed direct import
21
- from newspaper3k import Article # Import from newspaper3k
 
22
  import pickle
23
  import faiss
24
  import threading
@@ -282,10 +283,10 @@ def tool_search_scholar(query: str, max_results: int = 5) -> list:
282
 
283
  def extract_article_content(url: str) -> str:
284
  try:
285
- article = Article(url)
286
- article.download()
287
- article.parse()
288
- return article.text
289
  except Exception as e:
290
  logger.error(f"Failed to extract article content from {url}: {e}")
291
  return ""
@@ -575,14 +576,9 @@ def tool_draft_research_plan(prompt: str, entities: list, focus_areas: list = []
575
  return "Could not generate a research plan due to an error."
576
 
577
  def tool_extract_article(url: str) -> str:
578
- content = extract_article_content(url)
579
- if not content:
580
- return f"Could not extract content from {url}"
581
-
582
- if len(content) > MAX_FULL_TEXT_LENGTH:
583
- content = content[:MAX_FULL_TEXT_LENGTH] + "... [content truncated]"
584
-
585
- return content
586
 
587
  tools = {
588
  "search_web": {
@@ -679,7 +675,7 @@ tools = {
679
  "description": "Identifies contradictions across multiple insights.",
680
  "parameters": {
681
  "insights": {"type": "array", "description": "Collection of insights to analyze for contradictions."},
682
- },
683
  },
684
  "identify_focus_areas": {
685
  "function": tool_identify_focus_areas,
@@ -761,7 +757,7 @@ def deep_research(prompt):
761
  context = research_data.get('context', [])
762
  all_insights = research_data.get('all_insights', [])
763
  entity_specific_insights = research_data.get('entity_specific_insights', {})
764
- intermediate_output = "" # For Gradio display
765
  previous_queries = research_data.get('previous_queries', [])
766
  failed_queries = research_data.get('failed_queries', [])
767
  reasoning_context = research_data.get('reasoning_context', [])
@@ -772,12 +768,11 @@ def deep_research(prompt):
772
  contradictions = research_data.get('contradictions', [])
773
  research_session_id = research_data.get('research_session_id', str(uuid4()))
774
 
775
- # Restore or initialize FAISS index
776
  global index
777
  if research_data:
778
  logger.info("Restoring FAISS Index from loaded data.")
779
  else:
780
- index.reset() #Start Fresh
781
  logger.info("Initialized a fresh FAISS Index")
782
 
783
  key_entities_with_descriptions = tool_extract_key_entities(prompt=prompt)
@@ -793,14 +788,13 @@ def deep_research(prompt):
793
  entity_progress[entity]['queries'] = research_data[entity]['queries']
794
  entity_progress[entity]['insights'] = research_data[entity]['insights']
795
 
796
- if i == 0:
797
  initial_focus_areas = tool_identify_focus_areas(prompt=prompt)
798
  research_plan = tool_draft_research_plan(prompt=prompt, entities=key_entities, focus_areas=initial_focus_areas)
799
  context.append(f"Initial Research Plan: {research_plan[:200]}...")
800
  intermediate_output += f"Initial Research Plan:\n{research_plan}\n\n"
801
  focus_areas = initial_focus_areas
802
- elif not focus_areas:
803
- focus_areas = tool_identify_focus_areas(prompt=prompt, insights=all_insights, failed_areas=failed_areas)
804
 
805
  for i in range(MAX_ITERATIONS):
806
  if key_entities and i > 0:
@@ -811,8 +805,7 @@ def deep_research(prompt):
811
 
812
  context.append(f"Current focus: {current_entity}")
813
 
814
- # FAISS similarity search before web/arxiv/pubmed searches
815
- if i > 0: # Don't do it on first iteration
816
  faiss_results_indices = search_faiss_index(prompt if current_entity == 'general' else f"{prompt} {current_entity}")
817
  faiss_context = []
818
  for idx in faiss_results_indices:
@@ -852,7 +845,7 @@ def deep_research(prompt):
852
  entity_progress['general']['insights'].append(reasoning_output)
853
  reasoning_context.append(reasoning_output)
854
  context.append(f"Initial Reasoning: {reasoning_output[:200]}...")
855
- add_to_faiss_index(reasoning_output) # Add reasoning to FAISS
856
  else:
857
  failed_queries.append(initial_query)
858
  context.append(f"Initial query yielded no relevant results: {initial_query}")
@@ -904,7 +897,7 @@ def deep_research(prompt):
904
  entity_specific_insights[current_entity].append(entity_reasoning)
905
 
906
  context.append(f"Reasoning about {current_entity}: {entity_reasoning[:200]}...")
907
- add_to_faiss_index(entity_reasoning) # Add to FAISS
908
  else:
909
  failed_queries.append(entity_query)
910
  context.append(f"Entity query for {current_entity} yielded no relevant results")
@@ -998,7 +991,7 @@ def deep_research(prompt):
998
  entity_specific_insights[current_entity].append(result)
999
  else:
1000
  reasoning_context.append(result)
1001
- add_to_faiss_index(result) # Add reasoning to FAISS
1002
  all_insights.append(result)
1003
 
1004
  elif tool_name == "critique_reasoning":
@@ -1040,7 +1033,7 @@ def deep_research(prompt):
1040
  reasoning_about_article = tool_reason(prompt=prompt, search_results=[{"title": "Extracted Article", "snippet": result, "url": parameters['url']}])
1041
  if reasoning_about_article:
1042
  all_insights.append(reasoning_about_article)
1043
- add_to_faiss_index(reasoning_about_article) # Add to FAISS
1044
 
1045
 
1046
  elif tool_name == "meta_analyze":
@@ -1052,7 +1045,7 @@ def deep_research(prompt):
1052
  if result:
1053
  all_insights.append(result)
1054
  context.append(f"Meta-analysis across entities: {result[:200]}...")
1055
- add_to_faiss_index(result) # Add to FAISS
1056
 
1057
  elif tool_name == "draft_research_plan":
1058
  result = "Research plan already generated."
@@ -1077,7 +1070,6 @@ def deep_research(prompt):
1077
  intermediate_output += f"Iteration {i+1} - Error: {str(e)}\n"
1078
  continue
1079
 
1080
- # Save research data after each iteration
1081
  research_data = {
1082
  'context': context,
1083
  'all_insights': all_insights,
@@ -1088,7 +1080,7 @@ def deep_research(prompt):
1088
  'previous_critiques': previous_critiques,
1089
  'focus_areas': focus_areas,
1090
  'failed_areas': failed_areas,
1091
- 'seen_snippets': list(seen_snippets), # Convert set to list for pickling
1092
  'contradictions': contradictions,
1093
  'research_session_id': research_session_id
1094
  }
@@ -1134,8 +1126,6 @@ def deep_research(prompt):
1134
 
1135
  return full_output
1136
 
1137
- # Gradio Interface
1138
-
1139
  custom_css = """
1140
  .gradio-container {
1141
  background-color: #f7f9fc;
@@ -1143,7 +1133,7 @@ custom_css = """
1143
  .output-box {
1144
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
1145
  line-height: 1.5;
1146
- font-size: 14px; /* Increased font size */
1147
  }
1148
  h3 {
1149
  color: #2c3e50;
@@ -1177,7 +1167,7 @@ iface = gr.Interface(
1177
  theme="default",
1178
  cache_examples=False,
1179
  css=custom_css,
1180
- allow_flagging="never", # Disable flagging
1181
  )
1182
 
1183
  if __name__ == "__main__":
 
17
  import scholarly
18
  import pymed
19
  import wikipedia
20
+ #from newspaper3k import Article # Removed newspaper3k
21
+ import trafilatura # Import trafilatura
22
+ from trafilatura import extract, fetch_url
23
  import pickle
24
  import faiss
25
  import threading
 
283
 
284
  def extract_article_content(url: str) -> str:
285
  try:
286
+ downloaded = fetch_url(url)
287
+ if downloaded is None: # Handle potential download failures
288
+ return ""
289
+ return extract(downloaded, favor_precision=True) #Added favor_precision
290
  except Exception as e:
291
  logger.error(f"Failed to extract article content from {url}: {e}")
292
  return ""
 
576
  return "Could not generate a research plan due to an error."
577
 
578
  def tool_extract_article(url: str) -> str:
579
+ # Use trafilatura's extraction function
580
+ extracted_text = extract_article_content(url)
581
+ return extracted_text if extracted_text else f"Could not extract content from {url}"
 
 
 
 
 
582
 
583
  tools = {
584
  "search_web": {
 
675
  "description": "Identifies contradictions across multiple insights.",
676
  "parameters": {
677
  "insights": {"type": "array", "description": "Collection of insights to analyze for contradictions."},
678
+ },
679
  },
680
  "identify_focus_areas": {
681
  "function": tool_identify_focus_areas,
 
757
  context = research_data.get('context', [])
758
  all_insights = research_data.get('all_insights', [])
759
  entity_specific_insights = research_data.get('entity_specific_insights', {})
760
+ intermediate_output = ""
761
  previous_queries = research_data.get('previous_queries', [])
762
  failed_queries = research_data.get('failed_queries', [])
763
  reasoning_context = research_data.get('reasoning_context', [])
 
768
  contradictions = research_data.get('contradictions', [])
769
  research_session_id = research_data.get('research_session_id', str(uuid4()))
770
 
 
771
  global index
772
  if research_data:
773
  logger.info("Restoring FAISS Index from loaded data.")
774
  else:
775
+ index.reset()
776
  logger.info("Initialized a fresh FAISS Index")
777
 
778
  key_entities_with_descriptions = tool_extract_key_entities(prompt=prompt)
 
788
  entity_progress[entity]['queries'] = research_data[entity]['queries']
789
  entity_progress[entity]['insights'] = research_data[entity]['insights']
790
 
791
+ if not focus_areas: # Corrected placement: outside the loop
792
  initial_focus_areas = tool_identify_focus_areas(prompt=prompt)
793
  research_plan = tool_draft_research_plan(prompt=prompt, entities=key_entities, focus_areas=initial_focus_areas)
794
  context.append(f"Initial Research Plan: {research_plan[:200]}...")
795
  intermediate_output += f"Initial Research Plan:\n{research_plan}\n\n"
796
  focus_areas = initial_focus_areas
797
+
 
798
 
799
  for i in range(MAX_ITERATIONS):
800
  if key_entities and i > 0:
 
805
 
806
  context.append(f"Current focus: {current_entity}")
807
 
808
+ if i > 0:
 
809
  faiss_results_indices = search_faiss_index(prompt if current_entity == 'general' else f"{prompt} {current_entity}")
810
  faiss_context = []
811
  for idx in faiss_results_indices:
 
845
  entity_progress['general']['insights'].append(reasoning_output)
846
  reasoning_context.append(reasoning_output)
847
  context.append(f"Initial Reasoning: {reasoning_output[:200]}...")
848
+ add_to_faiss_index(reasoning_output)
849
  else:
850
  failed_queries.append(initial_query)
851
  context.append(f"Initial query yielded no relevant results: {initial_query}")
 
897
  entity_specific_insights[current_entity].append(entity_reasoning)
898
 
899
  context.append(f"Reasoning about {current_entity}: {entity_reasoning[:200]}...")
900
+ add_to_faiss_index(entity_reasoning)
901
  else:
902
  failed_queries.append(entity_query)
903
  context.append(f"Entity query for {current_entity} yielded no relevant results")
 
991
  entity_specific_insights[current_entity].append(result)
992
  else:
993
  reasoning_context.append(result)
994
+ add_to_faiss_index(result)
995
  all_insights.append(result)
996
 
997
  elif tool_name == "critique_reasoning":
 
1033
  reasoning_about_article = tool_reason(prompt=prompt, search_results=[{"title": "Extracted Article", "snippet": result, "url": parameters['url']}])
1034
  if reasoning_about_article:
1035
  all_insights.append(reasoning_about_article)
1036
+ add_to_faiss_index(reasoning_about_article)
1037
 
1038
 
1039
  elif tool_name == "meta_analyze":
 
1045
  if result:
1046
  all_insights.append(result)
1047
  context.append(f"Meta-analysis across entities: {result[:200]}...")
1048
+ add_to_faiss_index(result)
1049
 
1050
  elif tool_name == "draft_research_plan":
1051
  result = "Research plan already generated."
 
1070
  intermediate_output += f"Iteration {i+1} - Error: {str(e)}\n"
1071
  continue
1072
 
 
1073
  research_data = {
1074
  'context': context,
1075
  'all_insights': all_insights,
 
1080
  'previous_critiques': previous_critiques,
1081
  'focus_areas': focus_areas,
1082
  'failed_areas': failed_areas,
1083
+ 'seen_snippets': list(seen_snippets),
1084
  'contradictions': contradictions,
1085
  'research_session_id': research_session_id
1086
  }
 
1126
 
1127
  return full_output
1128
 
 
 
1129
  custom_css = """
1130
  .gradio-container {
1131
  background-color: #f7f9fc;
 
1133
  .output-box {
1134
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
1135
  line-height: 1.5;
1136
+ font-size: 14px;
1137
  }
1138
  h3 {
1139
  color: #2c3e50;
 
1167
  theme="default",
1168
  cache_examples=False,
1169
  css=custom_css,
1170
+ allow_flagging="never",
1171
  )
1172
 
1173
  if __name__ == "__main__":