Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -17,8 +17,9 @@ import arxiv
|
|
17 |
import scholarly
|
18 |
import pymed
|
19 |
import wikipedia
|
20 |
-
#from
|
21 |
-
|
|
|
22 |
import pickle
|
23 |
import faiss
|
24 |
import threading
|
@@ -282,10 +283,10 @@ def tool_search_scholar(query: str, max_results: int = 5) -> list:
|
|
282 |
|
283 |
def extract_article_content(url: str) -> str:
|
284 |
try:
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
return
|
289 |
except Exception as e:
|
290 |
logger.error(f"Failed to extract article content from {url}: {e}")
|
291 |
return ""
|
@@ -575,14 +576,9 @@ def tool_draft_research_plan(prompt: str, entities: list, focus_areas: list = []
|
|
575 |
return "Could not generate a research plan due to an error."
|
576 |
|
577 |
def tool_extract_article(url: str) -> str:
|
578 |
-
|
579 |
-
|
580 |
-
|
581 |
-
|
582 |
-
if len(content) > MAX_FULL_TEXT_LENGTH:
|
583 |
-
content = content[:MAX_FULL_TEXT_LENGTH] + "... [content truncated]"
|
584 |
-
|
585 |
-
return content
|
586 |
|
587 |
tools = {
|
588 |
"search_web": {
|
@@ -679,7 +675,7 @@ tools = {
|
|
679 |
"description": "Identifies contradictions across multiple insights.",
|
680 |
"parameters": {
|
681 |
"insights": {"type": "array", "description": "Collection of insights to analyze for contradictions."},
|
682 |
-
|
683 |
},
|
684 |
"identify_focus_areas": {
|
685 |
"function": tool_identify_focus_areas,
|
@@ -761,7 +757,7 @@ def deep_research(prompt):
|
|
761 |
context = research_data.get('context', [])
|
762 |
all_insights = research_data.get('all_insights', [])
|
763 |
entity_specific_insights = research_data.get('entity_specific_insights', {})
|
764 |
-
intermediate_output = ""
|
765 |
previous_queries = research_data.get('previous_queries', [])
|
766 |
failed_queries = research_data.get('failed_queries', [])
|
767 |
reasoning_context = research_data.get('reasoning_context', [])
|
@@ -772,12 +768,11 @@ def deep_research(prompt):
|
|
772 |
contradictions = research_data.get('contradictions', [])
|
773 |
research_session_id = research_data.get('research_session_id', str(uuid4()))
|
774 |
|
775 |
-
# Restore or initialize FAISS index
|
776 |
global index
|
777 |
if research_data:
|
778 |
logger.info("Restoring FAISS Index from loaded data.")
|
779 |
else:
|
780 |
-
index.reset()
|
781 |
logger.info("Initialized a fresh FAISS Index")
|
782 |
|
783 |
key_entities_with_descriptions = tool_extract_key_entities(prompt=prompt)
|
@@ -793,14 +788,13 @@ def deep_research(prompt):
|
|
793 |
entity_progress[entity]['queries'] = research_data[entity]['queries']
|
794 |
entity_progress[entity]['insights'] = research_data[entity]['insights']
|
795 |
|
796 |
-
if
|
797 |
initial_focus_areas = tool_identify_focus_areas(prompt=prompt)
|
798 |
research_plan = tool_draft_research_plan(prompt=prompt, entities=key_entities, focus_areas=initial_focus_areas)
|
799 |
context.append(f"Initial Research Plan: {research_plan[:200]}...")
|
800 |
intermediate_output += f"Initial Research Plan:\n{research_plan}\n\n"
|
801 |
focus_areas = initial_focus_areas
|
802 |
-
|
803 |
-
focus_areas = tool_identify_focus_areas(prompt=prompt, insights=all_insights, failed_areas=failed_areas)
|
804 |
|
805 |
for i in range(MAX_ITERATIONS):
|
806 |
if key_entities and i > 0:
|
@@ -811,8 +805,7 @@ def deep_research(prompt):
|
|
811 |
|
812 |
context.append(f"Current focus: {current_entity}")
|
813 |
|
814 |
-
|
815 |
-
if i > 0: # Don't do it on first iteration
|
816 |
faiss_results_indices = search_faiss_index(prompt if current_entity == 'general' else f"{prompt} {current_entity}")
|
817 |
faiss_context = []
|
818 |
for idx in faiss_results_indices:
|
@@ -852,7 +845,7 @@ def deep_research(prompt):
|
|
852 |
entity_progress['general']['insights'].append(reasoning_output)
|
853 |
reasoning_context.append(reasoning_output)
|
854 |
context.append(f"Initial Reasoning: {reasoning_output[:200]}...")
|
855 |
-
add_to_faiss_index(reasoning_output)
|
856 |
else:
|
857 |
failed_queries.append(initial_query)
|
858 |
context.append(f"Initial query yielded no relevant results: {initial_query}")
|
@@ -904,7 +897,7 @@ def deep_research(prompt):
|
|
904 |
entity_specific_insights[current_entity].append(entity_reasoning)
|
905 |
|
906 |
context.append(f"Reasoning about {current_entity}: {entity_reasoning[:200]}...")
|
907 |
-
add_to_faiss_index(entity_reasoning)
|
908 |
else:
|
909 |
failed_queries.append(entity_query)
|
910 |
context.append(f"Entity query for {current_entity} yielded no relevant results")
|
@@ -998,7 +991,7 @@ def deep_research(prompt):
|
|
998 |
entity_specific_insights[current_entity].append(result)
|
999 |
else:
|
1000 |
reasoning_context.append(result)
|
1001 |
-
add_to_faiss_index(result)
|
1002 |
all_insights.append(result)
|
1003 |
|
1004 |
elif tool_name == "critique_reasoning":
|
@@ -1040,7 +1033,7 @@ def deep_research(prompt):
|
|
1040 |
reasoning_about_article = tool_reason(prompt=prompt, search_results=[{"title": "Extracted Article", "snippet": result, "url": parameters['url']}])
|
1041 |
if reasoning_about_article:
|
1042 |
all_insights.append(reasoning_about_article)
|
1043 |
-
add_to_faiss_index(reasoning_about_article)
|
1044 |
|
1045 |
|
1046 |
elif tool_name == "meta_analyze":
|
@@ -1052,7 +1045,7 @@ def deep_research(prompt):
|
|
1052 |
if result:
|
1053 |
all_insights.append(result)
|
1054 |
context.append(f"Meta-analysis across entities: {result[:200]}...")
|
1055 |
-
add_to_faiss_index(result)
|
1056 |
|
1057 |
elif tool_name == "draft_research_plan":
|
1058 |
result = "Research plan already generated."
|
@@ -1077,7 +1070,6 @@ def deep_research(prompt):
|
|
1077 |
intermediate_output += f"Iteration {i+1} - Error: {str(e)}\n"
|
1078 |
continue
|
1079 |
|
1080 |
-
# Save research data after each iteration
|
1081 |
research_data = {
|
1082 |
'context': context,
|
1083 |
'all_insights': all_insights,
|
@@ -1088,7 +1080,7 @@ def deep_research(prompt):
|
|
1088 |
'previous_critiques': previous_critiques,
|
1089 |
'focus_areas': focus_areas,
|
1090 |
'failed_areas': failed_areas,
|
1091 |
-
'seen_snippets': list(seen_snippets),
|
1092 |
'contradictions': contradictions,
|
1093 |
'research_session_id': research_session_id
|
1094 |
}
|
@@ -1134,8 +1126,6 @@ def deep_research(prompt):
|
|
1134 |
|
1135 |
return full_output
|
1136 |
|
1137 |
-
# Gradio Interface
|
1138 |
-
|
1139 |
custom_css = """
|
1140 |
.gradio-container {
|
1141 |
background-color: #f7f9fc;
|
@@ -1143,7 +1133,7 @@ custom_css = """
|
|
1143 |
.output-box {
|
1144 |
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
|
1145 |
line-height: 1.5;
|
1146 |
-
font-size: 14px;
|
1147 |
}
|
1148 |
h3 {
|
1149 |
color: #2c3e50;
|
@@ -1177,7 +1167,7 @@ iface = gr.Interface(
|
|
1177 |
theme="default",
|
1178 |
cache_examples=False,
|
1179 |
css=custom_css,
|
1180 |
-
allow_flagging="never",
|
1181 |
)
|
1182 |
|
1183 |
if __name__ == "__main__":
|
|
|
17 |
import scholarly
|
18 |
import pymed
|
19 |
import wikipedia
|
20 |
+
#from newspaper3k import Article # Removed newspaper3k
|
21 |
+
import trafilatura # Import trafilatura
|
22 |
+
from trafilatura import extract, fetch_url
|
23 |
import pickle
|
24 |
import faiss
|
25 |
import threading
|
|
|
283 |
|
284 |
def extract_article_content(url: str) -> str:
    """Download *url* and return its main article text.

    The page is fetched with trafilatura's ``fetch_url`` and the text is
    pulled out with ``extract(..., favor_precision=True)``.

    Args:
        url: Address of the page to download and extract.

    Returns:
        The extracted text, or ``""`` when the download fails, when no
        text could be extracted, or when any exception is raised (the
        exception is logged, not propagated).
    """
    try:
        downloaded = fetch_url(url)
        if downloaded is None:  # Download failed; nothing to extract.
            return ""
        # extract() yields None when it finds no usable main text, so
        # normalize to "" to honor the declared ``str`` return type.
        return extract(downloaded, favor_precision=True) or ""
    except Exception as e:
        # Best-effort boundary: a failed extraction must not abort the caller.
        logger.error(f"Failed to extract article content from {url}: {e}")
        return ""
|
|
|
576 |
return "Could not generate a research plan due to an error."
|
577 |
|
578 |
def tool_extract_article(url: str) -> str:
    """Return the article text for *url*, or a fallback message.

    Delegates to ``extract_article_content``; when that yields a falsy
    result, a "Could not extract content" message naming the URL is
    returned instead.
    """
    article_body = extract_article_content(url)
    if not article_body:
        return f"Could not extract content from {url}"
    return article_body
|
|
|
|
|
|
|
|
|
|
|
582 |
|
583 |
tools = {
|
584 |
"search_web": {
|
|
|
675 |
"description": "Identifies contradictions across multiple insights.",
|
676 |
"parameters": {
|
677 |
"insights": {"type": "array", "description": "Collection of insights to analyze for contradictions."},
|
678 |
+
},
|
679 |
},
|
680 |
"identify_focus_areas": {
|
681 |
"function": tool_identify_focus_areas,
|
|
|
757 |
context = research_data.get('context', [])
|
758 |
all_insights = research_data.get('all_insights', [])
|
759 |
entity_specific_insights = research_data.get('entity_specific_insights', {})
|
760 |
+
intermediate_output = ""
|
761 |
previous_queries = research_data.get('previous_queries', [])
|
762 |
failed_queries = research_data.get('failed_queries', [])
|
763 |
reasoning_context = research_data.get('reasoning_context', [])
|
|
|
768 |
contradictions = research_data.get('contradictions', [])
|
769 |
research_session_id = research_data.get('research_session_id', str(uuid4()))
|
770 |
|
|
|
771 |
global index
|
772 |
if research_data:
|
773 |
logger.info("Restoring FAISS Index from loaded data.")
|
774 |
else:
|
775 |
+
index.reset()
|
776 |
logger.info("Initialized a fresh FAISS Index")
|
777 |
|
778 |
key_entities_with_descriptions = tool_extract_key_entities(prompt=prompt)
|
|
|
788 |
entity_progress[entity]['queries'] = research_data[entity]['queries']
|
789 |
entity_progress[entity]['insights'] = research_data[entity]['insights']
|
790 |
|
791 |
+
if not focus_areas: # Corrected placement: outside the loop
|
792 |
initial_focus_areas = tool_identify_focus_areas(prompt=prompt)
|
793 |
research_plan = tool_draft_research_plan(prompt=prompt, entities=key_entities, focus_areas=initial_focus_areas)
|
794 |
context.append(f"Initial Research Plan: {research_plan[:200]}...")
|
795 |
intermediate_output += f"Initial Research Plan:\n{research_plan}\n\n"
|
796 |
focus_areas = initial_focus_areas
|
797 |
+
|
|
|
798 |
|
799 |
for i in range(MAX_ITERATIONS):
|
800 |
if key_entities and i > 0:
|
|
|
805 |
|
806 |
context.append(f"Current focus: {current_entity}")
|
807 |
|
808 |
+
if i > 0:
|
|
|
809 |
faiss_results_indices = search_faiss_index(prompt if current_entity == 'general' else f"{prompt} {current_entity}")
|
810 |
faiss_context = []
|
811 |
for idx in faiss_results_indices:
|
|
|
845 |
entity_progress['general']['insights'].append(reasoning_output)
|
846 |
reasoning_context.append(reasoning_output)
|
847 |
context.append(f"Initial Reasoning: {reasoning_output[:200]}...")
|
848 |
+
add_to_faiss_index(reasoning_output)
|
849 |
else:
|
850 |
failed_queries.append(initial_query)
|
851 |
context.append(f"Initial query yielded no relevant results: {initial_query}")
|
|
|
897 |
entity_specific_insights[current_entity].append(entity_reasoning)
|
898 |
|
899 |
context.append(f"Reasoning about {current_entity}: {entity_reasoning[:200]}...")
|
900 |
+
add_to_faiss_index(entity_reasoning)
|
901 |
else:
|
902 |
failed_queries.append(entity_query)
|
903 |
context.append(f"Entity query for {current_entity} yielded no relevant results")
|
|
|
991 |
entity_specific_insights[current_entity].append(result)
|
992 |
else:
|
993 |
reasoning_context.append(result)
|
994 |
+
add_to_faiss_index(result)
|
995 |
all_insights.append(result)
|
996 |
|
997 |
elif tool_name == "critique_reasoning":
|
|
|
1033 |
reasoning_about_article = tool_reason(prompt=prompt, search_results=[{"title": "Extracted Article", "snippet": result, "url": parameters['url']}])
|
1034 |
if reasoning_about_article:
|
1035 |
all_insights.append(reasoning_about_article)
|
1036 |
+
add_to_faiss_index(reasoning_about_article)
|
1037 |
|
1038 |
|
1039 |
elif tool_name == "meta_analyze":
|
|
|
1045 |
if result:
|
1046 |
all_insights.append(result)
|
1047 |
context.append(f"Meta-analysis across entities: {result[:200]}...")
|
1048 |
+
add_to_faiss_index(result)
|
1049 |
|
1050 |
elif tool_name == "draft_research_plan":
|
1051 |
result = "Research plan already generated."
|
|
|
1070 |
intermediate_output += f"Iteration {i+1} - Error: {str(e)}\n"
|
1071 |
continue
|
1072 |
|
|
|
1073 |
research_data = {
|
1074 |
'context': context,
|
1075 |
'all_insights': all_insights,
|
|
|
1080 |
'previous_critiques': previous_critiques,
|
1081 |
'focus_areas': focus_areas,
|
1082 |
'failed_areas': failed_areas,
|
1083 |
+
'seen_snippets': list(seen_snippets),
|
1084 |
'contradictions': contradictions,
|
1085 |
'research_session_id': research_session_id
|
1086 |
}
|
|
|
1126 |
|
1127 |
return full_output
|
1128 |
|
|
|
|
|
1129 |
custom_css = """
|
1130 |
.gradio-container {
|
1131 |
background-color: #f7f9fc;
|
|
|
1133 |
.output-box {
|
1134 |
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
|
1135 |
line-height: 1.5;
|
1136 |
+
font-size: 14px;
|
1137 |
}
|
1138 |
h3 {
|
1139 |
color: #2c3e50;
|
|
|
1167 |
theme="default",
|
1168 |
cache_examples=False,
|
1169 |
css=custom_css,
|
1170 |
+
allow_flagging="never",
|
1171 |
)
|
1172 |
|
1173 |
if __name__ == "__main__":
|