client = InferenceClient(provider="hf-inference", api_key=HF_API_KEY)

# --- Model configuration ---------------------------------------------------
MAIN_LLM_MODEL = "mistralai/Mistral-Nemo-Instruct-2407"  # main reasoning, synthesis, query generation
REASONING_LLM_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"  # analytical tasks
CRITIC_LLM_MODEL = "Qwen/QwQ-32B-Preview"  # critiques and contradiction detection
SPECIALIST_MODELS = {
    "medical": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "scientific": "mistralai/Mistral-7B-Instruct-v0.2",
    # NOTE(review): the remaining entries of this dict lie between the two
    # diff hunks and are not visible in this excerpt -- keep them unchanged.
}
ENSEMBLE_MODELS = [MAIN_LLM_MODEL, REASONING_LLM_MODEL, CRITIC_LLM_MODEL] + list(SPECIALIST_MODELS.values())

# --- Research parameters ---------------------------------------------------
MAX_ITERATIONS = 100  # raised for deeper research
TIMEOUT = 300
RETRY_DELAY = 15  # base back-off in seconds between hf_inference retries
NUM_RESULTS = 50  # default number of web search results
SIMILARITY_THRESHOLD = 0.12  # deliberately lenient for broader coverage
MAX_CONTEXT_ITEMS = 100
MAX_HISTORY_ITEMS = 25  # cap on history entries injected into prompts
MAX_FULL_TEXT_LENGTH = 50000
FAISS_INDEX_PATH = "research_index.faiss"
RESEARCH_DATA_PATH = "research_data.pkl"
PAPER_SUMMARIES_PATH = "paper_summaries.pkl"

# Load the embedding models and open (or create) the FAISS index at import
# time; any failure is logged and re-raised because nothing below can work
# without them.
try:
    main_similarity_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    concept_similarity_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    document_similarity_model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')

    embedding_dim = document_similarity_model.get_sentence_embedding_dimension()
    if os.path.exists(FAISS_INDEX_PATH):
        index = faiss.read_index(FAISS_INDEX_PATH)
        logger.info(f"Loaded FAISS index from {FAISS_INDEX_PATH}")
    else:
        # Inner-product index; vectors are L2-normalized before insertion,
        # so the inner product acts as cosine similarity.
        index = faiss.IndexFlatIP(embedding_dim)
        logger.info("Created a new FAISS index.")
except Exception as e:
    logger.error(f"Failed to load models or initialize FAISS: {e}")
    raise


def get_token_count(text):
    """Return the number of tokens in *text*.

    Uses tiktoken's ``cl100k_base`` encoding. If tiktoken fails for any
    reason, falls back to an estimate of 1.3 tokens per whitespace-separated
    word (truncated to an int so both paths return the same type).
    """
    try:
        encoding = tiktoken.get_encoding("cl100k_base")
        return len(encoding.encode(text))
    except Exception:
        return int(len(text.split()) * 1.3)


def _load_pickle(path, label):
    """Return the unpickled contents of *path*, or ``{}`` if missing or unreadable.

    *label* is the human-readable name used in log messages.
    """
    if not os.path.exists(path):
        logger.info(f"No existing {label} found.")
        return {}
    try:
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load files this application wrote itself.
        with open(path, "rb") as f:
            data = pickle.load(f)
        logger.info(f"Loaded {label} from {path}")
        return data
    except Exception as e:
        logger.error(f"Error loading {label}: {e}")
        return {}


def save_research_data(data, index):
    """Pickle *data* to RESEARCH_DATA_PATH and write *index* to FAISS_INDEX_PATH.

    Best effort: failures are logged, never raised.
    """
    try:
        with open(RESEARCH_DATA_PATH, "wb") as f:
            pickle.dump(data, f)
        faiss.write_index(index, FAISS_INDEX_PATH)
        logger.info(f"Research data and index saved to {RESEARCH_DATA_PATH} and {FAISS_INDEX_PATH}")
    except Exception as e:
        logger.error(f"Error saving research data: {e}")


def load_research_data():
    """Return the research data stored at RESEARCH_DATA_PATH, or ``{}`` if unavailable."""
    return _load_pickle(RESEARCH_DATA_PATH, "research data")


def save_paper_summaries(summaries: Dict[str, str]):
    """Pickle *summaries* to PAPER_SUMMARIES_PATH; failures are logged, never raised."""
    try:
        with open(PAPER_SUMMARIES_PATH, "wb") as f:
            pickle.dump(summaries, f)
        logger.info(f"Paper summaries saved to {PAPER_SUMMARIES_PATH}")
    except Exception as e:
        logger.error(f"Error saving paper summaries: {e}")


def load_paper_summaries() -> Dict[str, str]:
    """Return the summaries stored at PAPER_SUMMARIES_PATH, or ``{}`` if unavailable."""
    return _load_pickle(PAPER_SUMMARIES_PATH, "paper summaries")


def hf_inference(model_name, prompt, max_tokens=2000, retries=5, stream=False):
    """Send *prompt* as a single user message to *model_name*.

    Args:
        model_name: Hugging Face model id.
        prompt: Text of the user message.
        max_tokens: Generation limit passed to the API.
        retries: Number of attempts before giving up.
        stream: If true, return the streaming response iterator unchanged.

    Returns:
        With ``stream=True``: the chunk iterator from the client.
        Otherwise ``{"generated_text": str}`` on success, or
        ``{"error": str}`` once all attempts have failed.
    """
    messages = [{"role": "user", "content": prompt}]
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_tokens=max_tokens,
                stream=stream,
            )
            if stream:
                return response
            # A non-streaming call returns the completed object directly; it
            # is not an iterator, so it must not be passed through next().
            return {"generated_text": response.choices[0].message.content or ""}
        except Exception as e:
            if attempt == retries - 1:
                logger.error(f"Request failed after {retries} retries: {e}")
                return {"error": f"Request failed after {retries} retries: {e}"}
            # Linear back-off: RETRY_DELAY, 2 * RETRY_DELAY, ...
            time.sleep(RETRY_DELAY * (1 + attempt))
    # Only reached when retries <= 0.
    return {"error": "Request failed after multiple retries."}


def _stream_ensemble(prompt, models, max_tokens):
    """Yield ``{"model", "text"}`` chunks from every model in *models*.

    Requests are opened in parallel; each stream is then drained in the
    order the requests complete.
    """
    with ThreadPoolExecutor(max_workers=max(1, len(models))) as executor:
        future_to_model = {
            executor.submit(hf_inference, model, prompt, max_tokens, stream=True): model
            for model in models
        }
        for future in as_completed(future_to_model):
            model = future_to_model[future]
            try:
                response = future.result()
                if isinstance(response, dict):
                    # hf_inference reports exhausted retries as {"error": ...}.
                    yield {"model": model, "text": f"Error: {response.get('error', 'unknown error')}"}
                    continue
                for chunk in response:
                    if not chunk.choices:
                        continue
                    text = chunk.choices[0].delta.content
                    if text:  # skip chunks that carry no text
                        yield {"model": model, "text": text}
            except Exception as e:
                logger.error(f"Error with model {model}: {e}")
                yield {"model": model, "text": f"Error: {e}"}


def ensemble_inference(prompt, models=ENSEMBLE_MODELS, max_tokens=1500, stream=False):
    """Query several models with *prompt* and combine their answers.

    Args:
        prompt: Text sent to every model.
        models: Model ids to query (defaults to ENSEMBLE_MODELS).
        max_tokens: Per-model generation limit.
        stream: If true, return a generator of ``{"model", "text"}`` chunks
            instead of a synthesized answer.

    Returns:
        Streaming: a generator (see above). Otherwise
        ``{"generated_text": str}``, or ``{"error": str}`` if no model
        produced a response. With several responses MAIN_LLM_MODEL is asked
        to synthesize them; if that fails the longest response is returned.
    """
    if stream:
        return _stream_ensemble(prompt, models, max_tokens)

    results = []
    # max(1, ...) because ThreadPoolExecutor rejects max_workers=0.
    with ThreadPoolExecutor(max_workers=max(1, len(models))) as executor:
        future_to_model = {
            executor.submit(hf_inference, model, prompt, max_tokens, stream=False): model
            for model in models
        }
        for future in as_completed(future_to_model):
            model = future_to_model[future]
            try:
                result = future.result()
                if "generated_text" in result:
                    results.append({"model": model, "text": result["generated_text"]})
                else:
                    logger.warning(f"Model {model} returned no text: {result}")
            except Exception as e:
                logger.error(f"Error with model {model}: {e}")

    if not results:
        return {"error": "All models failed to generate responses"}

    if len(results) == 1:
        return {"generated_text": results[0]["text"]}

    synthesis_prompt = "Synthesize these expert responses into a single coherent answer:\n\n"
    for number, result in enumerate(results, start=1):
        synthesis_prompt += f"Expert {number} ({result['model'].split('/')[-1]}):\n{result['text']}\n\n"

    synthesis = hf_inference(MAIN_LLM_MODEL, synthesis_prompt)
    if "generated_text" in synthesis:
        return synthesis
    # Fallback: the longest individual response.
    return {"generated_text": max(results, key=lambda x: len(x["text"]))["text"]}


def _truncate_snippet(text, limit=500):
    """Return *text* cut to *limit* characters, with "..." appended only if it was cut."""
    return text[:limit] + "..." if len(text) > limit else text


def tool_search_web(query: str, num_results: int = NUM_RESULTS, safesearch: str = "moderate",
                    time_filter: Optional[str] = None, region: str = "wt-wt", language: str = "en-us") -> list:
    """Search DuckDuckGo and return ``{"title", "snippet", "url"}`` dicts.

    Args:
        query: Search keywords.
        num_results: Maximum number of results.
        safesearch: DuckDuckGo safe-search level.
        time_filter: One of ``'d'``, ``'w'``, ``'m'``, ``'y'``; other values
            are ignored. If the filtered search is empty it is retried
            without the filter.
        region: DuckDuckGo region code.
        language: Accepted for backward compatibility but not forwarded.
            NOTE(review): ``DDGS.text()`` has no language/``hreflang``
            parameter (the region code selects it) -- confirm against the
            installed duckduckgo_search version.

    Returns:
        A list of result dicts; ``[]`` on no results or on any error.
    """
    try:
        with DDGS() as ddgs:
            kwargs = {
                "keywords": query,
                "max_results": num_results,
                "safesearch": safesearch,
                "region": region,
            }
            if time_filter in ('d', 'w', 'm', 'y'):
                # DDGS.text() names this parameter "timelimit"; an unknown
                # keyword would raise TypeError and be swallowed below.
                kwargs["timelimit"] = time_filter

            results = list(ddgs.text(**kwargs))
            if not results and "timelimit" in kwargs:
                del kwargs["timelimit"]
                results = list(ddgs.text(**kwargs))
            return [{"title": r["title"], "snippet": r["body"], "url": r["href"]} for r in results]
    except Exception as e:
        logger.error(f"DuckDuckGo search error: {e}")
        return []


def tool_search_arxiv(query: str, max_results: int = 5) -> list:
    """Search arXiv by relevance and return result dicts tagged ``source="arXiv"``.

    Returns ``[]`` on any error.
    """
    try:
        # Named arxiv_client so it does not shadow the module-level
        # inference ``client``.
        arxiv_client = arxiv.Client()
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.Relevance
        )
        return [
            {
                "title": paper.title,
                "snippet": _truncate_snippet(paper.summary),
                "url": paper.pdf_url,
                "authors": ", ".join(author.name for author in paper.authors),
                "published": paper.published.strftime("%Y-%m-%d") if paper.published else "Unknown",
                "source": "arXiv"
            }
            for paper in arxiv_client.results(search)
        ]
    except Exception as e:
        logger.error(f"arXiv search error: {e}")
        return []


def _pubmed_author_name(author: dict) -> str:
    """Return a display name for one PubMed author dict."""
    name = author.get("name")
    if name:
        return name
    # NOTE(review): pymed appears to expose "firstname"/"lastname" rather
    # than "name" -- confirm; both layouts are handled here.
    return " ".join(part for part in (author.get("firstname"), author.get("lastname")) if part)


def tool_search_pubmed(query: str, max_results: int = 5) -> list:
    """Search PubMed and return result dicts tagged ``source="PubMed"``.

    Articles that cannot be converted are skipped with a warning. Returns
    ``[]`` on any search-level error.
    """
    try:
        pubmed = pymed.PubMed(tool="ResearchAssistant", email="researcher@example.com")
        results = list(pubmed.query(query, max_results=max_results))

        output = []
        for article in results:
            try:
                data = article.toDict()
                abstract = data.get("abstract") or "No abstract"
                # NOTE(review): pubmed_id may hold several whitespace-
                # separated ids; the first is assumed to be the article's
                # own -- confirm.
                pmid_tokens = str(data.get("pubmed_id") or "").split()
                pmid = pmid_tokens[0] if pmid_tokens else ""
                author_names = [_pubmed_author_name(a) for a in data.get("authors") or []]
                output.append({
                    "title": data.get("title", "No title"),
                    "snippet": _truncate_snippet(abstract),
                    "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                    "authors": ", ".join(n for n in author_names if n),
                    "published": data.get("publication_date", "Unknown"),
                    "source": "PubMed"
                })
            except Exception as e:
                logger.warning(f"Error processing PubMed result: {e}")
                continue
        return output
    except Exception as e:
        logger.error(f"PubMed search error: {e}")
        return []


def tool_search_wikipedia(query: str, max_results: int = 3) -> list:
    """Search Wikipedia and return result dicts tagged ``source="Wikipedia"``.

    Disambiguation and missing pages are skipped. Returns ``[]`` on any
    other error.
    """
    try:
        search_results = wikipedia.search(query, results=max_results)
        results = []

        for title in search_results:
            try:
                page = wikipedia.page(title)
                results.append({
                    "title": page.title,
                    "snippet": _truncate_snippet(page.summary),
                    "url": page.url,
                    "source": "Wikipedia"
                })
            except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError):
                continue

        return results
    except Exception as e:
        logger.error(f"Wikipedia search error: {e}")
        return []


def tool_search_scholar(query: str, max_results: int = 5) -> list:
    """Search Google Scholar and return result dicts tagged ``source="Google Scholar"``.

    Fetches at most *max_results* items; an item that fails is logged and
    skipped. Returns ``[]`` on any search-level error.
    """
    try:
        search_query = scholarly.search_pubs(query)
        results = []
        for _ in range(max_results):
            try:
                result = next(search_query)
                bib = result.get("bib", {})
                abstract = bib.get("abstract")
                authors = bib.get("author", [])
                results.append({
                    "title": bib.get("title", "No title"),
                    # "..." is appended only when the abstract was truncated.
                    "snippet": _truncate_snippet(abstract) if abstract else "No abstract",
                    "url": result.get("pub_url", "#"),
                    # A plain string must not be joined character by character.
                    "authors": authors if isinstance(authors, str) else ", ".join(authors),
                    "published": bib.get("pub_year", "Unknown"),
                    "source": "Google Scholar"
                })
            except StopIteration:
                break
            except Exception as e:
                logger.warning(f"Error processing Scholar result: {e}")
                continue
        return results
    except Exception as e:
        logger.error(f"Google Scholar search error: {e}")
        return []


def extract_article_content(url: str) -> str:
    """Download *url* and return its extracted main text, or ``""`` on failure."""
    try:
        downloaded = fetch_url(url)
        if downloaded is None:
            return ""
        # Coerce a None extraction result to "" to honor the str return type.
        return extract(downloaded, favor_precision=True) or ""
    except Exception as e:
        logger.error(f"Failed to extract article content from {url}: {e}")
        return ""


def tool_reason(prompt: str, search_results: list, reasoning_context: Optional[list] = None,
                critique: str = "", focus_areas: Optional[list] = None) -> str:
    """Ask the model ensemble to analyze *search_results* with respect to *prompt*.

    Args:
        prompt: The research question.
        search_results: Result dicts with at least ``title`` and ``snippet``;
            ``source``, ``authors`` and ``published`` are used when present.
        reasoning_context: Earlier reasoning texts; only the last
            MAX_HISTORY_ITEMS are included.
        critique: Most recent critique to address, if any.
        focus_areas: Aspects to emphasize.

    Returns:
        The generated analysis, or a fixed fallback message when there are
        no results or generation fails.
    """
    if not search_results:
        return "No search results to reason about."

    reasoning_input = "Reason about the following search results in relation to the prompt:\n\n"
    reasoning_input += f"Prompt: {prompt}\n\n"

    if focus_areas:
        reasoning_input += f"Focus particularly on these aspects: {', '.join(focus_areas)}\n\n"

    # Group results by source, keeping each result's original position so
    # the "Result N" numbering stays global.
    results_by_source = {}
    for i, result in enumerate(search_results):
        source = result.get('source', 'Web Search')
        results_by_source.setdefault(source, []).append((i, result))

    for source, results in results_by_source.items():
        reasoning_input += f"\n--- {source} Results ---\n"
        for i, result in results:
            reasoning_input += f"- Result {i + 1}: Title: {result['title']}\n Snippet: {result['snippet']}\n"
            if 'authors' in result:
                reasoning_input += f" Authors: {result['authors']}\n"
            if 'published' in result:
                reasoning_input += f" Published: {result['published']}\n"
            reasoning_input += "\n"

    if reasoning_context:
        recent_context = reasoning_context[-MAX_HISTORY_ITEMS:]
        reasoning_input += "\nPrevious Reasoning Context:\n" + "\n".join(recent_context)

    if critique:
        reasoning_input += f"\n\nRecent critique to address: {critique}\n"

    reasoning_input += "\nProvide a thorough, nuanced analysis that builds upon previous reasoning if applicable. Consider multiple perspectives, potential contradictions in the search results, and the reliability of different sources. Address any specific critiques."

    reasoning_output = ensemble_inference(reasoning_input)

    if isinstance(reasoning_output, dict) and "generated_text" in reasoning_output:
        return reasoning_output["generated_text"].strip()
    logger.error(f"Failed to generate reasoning: {reasoning_output}")
    return "Could not generate reasoning due to an error."

# A numbered point starts at the beginning of a line ("1. " or "1) ") and
# runs until the next such line or the end of the text. Anchoring on line
# starts keeps decimals such as "3.5" inside a point from ending it early.
_NUMBERED_POINT_RE = re.compile(
    r'^[ \t]*\d+[.)][ \t]+(.*?)(?=^[ \t]*\d+[.)][ \t]|\Z)',
    re.DOTALL | re.MULTILINE,
)

# One list item per line: "1.", "1)", "*" or "-" followed by text.
_LIST_ITEM_RE = re.compile(r'^[ \t]*(?:\d+[.)]|[*\-])[ \t]+(\S.*)$', re.MULTILINE)


def _select_recent_within_budget(items, max_tokens, used_tokens=0):
    """Return the longest suffix of *items* that fits the token budget.

    Walks from newest to oldest and stops at the first item that would
    reach *max_tokens* (counting *used_tokens* already spent). Original
    order is preserved.
    """
    selected = []
    for item in reversed(items):
        item_tokens = get_token_count(item)
        if used_tokens + item_tokens >= max_tokens:
            break
        selected.append(item)
        used_tokens += item_tokens
    selected.reverse()
    return selected


def tool_summarize(insights: list, prompt: str, contradictions: Optional[list] = None) -> str:
    """Ask the model ensemble for a structured summary of *insights*.

    Args:
        insights: Insight texts, oldest first. Only the most recent ones
            that fit a 12000-token budget are used.
        prompt: The research question the summary is about.
        contradictions: Contradictions the summary should address.

    Returns:
        The summary text, or a fixed fallback message when there are no
        insights or generation fails.
    """
    if not insights:
        return "No insights to summarize."
    contradictions = contradictions or []

    summarization_input = f"Synthesize the following insights into a cohesive and comprehensive summary regarding: '{prompt}'\n\n"

    max_tokens = 12000
    # The header and the contradictions count against the budget too.
    used_tokens = get_token_count(summarization_input) + get_token_count("\n\n".join(contradictions))
    selected_insights = _select_recent_within_budget(insights, max_tokens, used_tokens)

    summarization_input += "\n\n".join(selected_insights)

    if contradictions:
        summarization_input += "\n\nAddress these specific contradictions:\n" + "\n".join(contradictions)

    summarization_input += "\n\nProvide a well-structured summary that:\n1. Presents the main findings\n2. Acknowledges limitations and uncertainties\n3. Highlights areas of consensus and disagreement\n4. Suggests potential directions for further inquiry\n5. Evaluates the strength of evidence for key claims"

    summarization_output = ensemble_inference(summarization_input)

    if isinstance(summarization_output, dict) and "generated_text" in summarization_output:
        return summarization_output["generated_text"].strip()
    logger.error(f"Failed to generate summary: {summarization_output}")
    return "Could not generate a summary due to an error."


def tool_generate_search_query(prompt: str, previous_queries: Optional[list] = None,
                               failed_queries: Optional[list] = None,
                               focus_areas: Optional[list] = None) -> str:
    """Ask MAIN_LLM_MODEL for a refined search query for *prompt*.

    Args:
        prompt: The research question.
        previous_queries: Queries already issued; the last
            MAX_HISTORY_ITEMS are shown to the model.
        failed_queries: Queries that produced nothing useful.
        focus_areas: Aspects the query should emphasize.

    Returns:
        The generated query, or ``""`` if generation fails.
    """
    query_gen_input = f"Generate an effective search query for the following prompt: {prompt}\n"

    if previous_queries:
        recent_queries = previous_queries[-MAX_HISTORY_ITEMS:]
        query_gen_input += "Previous search queries:\n" + "\n".join(recent_queries) + "\n"

    if failed_queries:
        query_gen_input += "These queries didn't yield useful results:\n" + "\n".join(failed_queries) + "\n"

    if focus_areas:
        query_gen_input += f"Focus particularly on these aspects: {', '.join(focus_areas)}\n"

    query_gen_input += "Refine the search query based on previous queries, aiming for more precise results. Consider using advanced search operators like site:, filetype:, intitle:, etc. when appropriate. Make sure the query is well-formed for academic and scientific search engines.\n"
    query_gen_input += "Search Query:"

    query_gen_output = hf_inference(MAIN_LLM_MODEL, query_gen_input)

    if isinstance(query_gen_output, dict) and 'generated_text' in query_gen_output:
        return query_gen_output['generated_text'].strip()

    logger.error(f"Failed to generate search query: {query_gen_output}")
    return ""


def tool_critique_reasoning(reasoning_output: str, prompt: str,
                            previous_critiques: Optional[list] = None) -> str:
    """Ask CRITIC_LLM_MODEL to critique *reasoning_output* against *prompt*.

    Args:
        reasoning_output: The reasoning text to evaluate.
        prompt: The research question.
        previous_critiques: Earlier critiques; the last MAX_HISTORY_ITEMS
            are included so the model can check they were addressed.

    Returns:
        The critique text, or a fixed fallback message if generation fails.
    """
    critique_input = f"Critically evaluate the following reasoning output in relation to the prompt:\n\nPrompt: {prompt}\n\nReasoning: {reasoning_output}\n\n"

    if previous_critiques:
        critique_input += "Previous critiques that should be addressed:\n" + "\n".join(previous_critiques[-MAX_HISTORY_ITEMS:]) + "\n\n"

    critique_input += "Identify any flaws, biases, logical fallacies, unsupported claims, or areas for improvement. Be specific and constructive. Suggest concrete ways to enhance the reasoning. Also evaluate the strength of evidence and whether conclusions are proportionate to the available information."

    critique_output = hf_inference(CRITIC_LLM_MODEL, critique_input)

    if isinstance(critique_output, dict) and "generated_text" in critique_output:
        return critique_output["generated_text"].strip()

    logger.error(f"Failed to generate critique: {critique_output}")
    return "Could not generate a critique due to an error."


def tool_identify_contradictions(insights: list) -> list:
    """Ask CRITIC_LLM_MODEL to list contradictions among *insights*.

    Only the most recent insights that fit a 12000-token budget are sent.

    Returns:
        One string per numbered point in the model's answer; ``[]`` when
        fewer than two insights are given, when the model reports no
        contradictions, or when generation fails.
    """
    if len(insights) < 2:
        return []

    selected_insights = _select_recent_within_budget(insights, 12000)

    contradiction_input = "Identify specific contradictions in these insights:\n\n" + "\n\n".join(selected_insights)
    contradiction_input += "\n\nList each contradiction as a separate numbered point. For each contradiction, cite the specific claims that are in tension and evaluate which claim is better supported. If no contradictions exist, respond with 'No contradictions found.'"

    contradiction_output = hf_inference(CRITIC_LLM_MODEL, contradiction_input)

    if isinstance(contradiction_output, dict) and "generated_text" in contradiction_output:
        result = contradiction_output["generated_text"].strip()
        # Tolerate differences in quoting, case and the trailing period.
        if result.strip(" .'\"").lower() == "no contradictions found":
            return []
        contradictions = _NUMBERED_POINT_RE.findall(result)
        return [c.strip() for c in contradictions if c.strip()]

    logger.error(f"Failed to identify contradictions: {contradiction_output}")
    return []


def tool_identify_focus_areas(prompt: str, insights: Optional[list] = None,
                              failed_areas: Optional[list] = None) -> list:
    """Ask MAIN_LLM_MODEL which aspects of *prompt* need further investigation.

    Args:
        prompt: The research question.
        insights: Existing insights; the last five are shown to the model.
        failed_areas: Focus areas that already proved unproductive.

    Returns:
        Up to five focus areas (the first line of each numbered or bulleted
        item in the model's answer); ``[]`` if generation fails.
    """
    focus_input = f"Based on this research prompt: '{prompt}'\n\n"

    if insights:
        recent_insights = insights[-5:]
        focus_input += "And these existing insights:\n" + "\n".join(recent_insights) + "\n\n"

    if failed_areas:
        focus_input += f"These focus areas didn't yield useful results: {', '.join(failed_areas)}\n\n"

    focus_input += "Identify 3-5 specific aspects that should be investigated further to get a complete understanding. Be precise and prioritize underexplored areas. For each suggested area, briefly explain why it's important to investigate."

    focus_output = hf_inference(MAIN_LLM_MODEL, focus_input)

    if isinstance(focus_output, dict) and "generated_text" in focus_output:
        result = focus_output["generated_text"].strip()
        # Matching per line keeps items that are followed by a blank line
        # or by an unmarked continuation line.
        areas = _LIST_ITEM_RE.findall(result)
        return [area.strip() for area in areas if area.strip()][:5]

    logger.error(f"Failed to identify focus areas: {focus_output}")
    return []


def add_to_faiss_index(text: str):
    embedding = document_similarity_model.encode(text, convert_to_tensor=True)
    embedding_np = embedding.cpu().numpy().reshape(1, -1)
    if embedding_np.shape[1] != embedding_dim:
        logger.error(f"Embedding dimension mismatch: expected {embedding_dim}, got {embedding_np.shape[1]}")
        return
    faiss.normalize_L2(embedding_np)  # Normalize for cosine similarity.
    # NOTE(review): the visible excerpt ends here; the remainder of this
    # function lies beyond it and is left untouched.

-index.add(embedding_np) + embedding = document_similarity_model.encode(text, convert_to_tensor=True) + embedding_np = embedding.cpu().numpy().reshape(1, -1) + if embedding_np.shape[1] != embedding_dim: + logger.error(f"Embedding dimension mismatch: expected {embedding_dim}, got {embedding_np.shape[1]}") + return + faiss.normalize_L2(embedding_np) + index.add(embedding_np) def search_faiss_index(query: str, top_k: int = 5) -> List[str]: -query_embedding = document_similarity_model.encode(query, convert_to_tensor=True) -query_embedding_np = query_embedding.cpu().numpy().reshape(1, -1) -faiss.normalize_L2(query_embedding_np) -distances, indices = index.search(query_embedding_np, top_k) -return indices[0].tolist() + query_embedding = document_similarity_model.encode(query, convert_to_tensor=True) + query_embedding_np = query_embedding.cpu().numpy().reshape(1, -1) + faiss.normalize_L2(query_embedding_np) + distances, indices = index.search(query_embedding_np, top_k) + return indices[0].tolist() def filter_results(search_results, prompt, previous_snippets=None): -if not main_similarity_model or not search_results: -return search_results + if not main_similarity_model or not search_results: + return search_results -try: - prompt_embedding = main_similarity_model.encode(prompt, convert_to_tensor=True) - filtered_results = [] + try: + prompt_embedding = main_similarity_model.encode(prompt, convert_to_tensor=True) + filtered_results = [] - seen_snippets = set() - if previous_snippets: - seen_snippets.update(previous_snippets) + seen_snippets = set() + if previous_snippets: + seen_snippets.update(previous_snippets) - for result in search_results: - combined_text = result['title'] + " " + result['snippet'] + for result in search_results: + combined_text = result['title'] + " " + result['snippet'] - if result['snippet'] in seen_snippets: # Prevent exact duplicates - continue + if result['snippet'] in seen_snippets: + continue - result_embedding = 
main_similarity_model.encode(combined_text, convert_to_tensor=True) - cosine_score = util.pytorch_cos_sim(prompt_embedding, result_embedding)[0][0].item() - - if cosine_score >= SIMILARITY_THRESHOLD: - result['relevance_score'] = cosine_score - filtered_results.append(result) - seen_snippets.add(result['snippet']) # Add snippets after filtering - add_to_faiss_index(result['snippet']) + result_embedding = main_similarity_model.encode(combined_text, convert_to_tensor=True) + cosine_score = util.pytorch_cos_sim(prompt_embedding, result_embedding)[0][0].item() + + if cosine_score >= SIMILARITY_THRESHOLD: + result['relevance_score'] = cosine_score + filtered_results.append(result) + seen_snippets.add(result['snippet']) + add_to_faiss_index(result['snippet']) - filtered_results.sort(key=lambda x: x.get('relevance_score', 0), reverse=True) # Sort by relevance. - return filtered_results + filtered_results.sort(key=lambda x: x.get('relevance_score', 0), reverse=True) + return filtered_results -except Exception as e: - logger.error(f"Error during filtering: {e}") - return search_results # Return original results on error. -content_copy -download - Use code with caution. + except Exception as e: + logger.error(f"Error during filtering: {e}") + return search_results def tool_extract_key_entities(prompt: str) -> list: -entity_input = f"Extract the key entities (people, organizations, concepts, technologies, events, time periods, locations, etc.) from this research prompt that should be investigated individually:\n\n{prompt}\n\nList the 5-7 most important entities, one per line, with a brief explanation (2-3 sentences) of why each is central to the research question." + entity_input = f"Extract the key entities (people, organizations, concepts, technologies, events, time periods, locations, etc.) 
from this research prompt that should be investigated individually:\n\n{prompt}\n\nList the 5-7 most important entities, one per line, with a brief explanation (2-3 sentences) of why each is central to the research question." -entity_output = hf_inference(MAIN_LLM_MODEL, entity_input) + entity_output = hf_inference(MAIN_LLM_MODEL, entity_input) -if isinstance(entity_output, dict) and "generated_text" in entity_output: - result = entity_output["generated_text"].strip() - entities = [e.strip() for e in result.split('\n') if e.strip()] - return entities[:7] # Limit to top 7 entities + if isinstance(entity_output, dict) and "generated_text" in entity_output: + result = entity_output["generated_text"].strip() + entities = [e.strip() for e in result.split('\n') if e.strip()] + return entities[:7] -logger.error(f"Failed to extract key entities: {entity_output}") -return [] -content_copy -download - Use code with caution. + logger.error(f"Failed to extract key entities: {entity_output}") + return [] def tool_meta_analyze(entity_insights: Dict[str, list], prompt: str) -> str: -if not entity_insights: -return "No entity insights to analyze." + if not entity_insights: + return "No entity insights to analyze." -meta_input = f"Perform a meta-analysis across these different entities related to the prompt: '{prompt}'\n\n" + meta_input = f"Perform a meta-analysis across these different entities related to the prompt: '{prompt}'\n\n" -for entity, insights in entity_insights.items(): - if insights: - meta_input += f"\n--- {entity} ---\n" + insights[-1] + "\n" # Most recent insight for each entity + for entity, insights in entity_insights.items(): + if insights: + meta_input += f"\n--- {entity} ---\n" + insights[-1] + "\n" -meta_input += "\nProvide a high-level synthesis that identifies:\n1. Common themes across entities\n2. Important differences and contradictions\n3. How these entities interact or influence each other\n4. 
The broader implications for the original research question\n5. A systems-level understanding of how these elements fit together" + meta_input += "\nProvide a high-level synthesis that identifies:\n1. Common themes across entities\n2. Important differences and contradictions\n3. How these entities interact or influence each other\n4. The broader implications for the original research question\n5. A systems-level understanding of how these elements fit together" -meta_output = ensemble_inference(meta_input) # Ensemble for meta-analysis + meta_output = ensemble_inference(meta_input) -if isinstance(meta_output, dict) and "generated_text" in meta_output: - return meta_output["generated_text"].strip() + if isinstance(meta_output, dict) and "generated_text" in meta_output: + return meta_output["generated_text"].strip() -logger.error(f"Failed to perform meta-analysis: {meta_output}") -return "Could not generate a meta-analysis due to an error." -content_copy -download - Use code with caution. + logger.error(f"Failed to perform meta-analysis: {meta_output}") + return "Could not generate a meta-analysis due to an error." def tool_draft_research_plan(prompt: str, entities: list, focus_areas: list = []) -> str: -plan_input = f"Create a detailed research plan for investigating this question: '{prompt}'\n\n" + plan_input = f"Create a detailed research plan for investigating this question: '{prompt}'\n\n" -if entities: - plan_input += "Key entities to investigate:\n" + "\n".join(entities) + "\n\n" + if entities: + plan_input += "Key entities to investigate:\n" + "\n".join(entities) + "\n\n" -if focus_areas: - plan_input += "Additional focus areas:\n" + "\n".join(focus_areas) + "\n\n" + if focus_areas: + plan_input += "Additional focus areas:\n" + "\n".join(focus_areas) + "\n\n" -plan_input += "The research plan should include:\n" -plan_input += "1. Main research questions and sub-questions\n" -plan_input += "2. Methodology for investigating each aspect\n" -plan_input += "3. 
Potential sources and databases to consult\n" -plan_input += "4. Suggested sequence of investigation\n" -plan_input += "5. Potential challenges and how to address them\n" -plan_input += "6. Criteria for evaluating the quality of findings" + plan_input += "The research plan should include:\n" + plan_input += "1. Main research questions and sub-questions\n" + plan_input += "2. Methodology for investigating each aspect\n" + plan_input += "3. Potential sources and databases to consult\n" + plan_input += "4. Suggested sequence of investigation\n" + plan_input += "5. Potential challenges and how to address them\n" + plan_input += "6. Criteria for evaluating the quality of findings" -plan_output = hf_inference(REASONING_LLM_MODEL, plan_input) # Use reasoning model + plan_output = hf_inference(REASONING_LLM_MODEL, plan_input) -if isinstance(plan_output, dict) and "generated_text" in plan_output: - return plan_output["generated_text"].strip() + if isinstance(plan_output, dict) and "generated_text" in plan_output: + return plan_output["generated_text"].strip() -logger.error(f"Failed to generate research plan: {plan_output}") -return "Could not generate a research plan due to an error." -content_copy -download - Use code with caution. + logger.error(f"Failed to generate research plan: {plan_output}") + return "Could not generate a research plan due to an error." 
def tool_extract_article(url: str) -> str: -extracted_text = extract_article_content(url) -return extracted_text if extracted_text else f"Could not extract content from {url}" - -New tool for summarizing a single paper + extracted_text = extract_article_content(url) + return extracted_text if extracted_text else f"Could not extract content from {url}" def tool_summarize_paper(paper_text: str) -> str: -summarization_prompt = f"""Summarize this academic paper, focusing on the following: + summarization_prompt = f"""Summarize this academic paper, focusing on the following: Main Research Question(s): What questions does the paper address? @@ -682,226 +630,217 @@ Limitations: What are the acknowledged limitations of the study? Implications: What are the broader implications of the findings, according to the authors? Paper Text: {paper_text[:MAX_FULL_TEXT_LENGTH]} -""" # Truncate if necessary -summary = hf_inference(REASONING_LLM_MODEL, summarization_prompt, max_tokens=500) +""" + summary = hf_inference(REASONING_LLM_MODEL, summarization_prompt, max_tokens=500) -if isinstance(summary, dict) and "generated_text" in summary: -return summary["generated_text"].strip() -else: -logger.error(f"Failed to generate summary: {summary}") -return "Could not generate a summary due to an error." + if isinstance(summary, dict) and "generated_text" in summary: + return summary["generated_text"].strip() + else: + logger.error(f"Failed to generate summary: {summary}") + return "Could not generate a summary due to an error." def tool_search_patents(query: str, max_results: int = 10) -> list: - """Search patent databases including USPTO and EPO""" - # Implementation details... + pass def tool_search_clinical_trials(query: str, max_results: int = 10) -> list: - """Search ClinicalTrials.gov and WHO ICTRP""" - # Implementation details... 
+ pass def tool_search_datasets(query: str, max_results: int = 10) -> list: - """Search academic datasets from repositories like Kaggle, UCI, etc.""" - # Implementation details... + pass def tool_search_conferences(query: str, max_results: int = 10) -> list: - """Search major conference proceedings""" - # Implementation details... + pass tools = { -"search_web": { -"function": tool_search_web, -"description": "Searches the web for information.", -"parameters": { -"query": {"type": "string", "description": "The search query."}, -"num_results": {"type": "integer", "description": "Number of results to return."}, -"time_filter": {"type": "string", "description": "Optional time filter (d, w, m, y)."}, -"region": {"type": "string", "description": "Optional region code."}, -"language": {"type": "string", "description": "Optional language code."} -}, -}, -"search_arxiv": { -"function": tool_search_arxiv, -"description": "Searches arXiv for scientific papers.", -"parameters": { -"query": {"type": "string", "description": "The search query for scientific papers."}, -"max_results": {"type": "integer", "description": "Maximum number of papers to return."} -}, -}, -"search_pubmed": { -"function": tool_search_pubmed, -"description": "Searches PubMed for medical and scientific literature.", -"parameters": { -"query": {"type": "string", "description": "The search query for medical literature."}, -"max_results": {"type": "integer", "description": "Maximum number of articles to return."} -}, -}, -"search_wikipedia": { -"function": tool_search_wikipedia, -"description": "Searches Wikipedia for information.", -"parameters": { -"query": {"type": "string", "description": "The search query for Wikipedia."}, -"max_results": {"type": "integer", "description": "Maximum number of articles to return."} -}, -}, -"search_scholar": { -"function": tool_search_scholar, -"description": "Searches Google Scholar for academic publications.", -"parameters": { -"query": {"type": "string", "description": 
"The search query for Google Scholar."}, -"max_results": {"type": "integer", "description": "Maximum number of articles to return."} -} -}, -"extract_article": { -"function": tool_extract_article, -"description": "Extracts the main content from a web article URL", -"parameters": { -"url": {"type": "string", "description": "The URL of the article to extract"} -}, -}, -"summarize_paper": { -"function": tool_summarize_paper, -"description": "Summarizes the content of an academic paper.", -"parameters": { -"paper_text": {"type": "string", "description": "The full text of the paper to be summarized."}, -}, -}, -"reason": { -"function": tool_reason, -"description": "Analyzes and reasons about information.", -"parameters": { -"prompt": {"type": "string", "description": "The original prompt."}, -"search_results": {"type": "array", "description": "Search results to analyze."}, -"reasoning_context": {"type": "array", "description": "Previous reasoning outputs."}, -"critique": {"type": "string", "description": "Recent critique to address."}, -"focus_areas": {"type": "array", "description": "Specific aspects to focus on."} -}, -}, -"summarize": { -"function": tool_summarize, -"description": "Synthesizes insights into a cohesive summary.", -"parameters": { -"insights": {"type": "array", "description": "Insights to summarize."}, -"prompt": {"type": "string", "description": "The original research prompt."}, -"contradictions": {"type": "array", "description": "Specific contradictions to address."} -}, -}, -"generate_search_query": { -"function": tool_generate_search_query, -"description": "Generates an optimized search query", -"parameters":{ -"prompt": {"type": "string", "description": "The original user prompt."}, -"previous_queries": {"type": "array", "description": "Previously used search queries."}, -"failed_queries": {"type": "array", "description": "Queries that didn't yield good results."}, -"focus_areas": {"type": "array", "description": "Specific aspects to focus on."} 
-} -}, -"critique_reasoning": { -"function": tool_critique_reasoning, -"description": "Critically evaluates reasoning output.", -"parameters": { -"reasoning_output": {"type": "string", "description": "The reasoning output to critique."}, -"prompt": {"type": "string", "description": "The original prompt."}, -"previous_critiques": {"type": "array", "description": "Previous critique outputs."} -}, -}, -"identify_contradictions": { -"function": tool_identify_contradictions, -"description": "Identifies contradictions across multiple insights.", -"parameters": { -"insights": {"type": "array", "description": "Collection of insights to analyze for contradictions."}, -}, -}, -"identify_focus_areas": { -"function": tool_identify_focus_areas, -"description": "Identifies specific aspects that need further investigation.", -"parameters": { -"prompt": {"type": "string", "description": "The original research prompt."}, -"insights": {"type": "array", "description": "Existing insights to build upon."}, -"failed_areas": {"type": "array", "description": "Previously tried areas that yielded poor results."} -}, -}, -"extract_key_entities": { -"function": tool_extract_key_entities, -"description": "Extracts key entities from the prompt for focused research.", -"parameters": { -"prompt": {"type": "string", "description": "The original research prompt."} -}, -}, -"meta_analyze": { -"function": tool_meta_analyze, -"description": "Performs meta-analysis across entity-specific insights.", -"parameters": { -"entity_insights": {"type": "object", "description": "Dictionary mapping entities to their insights."}, -"prompt": {"type": "string", "description": "The original research prompt."} -}, -}, -"draft_research_plan": { -"function": tool_draft_research_plan, -"description": "Creates a detailed research plan.", -"parameters": { -"prompt": {"type": "string", "description": "The research question/prompt."}, -"entities": {"type": "array", "description": "Key entities to investigate."}, 
-"focus_areas": {"type": "array", "description": "Additional areas to focus on."} -} -}, -"search_patents": { -"function": tool_search_patents, -"description": "Searches patent databases globally", -"parameters": { -"query": {"type": "string", "description": "Patent search query"}, -"max_results": {"type": "integer", "description": "Maximum number of patents to return"} -} -}, -"search_clinical_trials": { -"function": tool_search_clinical_trials, -"description": "Search ClinicalTrials.gov and WHO ICTRP", -"parameters": { -"query": {"type": "string", "description": "Search query for ClinicalTrials.gov and WHO ICTRP"}, -"max_results": {"type": "integer", "description": "Maximum number of results to return"} -} -}, -"search_datasets": { -"function": tool_search_datasets, -"description": "Search academic datasets from repositories like Kaggle, UCI, etc.", -"parameters": { -"query": {"type": "string", "description": "Search query for academic datasets"}, -"max_results": {"type": "integer", "description": "Maximum number of results to return"} -} -}, -"search_conferences": { -"function": tool_search_conferences, -"description": "Search major conference proceedings", -"parameters": { -"query": {"type": "string", "description": "Search query for conference proceedings"}, -"max_results": {"type": "integer", "description": "Maximum number of results to return"} -} -} -} + "search_web": { + "function": tool_search_web, + "description": "Searches the web for information.", + "parameters": { + "query": {"type": "string", "description": "The search query."}, + "num_results": {"type": "integer", "description": "Number of results to return."}, + "time_filter": {"type": "string", "description": "Optional time filter (d, w, m, y)."}, + "region": {"type": "string", "description": "Optional region code."}, + "language": {"type": "string", "description": "Optional language code."} + }, + }, + "search_arxiv": { + "function": tool_search_arxiv, + "description": "Searches arXiv for 
scientific papers.", + "parameters": { + "query": {"type": "string", "description": "The search query for scientific papers."}, + "max_results": {"type": "integer", "description": "Maximum number of papers to return."} + }, + }, + "search_pubmed": { + "function": tool_search_pubmed, + "description": "Searches PubMed for medical and scientific literature.", + "parameters": { + "query": {"type": "string", "description": "The search query for medical literature."}, + "max_results": {"type": "integer", "description": "Maximum number of articles to return."} + }, + }, + "search_wikipedia": { + "function": tool_search_wikipedia, + "description": "Searches Wikipedia for information.", + "parameters": { + "query": {"type": "string", "description": "The search query for Wikipedia."}, + "max_results": {"type": "integer", "description": "Maximum number of articles to return."} + }, + }, + "search_scholar": { + "function": tool_search_scholar, + "description": "Searches Google Scholar for academic publications.", + "parameters": { + "query": {"type": "string", "description": "The search query for Google Scholar."}, + "max_results": {"type": "integer", "description": "Maximum number of articles to return."} + } + }, + "extract_article": { + "function": tool_extract_article, + "description": "Extracts the main content from a web article URL", + "parameters": { + "url": {"type": "string", "description": "The URL of the article to extract"} + }, + }, + "summarize_paper": { + "function": tool_summarize_paper, + "description": "Summarizes the content of an academic paper.", + "parameters": { + "paper_text": {"type": "string", "description": "The full text of the paper to be summarized."}, + }, + }, + "reason": { + "function": tool_reason, + "description": "Analyzes and reasons about information.", + "parameters": { + "prompt": {"type": "string", "description": "The original prompt."}, + "search_results": {"type": "array", "description": "Search results to analyze."}, + 
"reasoning_context": {"type": "array", "description": "Previous reasoning outputs."}, + "critique": {"type": "string", "description": "Recent critique to address."}, + "focus_areas": {"type": "array", "description": "Specific aspects to focus on."} + }, + }, + "summarize": { + "function": tool_summarize, + "description": "Synthesizes insights into a cohesive summary.", + "parameters": { + "insights": {"type": "array", "description": "Insights to summarize."}, + "prompt": {"type": "string", "description": "The original research prompt."}, + "contradictions": {"type": "array", "description": "Specific contradictions to address."} + }, + }, + "generate_search_query": { + "function": tool_generate_search_query, + "description": "Generates an optimized search query", + "parameters":{ + "prompt": {"type": "string", "description": "The original user prompt."}, + "previous_queries": {"type": "array", "description": "Previously used search queries."}, + "failed_queries": {"type": "array", "description": "Queries that didn't yield good results."}, + "focus_areas": {"type": "array", "description": "Specific aspects to focus on."} + } + }, + "critique_reasoning": { + "function": tool_critique_reasoning, + "description": "Critically evaluates reasoning output.", + "parameters": { + "reasoning_output": {"type": "string", "description": "The reasoning output to critique."}, + "prompt": {"type": "string", "description": "The original prompt."}, + "previous_critiques": {"type": "array", "description": "Previous critique outputs."} + }, + }, + "identify_contradictions": { + "function": tool_identify_contradictions, + "description": "Identifies contradictions across multiple insights.", + "parameters": { + "insights": {"type": "array", "description": "Collection of insights to analyze for contradictions."}, + }, + }, + "identify_focus_areas": { + "function": tool_identify_focus_areas, + "description": "Identifies specific aspects that need further investigation.", + "parameters": { + 
"prompt": {"type": "string", "description": "The original research prompt."}, + "insights": {"type": "array", "description": "Existing insights to build upon."}, + "failed_areas": {"type": "array", "description": "Previously tried areas that yielded poor results."} + }, + }, + "extract_key_entities": { + "function": tool_extract_key_entities, + "description": "Extracts key entities from the prompt for focused research.", + "parameters": { + "prompt": {"type": "string", "description": "The original research prompt."} + }, + }, + "meta_analyze": { + "function": tool_meta_analyze, + "description": "Performs meta-analysis across entity-specific insights.", + "parameters": { + "entity_insights": {"type": "object", "description": "Dictionary mapping entities to their insights."}, + "prompt": {"type": "string", "description": "The original research prompt."} + }, + }, + "draft_research_plan": { + "function": tool_draft_research_plan, + "description": "Creates a detailed research plan.", + "parameters": { + "prompt": {"type": "string", "description": "The research question/prompt."}, + "entities": {"type": "array", "description": "Key entities to investigate."}, + "focus_areas": {"type": "array", "description": "Additional areas to focus on."} + } + }, + "search_patents": { + "function": tool_search_patents, + "description": "Searches patent databases globally", + "parameters": { + "query": {"type": "string", "description": "Patent search query"}, + "max_results": {"type": "integer", "description": "Maximum number of patents to return"} + } + }, + "search_clinical_trials": { + "function": tool_search_clinical_trials, + "description": "Search ClinicalTrials.gov and WHO ICTRP", + "parameters": { + "query": {"type": "string", "description": "Search query for ClinicalTrials.gov and WHO ICTRP"}, + "max_results": {"type": "integer", "description": "Maximum number of results to return"} + } + }, + "search_datasets": { + "function": tool_search_datasets, + "description": "Search 
academic datasets from repositories like Kaggle, UCI, etc.", + "parameters": { + "query": {"type": "string", "description": "Search query for academic datasets"}, + "max_results": {"type": "integer", "description": "Maximum number of results to return"} + } + }, + "search_conferences": { + "function": tool_search_conferences, + "description": "Search major conference proceedings", + "parameters": { + "query": {"type": "string", "description": "Search query for conference proceedings"}, + "max_results": {"type": "integer", "description": "Maximum number of results to return"} + } + } } def create_prompt(task_description, user_input, available_tools, context): -prompt = f"""{task_description} + prompt = f"""{task_description} User Input: {user_input} Available Tools: """ -for tool_name, tool_data in available_tools.items(): -prompt += f"- {tool_name}: {tool_data['description']}\n" -prompt += " Parameters:\n" -for param_name, param_data in tool_data["parameters"].items(): -prompt += f" - {param_name} ({param_data['type']}): {param_data['description']}\n" - -recent_context = context[-MAX_CONTEXT_ITEMS:] if len(context) > MAX_CONTEXT_ITEMS else context + for tool_name, tool_data in available_tools.items(): + prompt += f"- {tool_name}: {tool_data['description']}\n" + prompt += " Parameters:\n" + for param_name, param_data in tool_data["parameters"].items(): + prompt += f" - {param_name} ({param_data['type']}): {param_data['description']}\n" -prompt += "\nContext (most recent items):\n" -for item in recent_context: - prompt += f"- {item}\n" + recent_context = context[-MAX_CONTEXT_ITEMS:] if len(context) > MAX_CONTEXT_ITEMS else context -prompt += """ -content_copy -download - Use code with caution. + prompt += "\nContext (most recent items):\n" + for item in recent_context: + prompt += f"- {item}\n" + prompt += """ Instructions: Select the BEST tool and parameters for the current research stage. Output valid JSON. If no tool is appropriate, respond with {}. 
Only use provided tools. Be strategic about which tool to use next based on the research progress so far. @@ -922,148 +861,145 @@ Example: {"tool": "search_web", "parameters": {"query": "Eiffel Tower location"}} Output: """ -return prompt + return prompt def deep_research(prompt): -task_description = "You are an advanced research assistant, designed to be as comprehensive as possible. Use available tools iteratively, focus on different aspects, explore promising leads thoroughly, critically evaluate your findings, and build up a comprehensive understanding of the research topic. Utilize the FAISS index to avoid redundant searches and to build a persistent knowledge base." -research_data = load_research_data() -paper_summaries = load_paper_summaries() # Load paper summaries - -context = research_data.get('context', []) -all_insights = research_data.get('all_insights', []) -entity_specific_insights = research_data.get('entity_specific_insights', {}) -intermediate_output = "" -previous_queries = research_data.get('previous_queries', []) -failed_queries = research_data.get('failed_queries', []) -reasoning_context = research_data.get('reasoning_context', []) -previous_critiques = research_data.get('previous_critiques', []) -focus_areas = research_data.get('focus_areas', []) -failed_areas = research_data.get('failed_areas', []) -seen_snippets = set(research_data.get('seen_snippets', [])) -contradictions = research_data.get('contradictions', []) -research_session_id = research_data.get('research_session_id', str(uuid4())) - -global index -if research_data: - logger.info("Restoring FAISS Index from loaded data.") -else: - index.reset() - logger.info("Initialized a fresh FAISS Index") - -key_entities_with_descriptions = tool_extract_key_entities(prompt=prompt) -key_entities = [e.split(":")[0].strip() for e in key_entities_with_descriptions] # Extract just entity names -if key_entities: - context.append(f"Identified key entities: {key_entities}") - intermediate_output += 
f"Identified key entities for focused research: {key_entities_with_descriptions}\n" - yield "Identifying key entities... (Completed)" - -# Initialize progress tracking for each entity. -entity_progress = {entity: {'queries': [], 'insights': []} for entity in key_entities} -entity_progress['general'] = {'queries': [], 'insights': []} # For general, non-entity-specific searches -for entity in key_entities + ['general']: - if entity in research_data: # Load existing progress - entity_progress[entity]['queries'] = research_data[entity]['queries'] - entity_progress[entity]['insights'] = research_data[entity]['insights'] - -if not focus_areas: # Corrected placement: outside the loop - initial_focus_areas = tool_identify_focus_areas(prompt=prompt) - yield "Identifying initial focus areas...(Completed)" - research_plan = tool_draft_research_plan(prompt=prompt, entities=key_entities, focus_areas=initial_focus_areas) - yield "Drafting initial research plan...(Completed)" - context.append(f"Initial Research Plan: {research_plan[:200]}...") # Add plan to context - intermediate_output += f"Initial Research Plan:\n{research_plan}\n\n" - focus_areas = initial_focus_areas - - -for i in range(MAX_ITERATIONS): - # Entity-focused iteration strategy - if key_entities and i > 0: # Cycle through entities *after* initial setup - entities_to_process = key_entities + ['general'] # Include 'general' for broad searches - current_entity = entities_to_process[i % len(entities_to_process)] + task_description = "You are an advanced research assistant, designed to be as comprehensive as possible. Use available tools iteratively, focus on different aspects, explore promising leads thoroughly, critically evaluate your findings, and build up a comprehensive understanding of the research topic. Utilize the FAISS index to avoid redundant searches and to build a persistent knowledge base." 
+ research_data = load_research_data() + paper_summaries = load_paper_summaries() + + context = research_data.get('context', []) + all_insights = research_data.get('all_insights', []) + entity_specific_insights = research_data.get('entity_specific_insights', {}) + intermediate_output = "" + previous_queries = research_data.get('previous_queries', []) + failed_queries = research_data.get('failed_queries', []) + reasoning_context = research_data.get('reasoning_context', []) + previous_critiques = research_data.get('previous_critiques', []) + focus_areas = research_data.get('focus_areas', []) + failed_areas = research_data.get('failed_areas', []) + seen_snippets = set(research_data.get('seen_snippets', [])) + contradictions = research_data.get('contradictions', []) + research_session_id = research_data.get('research_session_id', str(uuid4())) + + global index + if research_data: + logger.info("Restoring FAISS Index from loaded data.") else: - current_entity = 'general' # Start with general research. - - context.append(f"Current focus: {current_entity}") - - # FAISS Retrieval - if i > 0: # Use FAISS *after* the first iteration (once we have data) - faiss_results_indices = search_faiss_index(prompt if current_entity == 'general' else f"{prompt} {current_entity}") - faiss_context = [] - for idx in faiss_results_indices: - if idx < len(all_insights): # Check index bounds - faiss_context.append(f"Previously found insight: {all_insights[idx]}") - if faiss_context: - context.extend(faiss_context) # Add FAISS context - intermediate_output += f"Iteration {i+1} - Retrieved {len(faiss_context)} relevant items from FAISS index.\n" - - - if i == 0: #Initial broad search - initial_query = tool_generate_search_query(prompt=prompt) - yield f"Generating initial search query... 
(Iteration {i+1})" - if initial_query: - previous_queries.append(initial_query) - entity_progress['general']['queries'].append(initial_query) - - with ThreadPoolExecutor(max_workers=5) as executor: - futures = [ - executor.submit(tool_search_web, query=initial_query, num_results=NUM_RESULTS), - executor.submit(tool_search_arxiv, query=initial_query, max_results=5), - executor.submit(tool_search_pubmed, query=initial_query, max_results=5), - executor.submit(tool_search_wikipedia, query=initial_query, max_results=3), - executor.submit(tool_search_scholar, query=initial_query, max_results=5) - ] - - search_results = [] - for future in as_completed(futures): - search_results.extend(future.result()) - yield f"Performing initial searches... (Iteration {i+1})" - - filtered_search_results = filter_results(search_results, prompt) - - if filtered_search_results: - context.append(f"Initial Search Results: {len(filtered_search_results)} items found") - reasoning_output = tool_reason(prompt, filtered_search_results) - yield f"Reasoning about initial search results... (Iteration {i+1})" - if reasoning_output: - all_insights.append(reasoning_output) - entity_progress['general']['insights'].append(reasoning_output) - reasoning_context.append(reasoning_output) - context.append(f"Initial Reasoning: {reasoning_output[:200]}...") - add_to_faiss_index(reasoning_output) - else: - failed_queries.append(initial_query) - context.append(f"Initial query yielded no relevant results: {initial_query}") - - elif current_entity != 'general': - entity_query = tool_generate_search_query( - prompt=f"{prompt} focusing specifically on {current_entity}", - previous_queries=entity_progress[current_entity]['queries'], - focus_areas=focus_areas - ) - yield f"Generating search query for entity: {current_entity}... 
(Iteration {i+1})" - - if entity_query: - previous_queries.append(entity_query) - entity_progress[current_entity]['queries'].append(entity_query) - + index.reset() + logger.info("Initialized a fresh FAISS Index") + + key_entities_with_descriptions = tool_extract_key_entities(prompt=prompt) + key_entities = [e.split(":")[0].strip() for e in key_entities_with_descriptions] + if key_entities: + context.append(f"Identified key entities: {key_entities}") + intermediate_output += f"Identified key entities for focused research: {key_entities_with_descriptions}\n" + yield "Identifying key entities... (Completed)" + + entity_progress = {entity: {'queries': [], 'insights': []} for entity in key_entities} + entity_progress['general'] = {'queries': [], 'insights': []} + for entity in key_entities + ['general']: + if entity in research_data: + entity_progress[entity]['queries'] = research_data[entity]['queries'] + entity_progress[entity]['insights'] = research_data[entity]['insights'] + + if not focus_areas: + initial_focus_areas = tool_identify_focus_areas(prompt=prompt) + yield "Identifying initial focus areas...(Completed)" + research_plan = tool_draft_research_plan(prompt=prompt, entities=key_entities, focus_areas=initial_focus_areas) + yield "Drafting initial research plan...(Completed)" + context.append(f"Initial Research Plan: {research_plan[:200]}...") + intermediate_output += f"Initial Research Plan:\n{research_plan}\n\n" + focus_areas = initial_focus_areas + + + for i in range(MAX_ITERATIONS): + if key_entities and i > 0: + entities_to_process = key_entities + ['general'] + current_entity = entities_to_process[i % len(entities_to_process)] + else: + current_entity = 'general' - with ThreadPoolExecutor(max_workers=5) as executor: - futures = [ - executor.submit(tool_search_web, query=entity_query, num_results=NUM_RESULTS//2), - executor.submit(tool_search_arxiv, query=entity_query, max_results=3), - executor.submit(tool_search_pubmed, query=entity_query, 
max_results=3), - executor.submit(tool_search_wikipedia, query=entity_query, max_results=2), - executor.submit(tool_search_scholar, query=entity_query, max_results=3) - ] + context.append(f"Current focus: {current_entity}") + + if i > 0: + faiss_results_indices = search_faiss_index(prompt if current_entity == 'general' else f"{prompt} {current_entity}") + faiss_context = [] + for idx in faiss_results_indices: + if idx < len(all_insights): + faiss_context.append(f"Previously found insight: {all_insights[idx]}") + if faiss_context: + context.extend(faiss_context) + intermediate_output += f"Iteration {i+1} - Retrieved {len(faiss_context)} relevant items from FAISS index.\n" + - search_results = [] - for future in as_completed(futures): - search_results.extend(future.result()) + if i == 0: + initial_query = tool_generate_search_query(prompt=prompt) + yield f"Generating initial search query... (Iteration {i+1})" + if initial_query: + previous_queries.append(initial_query) + entity_progress['general']['queries'].append(initial_query) + + with ThreadPoolExecutor(max_workers=5) as executor: + futures = [ + executor.submit(tool_search_web, query=initial_query, num_results=NUM_RESULTS), + executor.submit(tool_search_arxiv, query=initial_query, max_results=5), + executor.submit(tool_search_pubmed, query=initial_query, max_results=5), + executor.submit(tool_search_wikipedia, query=initial_query, max_results=3), + executor.submit(tool_search_scholar, query=initial_query, max_results=5) + ] + + search_results = [] + for future in as_completed(futures): + search_results.extend(future.result()) + yield f"Performing initial searches... (Iteration {i+1})" + + filtered_search_results = filter_results(search_results, prompt) + + if filtered_search_results: + context.append(f"Initial Search Results: {len(filtered_search_results)} items found") + reasoning_output = tool_reason(prompt, filtered_search_results) + yield f"Reasoning about initial search results... 
(Iteration {i+1})" + if reasoning_output: + all_insights.append(reasoning_output) + entity_progress['general']['insights'].append(reasoning_output) + reasoning_context.append(reasoning_output) + context.append(f"Initial Reasoning: {reasoning_output[:200]}...") + add_to_faiss_index(reasoning_output) + else: + failed_queries.append(initial_query) + context.append(f"Initial query yielded no relevant results: {initial_query}") + + elif current_entity != 'general': + entity_query = tool_generate_search_query( + prompt=f"{prompt} focusing specifically on {current_entity}", + previous_queries=entity_progress[current_entity]['queries'], + focus_areas=focus_areas + ) + yield f"Generating search query for entity: {current_entity}... (Iteration {i+1})" + + if entity_query: + previous_queries.append(entity_query) + entity_progress[current_entity]['queries'].append(entity_query) + + + with ThreadPoolExecutor(max_workers=5) as executor: + futures = [ + executor.submit(tool_search_web, query=entity_query, num_results=NUM_RESULTS//2), + executor.submit(tool_search_arxiv, query=entity_query, max_results=3), + executor.submit(tool_search_pubmed, query=entity_query, max_results=3), + executor.submit(tool_search_wikipedia, query=entity_query, max_results=2), + executor.submit(tool_search_scholar, query=entity_query, max_results=3) + ] + + search_results = [] + for future in as_completed(futures): + search_results.extend(future.result()) yield f"Searching for information on entity: {current_entity}... 
(Iteration {i+1})" filtered_search_results = filter_results(search_results, - f"{prompt} {current_entity}", - previous_snippets=seen_snippets) # Pass existing snippets + f"{prompt} {current_entity}", + previous_snippets=seen_snippets) if filtered_search_results: context.append(f"Entity Search for {current_entity}: {len(filtered_search_results)} results") @@ -1071,7 +1007,7 @@ for i in range(MAX_ITERATIONS): entity_reasoning = tool_reason( prompt=f"{prompt} focusing on {current_entity}", search_results=filtered_search_results, - reasoning_context=entity_progress[current_entity]['insights'], # Use entity-specific context + reasoning_context=entity_progress[current_entity]['insights'], focus_areas=focus_areas ) yield f"Reasoning about entity: {current_entity}... (Iteration {i+1})" @@ -1090,264 +1026,253 @@ for i in range(MAX_ITERATIONS): failed_queries.append(entity_query) context.append(f"Entity query for {current_entity} yielded no relevant results") - llm_prompt = create_prompt(task_description, prompt, tools, context) - llm_response = hf_inference(MAIN_LLM_MODEL, llm_prompt, stream=True) # Use streaming + llm_prompt = create_prompt(task_description, prompt, tools, context) + llm_response = hf_inference(MAIN_LLM_MODEL, llm_prompt, stream=True) - if isinstance(llm_response, dict) and "error" in llm_response: - intermediate_output += f"LLM Error: {llm_response['error']}\n" - yield f"LLM Error (Iteration {i+1}): {llm_response['error']}" # Display error in output - continue + if isinstance(llm_response, dict) and "error" in llm_response: + intermediate_output += f"LLM Error: {llm_response['error']}\n" + yield f"LLM Error (Iteration {i+1}): {llm_response['error']}" + continue - # Process streaming response - response_text = "" - try: - for chunk in llm_response: - if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content: - response_text += chunk.choices[0].delta.content - yield f"Iteration {i+1} - Thinking... 
{response_text}" # Real time output + response_text = "" + try: + for chunk in llm_response: + if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content: + response_text += chunk.choices[0].delta.content + yield f"Iteration {i+1} - Thinking... {response_text}" + + except Exception as e: + intermediate_output += f"Streaming Error: {str(e)}\n" + yield f"Streaming Error (Iteration {i+1}): {str(e)}" + continue - except Exception as e: - intermediate_output += f"Streaming Error: {str(e)}\n" - yield f"Streaming Error (Iteration {i+1}): {str(e)}" #Error - continue + try: + response_json = json.loads(response_text) + intermediate_output += f"Iteration {i+1} - Focus: {current_entity} - Action: {response_text}\n" + except json.JSONDecodeError: + intermediate_output += f"Iteration {i+1} - LLM Response (Invalid JSON): {response_text[:100]}...\n" + context.append(f"Invalid JSON: {response_text[:100]}...") + continue - try: - response_json = json.loads(response_text) # Parse the JSON response. - intermediate_output += f"Iteration {i+1} - Focus: {current_entity} - Action: {response_text}\n" - except json.JSONDecodeError: - intermediate_output += f"Iteration {i+1} - LLM Response (Invalid JSON): {response_text[:100]}...\n" - context.append(f"Invalid JSON: {response_text[:100]}...") # Add invalid JSON to context - continue - - tool_name = response_json.get("tool") - parameters = response_json.get("parameters", {}) - - if not tool_name: #LLM didn't return a tool. End the process if we are past halfway. 
- if all_insights: - if i > MAX_ITERATIONS // 2: - break - continue + tool_name = response_json.get("tool") + parameters = response_json.get("parameters", {}) - if tool_name not in tools: - context.append(f"Invalid tool: {tool_name}") - intermediate_output += f"Iteration {i + 1} - Invalid tool chosen: {tool_name}\n" - continue + if not tool_name: + if all_insights: + if i > MAX_ITERATIONS // 2: + break + continue - tool = tools[tool_name] - try: - intermediate_output += f"Iteration {i+1} - Executing: {tool_name}, Key params: {str(parameters)[:100]}...\n" + if tool_name not in tools: + context.append(f"Invalid tool: {tool_name}") + intermediate_output += f"Iteration {i + 1} - Invalid tool chosen: {tool_name}\n" + continue - if tool_name == "generate_search_query": - parameters['previous_queries'] = previous_queries - parameters['failed_queries'] = failed_queries - parameters['focus_areas'] = focus_areas - result = tool["function"](**parameters) - yield f"Iteration {i+1} - Generated search query: {result}" + tool = tools[tool_name] + try: + intermediate_output += f"Iteration {i+1} - Executing: {tool_name}, Key params: {str(parameters)[:100]}...\n" - if current_entity != 'general': - entity_progress[current_entity]['queries'].append(result) # Add entity-specific + if tool_name == "generate_search_query": + parameters['previous_queries'] = previous_queries + parameters['failed_queries'] = failed_queries + parameters['focus_areas'] = focus_areas + result = tool["function"](**parameters) + yield f"Iteration {i+1} - Generated search query: {result}" - previous_queries.append(result) + if current_entity != 'general': + entity_progress[current_entity]['queries'].append(result) - elif tool_name in ["search_web", "search_arxiv", "search_pubmed", "search_wikipedia", "search_scholar"]: - result = tool["function"](**parameters) - search_prompt = prompt - if current_entity != 'general': - search_prompt = f"{prompt} focusing on {current_entity}" + previous_queries.append(result) - 
filtered_result = filter_results(result, search_prompt, previous_snippets=seen_snippets) + elif tool_name in ["search_web", "search_arxiv", "search_pubmed", "search_wikipedia", "search_scholar"]: + result = tool["function"](**parameters) + search_prompt = prompt + if current_entity != 'general': + search_prompt = f"{prompt} focusing on {current_entity}" - result = filtered_result # Work with filtered results + filtered_result = filter_results(result, search_prompt, previous_snippets=seen_snippets) - if not result and 'query' in parameters: # Add query to failures if nothing returned. - failed_queries.append(parameters['query']) + result = filtered_result - elif tool_name == "reason": - # Ensure correct reasoning context is passed. - if current_entity != 'general' and 'reasoning_context' not in parameters: - parameters['reasoning_context'] = entity_progress[current_entity]['insights'] - elif 'reasoning_context' not in parameters: - parameters['reasoning_context'] = reasoning_context[:] + if not result and 'query' in parameters: + failed_queries.append(parameters['query']) - if 'prompt' not in parameters: - if current_entity != 'general': - parameters['prompt'] = f"{prompt} focusing on {current_entity}" - else: - parameters['prompt'] = prompt + elif tool_name == "reason": + if current_entity != 'general' and 'reasoning_context' not in parameters: + parameters['reasoning_context'] = entity_progress[current_entity]['insights'] + elif 'reasoning_context' not in parameters: + parameters['reasoning_context'] = reasoning_context[:] - if 'search_results' not in parameters: - parameters['search_results'] = [] #Avoid errors if no search results. 
+ if 'prompt' not in parameters: + if current_entity != 'general': + parameters['prompt'] = f"{prompt} focusing on {current_entity}" + else: + parameters['prompt'] = prompt - if 'focus_areas' not in parameters and focus_areas: # Avoid overwriting focus_areas if already set - parameters['focus_areas'] = focus_areas + if 'search_results' not in parameters: + parameters['search_results'] = [] - result = tool["function"](**parameters) - yield f"Iteration {i+1} - Reasoning about information..." + if 'focus_areas' not in parameters and focus_areas: + parameters['focus_areas'] = focus_areas - if current_entity != 'general': - entity_progress[current_entity]['insights'].append(result) - if current_entity not in entity_specific_insights: - entity_specific_insights[current_entity] = [] - entity_specific_insights[current_entity].append(result) - else: - reasoning_context.append(result) #Add to general context. - add_to_faiss_index(result) - all_insights.append(result) + result = tool["function"](**parameters) + yield f"Iteration {i+1} - Reasoning about information..." - elif tool_name == "critique_reasoning": - if 'previous_critiques' not in parameters: #Pass in the previous critiques. 
- parameters['previous_critiques'] = previous_critiques + if current_entity != 'general': + entity_progress[current_entity]['insights'].append(result) + if current_entity not in entity_specific_insights: + entity_specific_insights[current_entity] = [] + entity_specific_insights[current_entity].append(result) + else: + reasoning_context.append(result) + add_to_faiss_index(result) + all_insights.append(result) + + elif tool_name == "critique_reasoning": + if 'previous_critiques' not in parameters: + parameters['previous_critiques'] = previous_critiques + + if all_insights: + if 'reasoning_output' not in parameters: + parameters['reasoning_output'] = all_insights[-1] + if 'prompt' not in parameters: + parameters['prompt'] = prompt + + result = tool["function"](**parameters) + yield f"Iteration {i+1} - Critiquing reasoning..." + previous_critiques.append(result) + context.append(f"Critique: {result[:200]}...") + else: + result = "No reasoning to critique yet." - if all_insights: - if 'reasoning_output' not in parameters: - parameters['reasoning_output'] = all_insights[-1] #Critique the most recent insight. + elif tool_name == "identify_contradictions": + result = tool["function"](**parameters) + yield f"Iteration {i+1} - Identifying contradictions..." + if result: + contradictions = result + context.append(f"Identified contradictions: {result}") + + elif tool_name == "identify_focus_areas": + if 'failed_areas' not in parameters: + parameters['failed_areas'] = failed_areas + result = tool["function"](**parameters) + yield f"Iteration {i+1} - Identifying focus areas..." + if result: + old_focus = set(focus_areas) + focus_areas = result + failed_areas.extend([area for area in old_focus if area not in result]) + context.append(f"New focus areas: {result}") + + elif tool_name == "extract_article": + result = tool["function"](**parameters) + yield f"Iteration {i+1} - Extracting article content..." 
+ if result: + context.append(f"Extracted article content from {parameters['url']}: {result[:200]}...") + reasoning_about_article = tool_reason(prompt=prompt, search_results=[{"title": "Extracted Article", "snippet": result, "url": parameters['url']}]) + if reasoning_about_article: + all_insights.append(reasoning_about_article) + add_to_faiss_index(reasoning_about_article) + + elif tool_name == "summarize_paper": + result = tool["function"](**parameters) + yield f"Iteration {i+1} - Summarizing paper..." + if result: + paper_summaries[parameters['paper_text'][:100]] = result + save_paper_summaries(paper_summaries) + context.append(f"Summarized paper: {result[:200]}...") + add_to_faiss_index(result) + all_insights.append(result) + + elif tool_name == "meta_analyze": + if 'entity_insights' not in parameters: + parameters['entity_insights'] = entity_specific_insights if 'prompt' not in parameters: parameters['prompt'] = prompt - result = tool["function"](**parameters) - yield f"Iteration {i+1} - Critiquing reasoning..." - previous_critiques.append(result) - context.append(f"Critique: {result[:200]}...") - else: - result = "No reasoning to critique yet." - - elif tool_name == "identify_contradictions": - result = tool["function"](**parameters) - yield f"Iteration {i+1} - Identifying contradictions..." - if result: - contradictions = result # Keep track of contradictions. - context.append(f"Identified contradictions: {result}") - - elif tool_name == "identify_focus_areas": - if 'failed_areas' not in parameters: - parameters['failed_areas'] = failed_areas - result = tool["function"](**parameters) - yield f"Iteration {i+1} - Identifying focus areas..." 
- if result: - old_focus = set(focus_areas) - focus_areas = result # Update focus areas - failed_areas.extend([area for area in old_focus if area not in result]) #Track failed areas - context.append(f"New focus areas: {result}") - - elif tool_name == "extract_article": - result = tool["function"](**parameters) - yield f"Iteration {i+1} - Extracting article content..." - if result: - context.append(f"Extracted article content from {parameters['url']}: {result[:200]}...") - # Reason specifically about the extracted article. - reasoning_about_article = tool_reason(prompt=prompt, search_results=[{"title": "Extracted Article", "snippet": result, "url": parameters['url']}]) - if reasoning_about_article: - all_insights.append(reasoning_about_article) - add_to_faiss_index(reasoning_about_article) - - elif tool_name == "summarize_paper": - result = tool["function"](**parameters) - yield f"Iteration {i+1} - Summarizing paper..." - if result: - paper_summaries[parameters['paper_text'][:100]] = result # Store by a snippet of the text - save_paper_summaries(paper_summaries) - context.append(f"Summarized paper: {result[:200]}...") - add_to_faiss_index(result) # Add the summary itself to FAISS. - all_insights.append(result) #Add summary to insights for later summarization. - - elif tool_name == "meta_analyze": - if 'entity_insights' not in parameters: - parameters['entity_insights'] = entity_specific_insights - if 'prompt' not in parameters: - parameters['prompt'] = prompt - result = tool["function"](**parameters) - yield f"Iteration {i+1} - Performing meta-analysis..." - if result: - all_insights.append(result) # Add meta-analysis to overall insights. - context.append(f"Meta-analysis across entities: {result[:200]}...") - add_to_faiss_index(result) - - - elif tool_name == "draft_research_plan": - result = "Research plan already generated." # Avoid re-generating. - - else: - result = tool["function"](**parameters) + yield f"Iteration {i+1} - Performing meta-analysis..." 
+ if result: + all_insights.append(result) + context.append(f"Meta-analysis across entities: {result[:200]}...") + add_to_faiss_index(result) - result_str = str(result) - if len(result_str) > 500: - result_str = result_str[:500] + "..." - intermediate_output += f"Iteration {i+1} - Result: {result_str}\n" + elif tool_name == "draft_research_plan": + result = "Research plan already generated." - # Add tool use to context, limit context length - result_context = result_str - if len(result_str) > 300: - result_context = result_str[:300] + "..." - context.append(f"Used: {tool_name}, Result: {result_context}") + else: + result = tool["function"](**parameters) - except Exception as e: - logger.error(f"Error with {tool_name}: {str(e)}") - context.append(f"Error with {tool_name}: {str(e)}") - intermediate_output += f"Iteration {i+1} - Error: {str(e)}\n" - continue - - #Save data - research_data = { - 'context': context, - 'all_insights': all_insights, - 'entity_specific_insights': entity_specific_insights, - 'previous_queries': previous_queries, - 'failed_queries': failed_queries, - 'reasoning_context': reasoning_context, - 'previous_critiques': previous_critiques, - 'focus_areas': focus_areas, - 'failed_areas': failed_areas, - 'seen_snippets': list(seen_snippets), - 'contradictions': contradictions, - 'research_session_id': research_session_id - } - for entity in entity_progress: - research_data[entity] = entity_progress[entity] #save the individual entity - save_research_data(research_data, index) + result_str = str(result) + if len(result_str) > 500: + result_str = result_str[:500] + "..." + intermediate_output += f"Iteration {i+1} - Result: {result_str}\n" -# Perform meta-analysis *before* final summarization, if we have enough entity-specific insights. 
-if len(entity_specific_insights) > 1 and len(all_insights) > 2: - meta_analysis = tool_meta_analyze(entity_insights=entity_specific_insights, prompt=prompt) - if meta_analysis: - all_insights.append(meta_analysis) - intermediate_output += f"Final Meta-Analysis: {meta_analysis[:500]}...\n" - add_to_faiss_index(meta_analysis) # Add to FAISS + result_context = result_str + if len(result_str) > 300: + result_context = result_str[:300] + "..." + context.append(f"Used: {tool_name}, Result: {result_context}") -if all_insights: - final_result = tool_summarize(all_insights, prompt, contradictions) # Summarize all insights. -else: - final_result = "Could not find meaningful information despite multiple attempts." + except Exception as e: + logger.error(f"Error with {tool_name}: {str(e)}") + context.append(f"Error with {tool_name}: {str(e)}") + intermediate_output += f"Iteration {i+1} - Error: {str(e)}\n" + continue + research_data = { + 'context': context, + 'all_insights': all_insights, + 'entity_specific_insights': entity_specific_insights, + 'previous_queries': previous_queries, + 'failed_queries': failed_queries, + 'reasoning_context': reasoning_context, + 'previous_critiques': previous_critiques, + 'focus_areas': focus_areas, + 'failed_areas': failed_areas, + 'seen_snippets': list(seen_snippets), + 'contradictions': contradictions, + 'research_session_id': research_session_id + } + for entity in entity_progress: + research_data[entity] = entity_progress[entity] + save_research_data(research_data, index) + + if len(entity_specific_insights) > 1 and len(all_insights) > 2: + meta_analysis = tool_meta_analyze(entity_insights=entity_specific_insights, prompt=prompt) + if meta_analysis: + all_insights.append(meta_analysis) + intermediate_output += f"Final Meta-Analysis: {meta_analysis[:500]}...\n" + add_to_faiss_index(meta_analysis) + + if all_insights: + final_result = tool_summarize(all_insights, prompt, contradictions) + else: + final_result = "Could not find meaningful 
information despite multiple attempts." -full_output = f"**Research Prompt:** {prompt}\n\n" + full_output = f"**Research Prompt:** {prompt}\n\n" -if key_entities_with_descriptions: - full_output += f"**Key Entities Identified:**\n" - for entity in key_entities_with_descriptions: - full_output += f"- {entity}\n" - full_output += "\n" + if key_entities_with_descriptions: + full_output += f"**Key Entities Identified:**\n" + for entity in key_entities_with_descriptions: + full_output += f"- {entity}\n" + full_output += "\n" -full_output += "**Research Process:**\n" + intermediate_output + "\n" + full_output += "**Research Process:**\n" + intermediate_output + "\n" -if contradictions: - full_output += "**Contradictions Identified:**\n" - for i, contradiction in enumerate(contradictions, 1): - full_output += f"{i}. {contradiction}\n" - full_output += "\n" + if contradictions: + full_output += "**Contradictions Identified:**\n" + for i, contradiction in enumerate(contradictions, 1): + full_output += f"{i}. {contradiction}\n" + full_output += "\n" -full_output += f"**Final Analysis:**\n{final_result}\n\n" + full_output += f"**Final Analysis:**\n{final_result}\n\n" -full_output += f"Research Session ID: {research_session_id}\n" -full_output += f"Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n" -full_output += f"Total iterations: {i+1}\n" -full_output += f"Total insights generated: {len(all_insights)}\n" + full_output += f"Research Session ID: {research_session_id}\n" + full_output += f"Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n" + full_output += f"Total iterations: {i+1}\n" + full_output += f"Total insights generated: {len(all_insights)}\n" -yield full_output # Final output -content_copy -download - Use code with caution. 
+ yield full_output custom_css = """ /* Modern Research Interface */ @@ -1466,5 +1391,5 @@ iface = gr.Interface( css=custom_css ) -if name == "main": -iface.launch(share=False) +if __name__ == "__main__": + iface.launch(share=False)