ScholarAgent

Running

App Files Files Community

ScholarAgent / app.py

pdx97

Update app.py

79b8e3b verified 4 months ago

raw

history blame

16.2 kB

	# # import feedparser
	# # import urllib.parse
	# # import yaml
	# # import gradio as gr
	# # from smolagents import CodeAgent, HfApiModel, tool
	# # from tools.final_answer import FinalAnswerTool

	# # @tool
	# # def fetch_latest_arxiv_papers(keywords: list, num_results: int = 3) -> list:
	# # """Fetches the latest research papers from arXiv based on provided keywords.

	# # Args:
	# # keywords: A list of keywords to search for relevant papers.
	# # num_results: The number of papers to fetch (default is 3).

	# # Returns:
	# # A list of dictionaries containing:
	# # - "title": The title of the research paper.
	# # - "authors": The authors of the paper.
	# # - "year": The publication year.
	# # - "abstract": A summary of the research paper.
	# # - "link": A direct link to the paper on arXiv.
	# # """
	# # try:
	# # print(f"DEBUG: Searching arXiv papers with keywords: {keywords}") # Debug input

	# # #Properly format query with +AND+ for multiple keywords
	# # query = "+AND+".join([f"all:{kw}" for kw in keywords])
	# # query_encoded = urllib.parse.quote(query) # Encode spaces and special characters

	# # url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results={num_results}&sortBy=submittedDate&sortOrder=descending"

	# # print(f"DEBUG: Query URL - {url}") # Debug URL

	# # feed = feedparser.parse(url)

	# # papers = []
	# # for entry in feed.entries:
	# # papers.append({
	# # "title": entry.title,
	# # "authors": ", ".join(author.name for author in entry.authors),
	# # "year": entry.published[:4], # Extract year
	# # "abstract": entry.summary,
	# # "link": entry.link
	# # })

	# # return papers

	# # except Exception as e:
	# # print(f"ERROR: {str(e)}") # Debug errors
	# # return [f"Error fetching research papers: {str(e)}"]


	# #"""------Applied BM25 search for paper retrival------"""
	# # from rank_bm25 import BM25Okapi
	# # import nltk

	# # import os
	# # import shutil


	# # nltk_data_path = os.path.join(nltk.data.path[0], "tokenizers", "punkt")
	# # if os.path.exists(nltk_data_path):
	# # shutil.rmtree(nltk_data_path) # Remove corrupted version

	# # print("Removed old NLTK 'punkt' data. Reinstalling...")

	# # # Step 2: Download the correct 'punkt' tokenizer
	# # nltk.download("punkt_tab")

	# # print("Successfully installed 'punkt'!")


	# # @tool # Register the function properly as a SmolAgents tool
	# # def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
	# # """Fetches and ranks arXiv papers using BM25 keyword relevance.

	# # Args:
	# # keywords: List of keywords for search.
	# # num_results: Number of results to return.

	# # Returns:
	# # List of the most relevant papers based on BM25 ranking.
	# # """
	# # try:
	# # print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")

	# # # Use a general keyword search (without `ti:` and `abs:`)
	# # query = "+AND+".join([f"all:{kw}" for kw in keywords])
	# # query_encoded = urllib.parse.quote(query)
	# # url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"

	# # print(f"DEBUG: Query URL - {url}")

	# # feed = feedparser.parse(url)
	# # papers = []

	# # # Extract papers from arXiv
	# # for entry in feed.entries:
	# # papers.append({
	# # "title": entry.title,
	# # "authors": ", ".join(author.name for author in entry.authors),
	# # "year": entry.published[:4],
	# # "abstract": entry.summary,
	# # "link": entry.link
	# # })

	# # if not papers:
	# # return [{"error": "No results found. Try different keywords."}]

	# # # Apply BM25 ranking
	# # tokenized_corpus = [nltk.word_tokenize(paper["title"].lower() + " " + paper["abstract"].lower()) for paper in papers]
	# # bm25 = BM25Okapi(tokenized_corpus)

	# # tokenized_query = nltk.word_tokenize(" ".join(keywords).lower())
	# # scores = bm25.get_scores(tokenized_query)

	# # # Sort papers based on BM25 score
	# # ranked_papers = sorted(zip(papers, scores), key=lambda x: x[1], reverse=True)

	# # # Return the most relevant ones
	# # return [paper[0] for paper in ranked_papers[:num_results]]

	# # except Exception as e:
	# # print(f"ERROR: {str(e)}")
	# # return [{"error": f"Error fetching research papers: {str(e)}"}]


	# """------Applied TF-IDF for better semantic search------"""
	# import feedparser
	# import urllib.parse
	# import yaml
	# from tools.final_answer import FinalAnswerTool
	# import numpy as np
	# from sklearn.feature_extraction.text import TfidfVectorizer
	# from sklearn.metrics.pairwise import cosine_similarity
	# import gradio as gr
	# from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
	# import nltk

	# import datetime
	# import requests
	# import pytz
	# from tools.final_answer import FinalAnswerTool

	# from Gradio_UI import GradioUI

	# nltk.download("stopwords")
	# from nltk.corpus import stopwords

	# @tool # ✅ Register the function properly as a SmolAgents tool
	# def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
	# """Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity.

	# Args:
	# keywords: List of keywords for search.
	# num_results: Number of results to return.

	# Returns:
	# List of the most relevant papers based on TF-IDF ranking.
	# """
	# try:
	# print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")

	# # Use a general keyword search
	# query = "+AND+".join([f"all:{kw}" for kw in keywords])
	# query_encoded = urllib.parse.quote(query)
	# url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"

	# print(f"DEBUG: Query URL - {url}")

	# feed = feedparser.parse(url)
	# papers = []

	# # Extract papers from arXiv
	# for entry in feed.entries:
	# papers.append({
	# "title": entry.title,
	# "authors": ", ".join(author.name for author in entry.authors),
	# "year": entry.published[:4],
	# "abstract": entry.summary,
	# "link": entry.link
	# })

	# if not papers:
	# return [{"error": "No results found. Try different keywords."}]

	# # Prepare TF-IDF Vectorization
	# corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
	# vectorizer = TfidfVectorizer(stop_words=stopwords.words('english')) # Remove stopwords
	# tfidf_matrix = vectorizer.fit_transform(corpus)

	# # Transform Query into TF-IDF Vector
	# query_str = " ".join(keywords)
	# query_vec = vectorizer.transform([query_str])

	# #Compute Cosine Similarity
	# similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

	# #Sort papers based on similarity score
	# ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)

	# # Return the most relevant papers
	# return [paper[0] for paper in ranked_papers[:num_results]]

	# except Exception as e:
	# print(f"ERROR: {str(e)}")
	# return [{"error": f"Error fetching research papers: {str(e)}"}]
	# @tool
	# def get_current_time_in_timezone(timezone: str) -> str:
	# """A tool that fetches the current local time in a specified timezone.
	# Args:
	# timezone: A string representing a valid timezone (e.g., 'America/New_York').
	# """
	# try:
	# # Create timezone object
	# tz = pytz.timezone(timezone)
	# # Get current time in that timezone
	# local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
	# return f"The current local time in {timezone} is: {local_time}"
	# except Exception as e:
	# return f"Error fetching time for timezone '{timezone}': {str(e)}"


	# final_answer = FinalAnswerTool()


	# # AI Model
	# model = HfApiModel(
	# max_tokens=2096,
	# temperature=0.5,
	# model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
	# custom_role_conversions=None,
	# )

	# # Import tool from Hub
	# image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)


	# # Load prompt templates
	# with open("prompts.yaml", 'r') as stream:
	# prompt_templates = yaml.safe_load(stream)

	# # Create the AI Agent
	# agent = CodeAgent(
	# model=model,
	# tools=[final_answer,fetch_latest_arxiv_papers], # Add your tools here
	# max_steps=6,
	# verbosity_level=1,
	# grammar=None,
	# planning_interval=None,
	# name="ScholarAgent",
	# description="An AI agent that fetches the latest research papers from arXiv based on user-defined keywords and filters.",
	# prompt_templates=prompt_templates
	# )

	# # # Define Gradio Search Function
	# # def search_papers(user_input):
	# # keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()] # Ensure valid keywords
	# # print(f"DEBUG: Received input keywords - {keywords}") # Debug user input

	# # if not keywords:
	# # print("DEBUG: No valid keywords provided.")
	# # return "Error: Please enter at least one valid keyword."

	# # results = fetch_latest_arxiv_papers(keywords, num_results=3) # Fetch 3 results
	# # print(f"DEBUG: Results received - {results}") # Debug function output

	# # if isinstance(results, list) and results and isinstance(results[0], dict):
	# # #Format output with better readability and clarity
	# # formatted_results = "\n\n".join([
	# # f"---\n\n"
	# # f"📌 Title:\n{paper['title']}\n\n"
	# # f"👨‍🔬 Authors:\n{paper['authors']}\n\n"
	# # f"📅 Year: {paper['year']}\n\n"
	# # f"📖 Abstract:\n{paper['abstract'][:500]}... (truncated for readability)\n\n"
	# # f"[🔗 Read Full Paper]({paper['link']})\n\n"
	# # for paper in results
	# # ])
	# # return formatted_results

	# # print("DEBUG: No results found.")
	# # return "No results found. Try different keywords."

	# #Search Papers
	# def search_papers(user_input):
	# keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()] # Ensure valid keywords
	# print(f"DEBUG: Received input keywords - {keywords}") # Debug user input

	# if not keywords:
	# print("DEBUG: No valid keywords provided.")
	# return "Error: Please enter at least one valid keyword."

	# results = fetch_latest_arxiv_papers(keywords, num_results=3) # Fetch 3 results
	# print(f"DEBUG: Results received - {results}") # Debug function output

	# # Check if the API returned an error
	# if isinstance(results, list) and len(results) > 0 and "error" in results[0]:
	# return results[0]["error"] # Return the error message directly

	# # Format results only if valid papers exist
	# if isinstance(results, list) and results and isinstance(results[0], dict):
	# formatted_results = "\n\n".join([
	# f"---\n\n"
	# f"📌 Title: {paper['title']}\n\n"
	# f"👨‍🔬 Authors: {paper['authors']}\n\n"
	# f"📅 Year: {paper['year']}\n\n"
	# f"📖 Abstract: {paper['abstract'][:500]}... (truncated for readability)\n\n"
	# f"[🔗 Read Full Paper]({paper['link']})\n\n"
	# for paper in results
	# ])
	# return formatted_results

	# print("DEBUG: No results found.")
	# return "No results found. Try different keywords."

	# # Launch Gradio UI with CodeAgent
	# GradioUI(agent).launch()


	# # # Create Gradio UI
	# # with gr.Blocks() as demo:
	# # gr.Markdown("# ScholarAgent")
	# # keyword_input = gr.Textbox(label="Enter keywords (comma-separated)", placeholder="e.g., deep learning, reinforcement learning")
	# # output_display = gr.Markdown()
	# # search_button = gr.Button("Search")

	# # search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])

	# # print("DEBUG: Gradio UI is running. Waiting for user input...")

	# # # Launch Gradio App
	# # demo.launch()

	import os
	import datetime
	import requests
	import pytz
	import yaml
	from smolagents import CodeAgent, HfApiModel, load_tool, tool
	from tools.final_answer import FinalAnswerTool
	from Gradio_UI import GradioUI

	# Step 1: Set Hugging Face API Token
	os.environ["HUGGINGFACEHUB_API_TOKEN"] = "your_huggingface_api_token"

	# Step 2: Define ScholarAgent's Paper Search Functionality
	@tool
	def fetch_arxiv_papers(query: str) -> str:
	"""Fetches the top 3 most recent research papers from ArXiv based on a keyword search.

	Args:
	query: A string containing keywords or a full sentence describing the research topic.

	Returns:
	A formatted string with the top 3 recent papers, including title, authors, and ArXiv links.
	"""
	base_url = "http://export.arxiv.org/api/query"
	params = {
	"search_query": query,
	"start": 0,
	"max_results": 3,
	"sortBy": "submittedDate",
	"sortOrder": "descending",
	}

	try:
	response = requests.get(base_url, params=params)
	if response.status_code == 200:
	papers = response.text.split("<entry>")
	results = []
	for paper in papers[1:4]: # Extract top 3 papers
	title = paper.split("<title>")[1].split("</title>")[0].strip()
	authors = paper.split("<author><name>")[1].split("</name>")[0].strip()
	link = paper.split("<id>")[1].split("</id>")[0].strip()
	results.append(f"- {title}\n - 📖 Authors: {authors}\n - 🔗 [Read here]({link})\n")
	return "\n".join(results) if results else "No relevant papers found."
	else:
	return "Error: Unable to retrieve papers from ArXiv."
	except Exception as e:
	return f"API Error: {str(e)}"

	# Step 3: Add a Timezone Utility Tool
	@tool
	def get_current_time_in_timezone(timezone: str) -> str:
	"""Fetches the current local time in a specified timezone.

	Args:
	timezone: A string representing a valid timezone (e.g., 'America/New_York').

	Returns:
	A formatted string with the current time.
	"""
	try:
	tz = pytz.timezone(timezone)
	local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
	return f"The current local time in {timezone} is: {local_time}"
	except Exception as e:
	return f"Error fetching time for timezone '{timezone}': {str(e)}"

	# Step 4: Define Final Answer Tool (Required)
	final_answer = FinalAnswerTool()

	# Step 5: Configure Hugging Face Model with API Token
	model = HfApiModel(
	max_tokens=2096,
	temperature=0.5,
	model_id='Qwen/Qwen2.5-Coder-32B-Instruct', # Default model
	custom_role_conversions=None,

	)

	# Step 6: Load Additional Tools
	image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)

	# Step 7: Load Prompt Templates
	with open("prompts.yaml", 'r') as stream:
	prompt_templates = yaml.safe_load(stream)

	# Step 8: Define ScholarAgent (AI Agent)
	agent = CodeAgent(
	model=model,
	tools=[final_answer, fetch_arxiv_papers, get_current_time_in_timezone], # ScholarAgent tools
	max_steps=6,
	verbosity_level=1,
	grammar=None,
	planning_interval=None,
	name="ScholarAgent",
	description="An AI-powered research assistant that fetches top research papers from ArXiv.",
	prompt_templates=prompt_templates
	)

	# Step 9: Launch Gradio UI with CodeAgent
	GradioUI(agent).launch()