# # import feedparser
# # import urllib.parse
# # import yaml
# # import gradio as gr
# # from smolagents import CodeAgent, HfApiModel, tool
# # from tools.final_answer import FinalAnswerTool
# # @tool
# # def fetch_latest_arxiv_papers(keywords: list, num_results: int = 3) -> list:
# # """Fetches the latest research papers from arXiv based on provided keywords.
# # Args:
# # keywords: A list of keywords to search for relevant papers.
# # num_results: The number of papers to fetch (default is 3).
# # Returns:
# # A list of dictionaries containing:
# # - "title": The title of the research paper.
# # - "authors": The authors of the paper.
# # - "year": The publication year.
# # - "abstract": A summary of the research paper.
# # - "link": A direct link to the paper on arXiv.
# # """
# # try:
# # print(f"DEBUG: Searching arXiv papers with keywords: {keywords}") # Debug input
# # #Properly format query with +AND+ for multiple keywords
# # query = "+AND+".join([f"all:{kw}" for kw in keywords])
# # query_encoded = urllib.parse.quote(query) # Encode spaces and special characters
# # url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results={num_results}&sortBy=submittedDate&sortOrder=descending"
# # print(f"DEBUG: Query URL - {url}") # Debug URL
# # feed = feedparser.parse(url)
# # papers = []
# # for entry in feed.entries:
# # papers.append({
# # "title": entry.title,
# # "authors": ", ".join(author.name for author in entry.authors),
# # "year": entry.published[:4], # Extract year
# # "abstract": entry.summary,
# # "link": entry.link
# # })
# # return papers
# # except Exception as e:
# # print(f"ERROR: {str(e)}") # Debug errors
# # return [f"Error fetching research papers: {str(e)}"]
# #"""------Applied BM25 search for paper retrival------"""
# # from rank_bm25 import BM25Okapi
# # import nltk
# # import os
# # import shutil
# # nltk_data_path = os.path.join(nltk.data.path[0], "tokenizers", "punkt")
# # if os.path.exists(nltk_data_path):
# # shutil.rmtree(nltk_data_path) # Remove corrupted version
# # print("Removed old NLTK 'punkt' data. Reinstalling...")
# # # Step 2: Download the correct 'punkt' tokenizer
# # nltk.download("punkt_tab")
# # print("Successfully installed 'punkt'!")
# # @tool # Register the function properly as a SmolAgents tool
# # def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
# # """Fetches and ranks arXiv papers using BM25 keyword relevance.
# # Args:
# # keywords: List of keywords for search.
# # num_results: Number of results to return.
# # Returns:
# # List of the most relevant papers based on BM25 ranking.
# # """
# # try:
# # print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
# # # Use a general keyword search (without `ti:` and `abs:`)
# # query = "+AND+".join([f"all:{kw}" for kw in keywords])
# # query_encoded = urllib.parse.quote(query)
# # url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
# # print(f"DEBUG: Query URL - {url}")
# # feed = feedparser.parse(url)
# # papers = []
# # # Extract papers from arXiv
# # for entry in feed.entries:
# # papers.append({
# # "title": entry.title,
# # "authors": ", ".join(author.name for author in entry.authors),
# # "year": entry.published[:4],
# # "abstract": entry.summary,
# # "link": entry.link
# # })
# # if not papers:
# # return [{"error": "No results found. Try different keywords."}]
# # # Apply BM25 ranking
# # tokenized_corpus = [nltk.word_tokenize(paper["title"].lower() + " " + paper["abstract"].lower()) for paper in papers]
# # bm25 = BM25Okapi(tokenized_corpus)
# # tokenized_query = nltk.word_tokenize(" ".join(keywords).lower())
# # scores = bm25.get_scores(tokenized_query)
# # # Sort papers based on BM25 score
# # ranked_papers = sorted(zip(papers, scores), key=lambda x: x[1], reverse=True)
# # # Return the most relevant ones
# # return [paper[0] for paper in ranked_papers[:num_results]]
# # except Exception as e:
# # print(f"ERROR: {str(e)}")
# # return [{"error": f"Error fetching research papers: {str(e)}"}]
# """------Applied TF-IDF for better semantic search------"""
# import feedparser
# import urllib.parse
# import yaml
# from tools.final_answer import FinalAnswerTool
# import numpy as np
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# import gradio as gr
# from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
# import nltk
# import datetime
# import requests
# import pytz
# from tools.final_answer import FinalAnswerTool
# from Gradio_UI import GradioUI
# nltk.download("stopwords")
# from nltk.corpus import stopwords
# @tool # ✅ Register the function properly as a SmolAgents tool
# def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
# """Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity.
# Args:
# keywords: List of keywords for search.
# num_results: Number of results to return.
# Returns:
# List of the most relevant papers based on TF-IDF ranking.
# """
# try:
# print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
# # Use a general keyword search
# query = "+AND+".join([f"all:{kw}" for kw in keywords])
# query_encoded = urllib.parse.quote(query)
# url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
# print(f"DEBUG: Query URL - {url}")
# feed = feedparser.parse(url)
# papers = []
# # Extract papers from arXiv
# for entry in feed.entries:
# papers.append({
# "title": entry.title,
# "authors": ", ".join(author.name for author in entry.authors),
# "year": entry.published[:4],
# "abstract": entry.summary,
# "link": entry.link
# })
# if not papers:
# return [{"error": "No results found. Try different keywords."}]
# # Prepare TF-IDF Vectorization
# corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
# vectorizer = TfidfVectorizer(stop_words=stopwords.words('english')) # Remove stopwords
# tfidf_matrix = vectorizer.fit_transform(corpus)
# # Transform Query into TF-IDF Vector
# query_str = " ".join(keywords)
# query_vec = vectorizer.transform([query_str])
# #Compute Cosine Similarity
# similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
# #Sort papers based on similarity score
# ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)
# # Return the most relevant papers
# return [paper[0] for paper in ranked_papers[:num_results]]
# except Exception as e:
# print(f"ERROR: {str(e)}")
# return [{"error": f"Error fetching research papers: {str(e)}"}]
# @tool
# def get_current_time_in_timezone(timezone: str) -> str:
# """A tool that fetches the current local time in a specified timezone.
# Args:
# timezone: A string representing a valid timezone (e.g., 'America/New_York').
# """
# try:
# # Create timezone object
# tz = pytz.timezone(timezone)
# # Get current time in that timezone
# local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
# return f"The current local time in {timezone} is: {local_time}"
# except Exception as e:
# return f"Error fetching time for timezone '{timezone}': {str(e)}"
# final_answer = FinalAnswerTool()
# # AI Model
# model = HfApiModel(
# max_tokens=2096,
# temperature=0.5,
# model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
# custom_role_conversions=None,
# )
# # Import tool from Hub
# image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
# # Load prompt templates
# with open("prompts.yaml", 'r') as stream:
# prompt_templates = yaml.safe_load(stream)
# # Create the AI Agent
# agent = CodeAgent(
# model=model,
# tools=[final_answer,fetch_latest_arxiv_papers], # Add your tools here
# max_steps=6,
# verbosity_level=1,
# grammar=None,
# planning_interval=None,
# name="ScholarAgent",
# description="An AI agent that fetches the latest research papers from arXiv based on user-defined keywords and filters.",
# prompt_templates=prompt_templates
# )
# #Search Papers
# def search_papers(user_input):
# keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()] # Ensure valid keywords
# print(f"DEBUG: Received input keywords - {keywords}") # Debug user input
# if not keywords:
# print("DEBUG: No valid keywords provided.")
# return "Error: Please enter at least one valid keyword."
# results = fetch_latest_arxiv_papers(keywords, num_results=3) # Fetch 3 results
# print(f"DEBUG: Results received - {results}") # Debug function output
# # Check if the API returned an error
# if isinstance(results, list) and len(results) > 0 and "error" in results[0]:
# return results[0]["error"] # Return the error message directly
# # Format results only if valid papers exist
# if isinstance(results, list) and results and isinstance(results[0], dict):
# formatted_results = "\n\n".join([
# f"---\n\n"
# f"📌 **Title:** {paper['title']}\n\n"
# f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
# f"📅 **Year:** {paper['year']}\n\n"
# f"📖 **Abstract:** {paper['abstract'][:500]}... *(truncated for readability)*\n\n"
# f"[🔗 Read Full Paper]({paper['link']})\n\n"
# for paper in results
# ])
# return formatted_results
# print("DEBUG: No results found.")
# return "No results found. Try different keywords."
# # Create Gradio UI
# with gr.Blocks() as demo:
# gr.Markdown("# ScholarAgent")
# keyword_input = gr.Textbox(label="Enter keywords(comma-separated) or even full sentences ", placeholder="e.g., deep learning, reinforcement learning or NLP in finance or Deep learning in Medicine")
# output_display = gr.Markdown()
# search_button = gr.Button("Search")
# search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])
# print("DEBUG: Gradio UI is running. Waiting for user input...")
# # Launch Gradio App
# demo.launch()
"""------Enhanced ScholarAgent with Fixes and Features-----"""
import feedparser
import urllib.parse
import yaml
import requests
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
from smolagents import CodeAgent, HfApiModel, tool
import nltk
from transformers import pipeline
# ✅ Ensure necessary NLTK data is downloaded
nltk.download("stopwords")
nltk.download("punkt")
from nltk.corpus import stopwords
# ✅ Summarization pipeline (BART)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
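
# The summarization pipeline above is initialized but not yet wired into any tool.
# A minimal sketch (hedged) of how it could condense an arXiv abstract; the
# max_length/min_length values are illustrative assumptions, not tuned settings.
def summarize_abstract(abstract: str) -> str:
    """Return a short summary of an abstract using the BART pipeline above."""
    result = summarizer(abstract, max_length=120, min_length=30, do_sample=False)
    return result[0]["summary_text"]
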
@tool
def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
    """
    Fetches and ranks arXiv papers using optimized TF-IDF and Cosine Similarity.

    Args:
        keywords: List of keywords for search.
        num_results: Number of results to return.

    Returns:
        List of the most relevant papers based on TF-IDF ranking.
    """
    try:
        # Build an arXiv query such as "all:kw1 AND all:kw2"; quote_plus then encodes
        # the spaces and special characters the way the arXiv API expects.
        query = " AND ".join([f"all:{kw}" for kw in keywords])
        query_encoded = urllib.parse.quote_plus(query)

        # Fetch a larger candidate pool than num_results so the TF-IDF ranking
        # below actually has papers to reorder (earlier revisions fetched 50).
        url = (
            "http://export.arxiv.org/api/query?"
            f"search_query={query_encoded}&start=0&max_results=50"
            "&sortBy=submittedDate&sortOrder=descending"
        )
        print(f"DEBUG: Query URL - {url}")

        feed = feedparser.parse(url)
        print(f"DEBUG: API returned {len(feed.entries)} entries")

        papers = []
        for entry in feed.entries:
            papers.append({
                "title": entry.title,
                "authors": ", ".join(author.name for author in entry.authors),
                "year": entry.published[:4],
                "abstract": entry.summary,
                "link": entry.link
            })

        if not papers:
            print("DEBUG: No results from arXiv API")
            return [{"error": "No results found. Try different keywords."}]

        # Rank candidates by TF-IDF cosine similarity against the query.
        corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
        vectorizer = TfidfVectorizer(stop_words=stopwords.words("english"), max_features=2000)
        tfidf_matrix = vectorizer.fit_transform(corpus)
        print(f"DEBUG: TF-IDF Matrix Shape - {tfidf_matrix.shape}")

        query_vec = vectorizer.transform([" ".join(keywords)])
        similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
        print(f"DEBUG: Similarity Scores - {similarity_scores}")

        # Sort papers by similarity score and keep the top num_results.
        ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)
        return [paper[0] for paper in ranked_papers[:num_results]]

    except Exception as e:
        print(f"ERROR: {str(e)}")
        return [{"error": f"Error fetching research papers: {str(e)}"}]
@tool
def get_citation_count(paper_title: str) -> int:
    """
    Fetches citation count from Semantic Scholar API.

    Args:
        paper_title: Title of the research paper.

    Returns:
        Citation count (default 0 if not found).
    """
    try:
        base_url = "https://api.semanticscholar.org/graph/v1/paper/search"
        params = {"query": paper_title, "fields": "citationCount"}
        response = requests.get(base_url, params=params, timeout=10).json()

        if "data" in response and response["data"]:
            return response["data"][0].get("citationCount", 0)
        return 0  # Default to 0 if no data found

    except Exception as e:
        print(f"ERROR fetching citation count: {e}")
        return 0
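
# The Semantic Scholar search response this tool relies on looks roughly like
# (a sketch based on the "fields" requested above; the exact payload may vary):
#   {"total": 1, "data": [{"citationCount": 42, ...}]}
# so response["data"][0]["citationCount"] is the value returned.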
@tool
def rank_papers_by_citations(papers: list) -> list:
    """
    Ranks papers based on citation count and TF-IDF similarity.

    Args:
        papers: List of research papers.

    Returns:
        Papers sorted by citation count and TF-IDF score.
    """
    for paper in papers:
        paper["citations"] = get_citation_count(paper["title"])
    # Sort by citation count first, then by TF-IDF score (0 if no score is attached).
    return sorted(papers, key=lambda x: (x["citations"], x.get("tfidf_score", 0)), reverse=True)
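
# --- Illustrative usage (sketch) -------------------------------------------------
# rank_papers_by_citations expects the dictionaries produced by
# fetch_latest_arxiv_papers. Note that the fetch tool does not currently attach a
# "tfidf_score" key, so the secondary sort key falls back to 0. Hypothetical example:
def _demo_rank_by_citations() -> None:
    """Fetch a few papers and print them in citation order (manual debugging aid)."""
    papers = fetch_latest_arxiv_papers(["federated learning"], num_results=3)
    if papers and "error" in papers[0]:
        print(papers[0]["error"])
        return
    ranked = rank_papers_by_citations(papers)
    for paper in ranked:
        print(paper.get("citations", 0), paper.get("title"))
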
# ✅ AI Model
model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
    custom_role_conversions=None,
)

# ✅ Load prompt templates
with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)

# ✅ Create the AI Agent
agent = CodeAgent(
    model=model,
    tools=[fetch_latest_arxiv_papers, get_citation_count, rank_papers_by_citations],
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name="ScholarAgent",
    description="An AI agent that fetches and ranks the latest research papers based on citations and relevance.",
    prompt_templates=prompt_templates
)
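
# --- Illustrative usage (sketch) -------------------------------------------------
# The Gradio UI below calls the arXiv tool directly; the agent can also be driven
# programmatically. Hedged example (the prompt text is arbitrary, not executed here):
def _demo_agent_run() -> None:
    """Run the agent once with a sample research question (manual debugging aid)."""
    answer = agent.run("Find recent arXiv papers on diffusion models and rank them by citations.")
    print(answer)
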
# ✅ Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# ScholarAgent")
    keyword_input = gr.Textbox(
        label="Enter keywords or full sentences",
        placeholder="e.g., deep learning, reinforcement learning"
    )
    output_display = gr.Markdown()
    search_button = gr.Button("Search")

    def search_papers(user_input):
        keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()]
        if not keywords:
            return "Error: Please enter at least one valid keyword."
        results = fetch_latest_arxiv_papers(keywords, num_results=3)

        if isinstance(results, list) and results and "error" in results[0]:
            return results[0]["error"]

        # The fetch tool returns an "abstract" field (not "summary"), so read that key here.
        return "\n\n".join([
            f"---\n\n"
            f"📌 **Title:** {paper['title']}\n\n"
            f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
            f"📅 **Year:** {paper['year']}\n\n"
            f"📖 **Abstract:** {paper.get('abstract', 'No abstract available')[:500]}... *(truncated)*\n\n"
            f"🔢 **Citations:** {paper.get('citations', 0)}\n\n"
            f"[🔗 Read Full Paper]({paper['link']})\n\n"
            for paper in results
        ])

    search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])

print("DEBUG: Gradio UI is running. Waiting for user input...")

# ✅ Launch Gradio App
demo.launch()