ScholarAgent /
pdx97's picture
b81895b verified
history blame
17.5 kB
# # import feedparser
# # import urllib.parse
# # import yaml
# # import gradio as gr
# # from smolagents import CodeAgent, HfApiModel, tool
# # from tools.final_answer import FinalAnswerTool
# # @tool
# # def fetch_latest_arxiv_papers(keywords: list, num_results: int = 3) -> list:
# # """Fetches the latest research papers from arXiv based on provided keywords.
# # Args:
# # keywords: A list of keywords to search for relevant papers.
# # num_results: The number of papers to fetch (default is 3).
# # Returns:
# # A list of dictionaries containing:
# # - "title": The title of the research paper.
# # - "authors": The authors of the paper.
# # - "year": The publication year.
# # - "abstract": A summary of the research paper.
# # - "link": A direct link to the paper on arXiv.
# # """
# # try:
# # print(f"DEBUG: Searching arXiv papers with keywords: {keywords}") # Debug input
# # #Properly format query with +AND+ for multiple keywords
# # query = "+AND+".join([f"all:{kw}" for kw in keywords])
# # query_encoded = urllib.parse.quote(query) # Encode spaces and special characters
# # url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results={num_results}&sortBy=submittedDate&sortOrder=descending"
# # print(f"DEBUG: Query URL - {url}") # Debug URL
# # feed = feedparser.parse(url)
# # papers = []
# # for entry in feed.entries:
# # papers.append({
# # "title": entry.title,
# # "authors": ", ".join(author.name for author in entry.authors),
# # "year": entry.published[:4], # Extract year
# # "abstract": entry.summary,
# # "link": entry.link
# # })
# # return papers
# # except Exception as e:
# # print(f"ERROR: {str(e)}") # Debug errors
# # return [f"Error fetching research papers: {str(e)}"]
# #"""------Applied BM25 search for paper retrieval------"""
# # from rank_bm25 import BM25Okapi
# # import nltk
# # import os
# # import shutil
# # nltk_data_path = os.path.join(nltk.data.path[0], "tokenizers", "punkt")
# # if os.path.exists(nltk_data_path):
# # shutil.rmtree(nltk_data_path) # Remove corrupted version
# # print("Removed old NLTK 'punkt' data. Reinstalling...")
# # # Step 2: Download the correct 'punkt' tokenizer
# # nltk.download("punkt_tab")
# # print("Successfully installed 'punkt'!")
# # @tool # Register the function properly as a SmolAgents tool
# # def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
# # """Fetches and ranks arXiv papers using BM25 keyword relevance.
# # Args:
# # keywords: List of keywords for search.
# # num_results: Number of results to return.
# # Returns:
# # List of the most relevant papers based on BM25 ranking.
# # """
# # try:
# # print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
# # # Use a general keyword search (without `ti:` and `abs:`)
# # query = "+AND+".join([f"all:{kw}" for kw in keywords])
# # query_encoded = urllib.parse.quote(query)
# # url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
# # print(f"DEBUG: Query URL - {url}")
# # feed = feedparser.parse(url)
# # papers = []
# # # Extract papers from arXiv
# # for entry in feed.entries:
# # papers.append({
# # "title": entry.title,
# # "authors": ", ".join(author.name for author in entry.authors),
# # "year": entry.published[:4],
# # "abstract": entry.summary,
# # "link": entry.link
# # })
# # if not papers:
# # return [{"error": "No results found. Try different keywords."}]
# # # Apply BM25 ranking
# # tokenized_corpus = [nltk.word_tokenize(paper["title"].lower() + " " + paper["abstract"].lower()) for paper in papers]
# # bm25 = BM25Okapi(tokenized_corpus)
# # tokenized_query = nltk.word_tokenize(" ".join(keywords).lower())
# # scores = bm25.get_scores(tokenized_query)
# # # Sort papers based on BM25 score
# # ranked_papers = sorted(zip(papers, scores), key=lambda x: x[1], reverse=True)
# # # Return the most relevant ones
# # return [paper[0] for paper in ranked_papers[:num_results]]
# # except Exception as e:
# # print(f"ERROR: {str(e)}")
# # return [{"error": f"Error fetching research papers: {str(e)}"}]
# """------Applied TF-IDF for better semantic search------"""
# import feedparser
# import urllib.parse
# import yaml
# from tools.final_answer import FinalAnswerTool
# import numpy as np
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# import gradio as gr
# from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
# import nltk
# import datetime
# import requests
# import pytz
# from tools.final_answer import FinalAnswerTool
# from Gradio_UI import GradioUI
# from nltk.corpus import stopwords
# @tool # ✅ Register the function properly as a SmolAgents tool
# def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
# """Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity.
# Args:
# keywords: List of keywords for search.
# num_results: Number of results to return.
# Returns:
# List of the most relevant papers based on TF-IDF ranking.
# """
# try:
# print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
# # Use a general keyword search
# query = "+AND+".join([f"all:{kw}" for kw in keywords])
# query_encoded = urllib.parse.quote(query)
# url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
# print(f"DEBUG: Query URL - {url}")
# feed = feedparser.parse(url)
# papers = []
# # Extract papers from arXiv
# for entry in feed.entries:
# papers.append({
# "title": entry.title,
# "authors": ", ".join(author.name for author in entry.authors),
# "year": entry.published[:4],
# "abstract": entry.summary,
# "link": entry.link
# })
# if not papers:
# return [{"error": "No results found. Try different keywords."}]
# # Prepare TF-IDF Vectorization
# corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
# vectorizer = TfidfVectorizer(stop_words=stopwords.words('english')) # Remove stopwords
# tfidf_matrix = vectorizer.fit_transform(corpus)
# # Transform Query into TF-IDF Vector
# query_str = " ".join(keywords)
# query_vec = vectorizer.transform([query_str])
# #Compute Cosine Similarity
# similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
# #Sort papers based on similarity score
# ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)
# # Return the most relevant papers
# return [paper[0] for paper in ranked_papers[:num_results]]
# except Exception as e:
# print(f"ERROR: {str(e)}")
# return [{"error": f"Error fetching research papers: {str(e)}"}]
# @tool
# def get_current_time_in_timezone(timezone: str) -> str:
# """A tool that fetches the current local time in a specified timezone.
# Args:
# timezone: A string representing a valid timezone (e.g., 'America/New_York').
# """
# try:
# # Create timezone object
# tz = pytz.timezone(timezone)
# # Get current time in that timezone
# local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
# return f"The current local time in {timezone} is: {local_time}"
# except Exception as e:
# return f"Error fetching time for timezone '{timezone}': {str(e)}"
# final_answer = FinalAnswerTool()
# # AI Model
# model = HfApiModel(
# max_tokens=2096,
# temperature=0.5,
# model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
# custom_role_conversions=None,
# )
# # Import tool from Hub
# image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
# # Load prompt templates
# with open("prompts.yaml", 'r') as stream:
# prompt_templates = yaml.safe_load(stream)
# # Create the AI Agent
# agent = CodeAgent(
# model=model,
# tools=[final_answer,fetch_latest_arxiv_papers], # Add your tools here
# max_steps=6,
# verbosity_level=1,
# grammar=None,
# planning_interval=None,
# name="ScholarAgent",
# description="An AI agent that fetches the latest research papers from arXiv based on user-defined keywords and filters.",
# prompt_templates=prompt_templates
# )
# #Search Papers
# def search_papers(user_input):
# keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()] # Ensure valid keywords
# print(f"DEBUG: Received input keywords - {keywords}") # Debug user input
# if not keywords:
# print("DEBUG: No valid keywords provided.")
# return "Error: Please enter at least one valid keyword."
# results = fetch_latest_arxiv_papers(keywords, num_results=3) # Fetch 3 results
# print(f"DEBUG: Results received - {results}") # Debug function output
# # Check if the API returned an error
# if isinstance(results, list) and len(results) > 0 and "error" in results[0]:
# return results[0]["error"] # Return the error message directly
# # Format results only if valid papers exist
# if isinstance(results, list) and results and isinstance(results[0], dict):
# formatted_results = "\n\n".join([
# f"---\n\n"
# f"📌 **Title:** {paper['title']}\n\n"
# f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
# f"📅 **Year:** {paper['year']}\n\n"
# f"📖 **Abstract:** {paper['abstract'][:500]}... *(truncated for readability)*\n\n"
# f"[🔗 Read Full Paper]({paper['link']})\n\n"
# for paper in results
# ])
# return formatted_results
# print("DEBUG: No results found.")
# return "No results found. Try different keywords."
# # Create Gradio UI
# with gr.Blocks() as demo:
# gr.Markdown("# ScholarAgent")
# keyword_input = gr.Textbox(label="Enter keywords(comma-separated) or even full sentences ", placeholder="e.g., deep learning, reinforcement learning or NLP in finance or Deep learning in Medicine")
# output_display = gr.Markdown()
# search_button = gr.Button("Search")
# search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])
# print("DEBUG: Gradio UI is running. Waiting for user input...")
# # Launch Gradio App
# demo.launch()
"""------Enhanced ScholarAgent with Fixes and Features-----"""
import feedparser
import urllib.parse
import yaml
import requests
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
from smolagents import CodeAgent, HfApiModel, tool
import nltk
from transformers import pipeline
# ✅ Ensure necessary NLTK data is downloaded
# The English stop-word list is read via `stopwords.words('english')` when the
# TF-IDF vectorizer is built, so the corpus must be present before first use.
nltk.download("stopwords")
nltk.download("punkt")
from nltk.corpus import stopwords
# ✅ Summarization pipeline (BART, facebook/bart-large-cnn).
# Loaded once at import time and shared by every call that summarizes an abstract.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
@tool  # CodeAgent is handed this function in `tools=[...]`, so it must be registered as a tool
def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
    """Fetches and ranks arXiv papers using optimized TF-IDF and Cosine Similarity.

    Queries the arXiv API for the most recently submitted papers that match
    every keyword, scores each paper's title + abstract against the keywords
    with TF-IDF / cosine similarity, and attaches a short summary to each of
    the returned papers.

    Args:
        keywords: List of keywords for search.
        num_results: Number of results to return.

    Returns:
        List of the most relevant papers based on TF-IDF ranking. Each paper is
        a dict with the keys "title", "authors", "year", "abstract", "link",
        "tfidf_score" and "summary". If nothing is found or any step fails, a
        one-element list containing {"error": <message>} is returned instead.
    """
    try:
        # ✅ Encode query properly: join with real spaces and let quote_plus
        # turn them into "+". Joining with a literal "+AND+" and then quoting
        # would send "%2BAND%2B", i.e. literal plus signs instead of the
        # "all:a+AND+all:b" form shown in the arXiv API manual.
        query = " AND ".join(f"all:{kw}" for kw in keywords)
        query_encoded = urllib.parse.quote_plus(query)

        # Always fetch at least five candidates (the previous fixed pool size)
        # so that asking for more than five results can actually be satisfied.
        max_results = max(num_results, 5)
        url = (
            "http://export.arxiv.org/api/query"
            f"?search_query={query_encoded}&start=0&max_results={max_results}"
            "&sortBy=submittedDate&sortOrder=descending"
        )
        print(f"DEBUG: Query URL - {url}")

        feed = feedparser.parse(url)
        papers = [
            {
                "title": entry.title,
                "authors": ", ".join(author.name for author in entry.authors),
                "year": entry.published[:4],  # leading four characters of the publish date
                "abstract": entry.summary,
                "link": entry.link,
            }
            for entry in feed.entries
        ]

        if not papers:
            print("DEBUG: No results from ArXiv API")
            return [{"error": "No results found. Try different keywords."}]

        # ✅ TF-IDF Vectorization
        corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
        vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=2000)
        tfidf_matrix = vectorizer.fit_transform(corpus)
        query_vec = vectorizer.transform([" ".join(keywords)])
        similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
        ranked_papers = sorted(zip(papers, similarity_scores), key=lambda pair: pair[1], reverse=True)

        # ✅ Apply Summarization with Fallback.
        # Only the papers that are actually returned are summarized; the model
        # call is by far the slowest step.
        top_papers = []
        for paper, score in ranked_papers[:num_results]:
            # Keep the relevance score so rank_papers_by_citations can use it
            # as its tie-breaker (it reads "tfidf_score").
            paper["tfidf_score"] = float(score)
            try:
                paper["summary"] = summarizer(
                    paper["abstract"], max_length=100, min_length=30, do_sample=False
                )[0]["summary_text"]
            except Exception as summary_error:
                # Best effort: a summarizer failure must not lose the paper.
                print(f"ERROR: {str(summary_error)}")
                paper["summary"] = paper["abstract"][:300] + "..."  # ✅ Fallback: First 300 characters of abstract
            top_papers.append(paper)

        return top_papers
    except Exception as e:
        # Tool boundary: report the failure as data instead of raising so the
        # UI / agent can display it.
        print(f"ERROR: {str(e)}")
        return [{"error": f"Error fetching research papers: {str(e)}"}]
@tool  # CodeAgent is handed this function in `tools=[...]`, so it must be registered as a tool
def get_citation_count(paper_title: str) -> int:
    """Fetches citation count from Semantic Scholar API.

    Searches Semantic Scholar by title and reads the citation count of the
    first match.

    Args:
        paper_title: Title of the research paper.

    Returns:
        Citation count of the first search hit, or 0 if nothing was found or
        the request failed.
    """
    base_url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {"query": paper_title, "fields": "citationCount"}
    try:
        # requests has no default timeout; without one a stalled connection
        # would block the caller indefinitely.
        response = requests.get(base_url, params=params, timeout=10).json()
        if "data" in response and response["data"]:
            # `or 0` covers a present-but-null citationCount.
            return response["data"][0].get("citationCount", 0) or 0
        return 0  # Default to 0 if no data found
    except Exception as e:
        # Best effort: a failed lookup counts as "no citations known".
        print(f"ERROR fetching citation count: {e}")
        return 0
@tool  # CodeAgent is handed this function in `tools=[...]`, so it must be registered as a tool
def rank_papers_by_citations(papers: list) -> list:
    """Ranks papers based on citation count and TF-IDF similarity.

    Each paper dict is updated in place with a "citations" entry obtained from
    get_citation_count, then a new sorted list is returned.

    Args:
        papers: List of research papers (dicts that contain at least "title").

    Returns:
        Papers sorted by citation count, highest first; ties are broken by the
        paper's "tfidf_score" (treated as 0 when the key is absent).
    """
    for paper in papers:
        paper["citations"] = get_citation_count(paper["title"])
    return sorted(
        papers,
        key=lambda paper: (paper["citations"], paper.get("tfidf_score", 0)),
        reverse=True,
    )
# ✅ AI Model
# NOTE(review): the constructor arguments were missing from this revision
# (the call was left unclosed). The values below are carried over from the
# earlier, commented-out revision of this file — confirm they are still wanted.
model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
    custom_role_conversions=None,
)

# ✅ Load prompt templates
with open("prompts.yaml", 'r', encoding="utf-8") as stream:
    prompt_templates = yaml.safe_load(stream)

# ✅ Create the AI Agent
# NOTE(review): model, max_steps, verbosity_level, name and prompt_templates
# are likewise restored from the earlier revision; only `tools` and
# `description` were still present here.
agent = CodeAgent(
    model=model,
    tools=[fetch_latest_arxiv_papers, get_citation_count, rank_papers_by_citations],
    max_steps=6,
    verbosity_level=1,
    name="ScholarAgent",
    description="An AI agent that fetches and ranks the latest research papers based on citations and relevance.",
    prompt_templates=prompt_templates,
)
# ✅ Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# ScholarAgent")
    keyword_input = gr.Textbox(label="Enter keywords or full sentences", placeholder="e.g., deep learning, reinforcement learning")
    output_display = gr.Markdown()
    search_button = gr.Button("Search")

    def search_papers(user_input):
        """Turn the comma-separated textbox value into a Markdown result list.

        Returns the error message when the search reports one, otherwise one
        Markdown section per paper joined by blank lines.
        """
        keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()]
        if not keywords:
            # An empty query would otherwise be sent to arXiv as-is.
            return "Error: Please enter at least one valid keyword."

        results = fetch_latest_arxiv_papers(keywords, num_results=3)
        if isinstance(results, list) and results and "error" in results[0]:
            return results[0]["error"]

        # NOTE(review): "citations" is only set by rank_papers_by_citations,
        # which is not called on this path, so the Citations line shows 0.
        return "\n\n".join([
            f"📌 **Title:** {paper['title']}\n\n"
            f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
            f"📅 **Year:** {paper['year']}\n\n"
            f"📖 **Summary:** {paper.get('summary', 'No summary available')[:500]}... *(truncated)*\n\n"
            f"🔢 **Citations:** {paper.get('citations', 0)}\n\n"
            f"[🔗 Read Full Paper]({paper['link']})\n\n"
            for paper in results
        ])

    # Run the search when the button is pressed and render the Markdown result.
    search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])
    print("DEBUG: Gradio UI is running. Waiting for user input...")
# ✅ Launch Gradio App