import os
import shutil
import urllib.parse

import feedparser
import gradio as gr
import nltk
import yaml
from rank_bm25 import BM25Okapi
from smolagents import CodeAgent, HfApiModel, tool

# Refresh the NLTK 'punkt' tokenizer. A partially-downloaded copy breaks
# nltk.word_tokenize, so remove any existing data before re-downloading.
nltk_data_path = os.path.join(nltk.data.path[0], "tokenizers", "punkt")
if os.path.exists(nltk_data_path):
    shutil.rmtree(nltk_data_path)  # Remove possibly-corrupted version
    print("✅ Removed old NLTK 'punkt' data. Reinstalling...")

nltk.download("punkt")
print("✅ Successfully installed 'punkt'!")


@tool  # Register the function properly as a SmolAgents tool
def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
    """Fetches and ranks arXiv papers using BM25 keyword relevance.

    Args:
        keywords: List of keywords for search.
        num_results: Number of results to return.

    Returns:
        List of the most relevant papers based on BM25 ranking, each a dict
        with "title", "authors", "year", "abstract" and "link" keys, or a
        single-element list holding an {"error": ...} dict on failure.
    """
    try:
        print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")

        # Guard against an empty keyword list: it would produce a degenerate
        # query and later crash BM25 on an empty corpus.
        if not keywords:
            return [{"error": "No results found. Try different keywords."}]

        # General keyword search across all fields (no `ti:`/`abs:` scoping).
        query = "+AND+".join(f"all:{kw}" for kw in keywords)
        query_encoded = urllib.parse.quote(query)

        # Over-fetch the 50 newest matches as a candidate pool, then re-rank
        # locally with BM25 and return only the top `num_results`.
        url = (
            "http://export.arxiv.org/api/query?"
            f"search_query={query_encoded}&start=0&max_results=50"
            "&sortBy=submittedDate&sortOrder=descending"
        )
        print(f"DEBUG: Query URL - {url}")

        feed = feedparser.parse(url)

        # Extract papers from the Atom feed entries.
        papers = [
            {
                "title": entry.title,
                "authors": ", ".join(author.name for author in entry.authors),
                "year": entry.published[:4],  # ISO date -> publication year
                "abstract": entry.summary,
                "link": entry.link,
            }
            for entry in feed.entries
        ]

        if not papers:
            return [{"error": "No results found. Try different keywords."}]

        # Rank by BM25 relevance of the query terms against title + abstract.
        tokenized_corpus = [
            nltk.word_tokenize(f"{paper['title'].lower()} {paper['abstract'].lower()}")
            for paper in papers
        ]
        bm25 = BM25Okapi(tokenized_corpus)
        tokenized_query = nltk.word_tokenize(" ".join(keywords).lower())
        scores = bm25.get_scores(tokenized_query)

        # Sort papers by descending BM25 score and keep the most relevant.
        ranked_papers = sorted(zip(papers, scores), key=lambda x: x[1], reverse=True)
        return [paper for paper, _score in ranked_papers[:num_results]]

    except Exception as e:
        print(f"ERROR: {str(e)}")
        return [{"error": f"Error fetching research papers: {str(e)}"}]


# AI Model
model = HfApiModel(
    max_tokens=2096,  # NOTE(review): unusual limit — 2048 intended? confirm
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
    custom_role_conversions=None,
)

# Load prompt templates
with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)

# Create the AI Agent
agent = CodeAgent(
    model=model,
    tools=[fetch_latest_arxiv_papers],  # Properly registered tool
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name="ScholarAgent",
    description="An AI agent that fetches the latest research papers from arXiv based on user-defined keywords and filters.",
    prompt_templates=prompt_templates,
)
# Search Papers
def search_papers(user_input):
    """Parse comma-separated keywords, fetch papers via the arXiv tool,
    and format them as Markdown for display in the Gradio UI.

    Args:
        user_input: Raw comma-separated keyword string from the textbox.

    Returns:
        A Markdown-formatted results string, or a human-readable error
        message when input is empty or the tool reports an error.
    """
    keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()]  # Ensure valid keywords
    print(f"DEBUG: Received input keywords - {keywords}")  # Debug user input

    if not keywords:
        print("DEBUG: No valid keywords provided.")
        return "Error: Please enter at least one valid keyword."

    results = fetch_latest_arxiv_papers(keywords, num_results=3)  # Fetch 3 results
    print(f"DEBUG: Results received - {results}")  # Debug function output

    # Surface tool-level errors directly to the user. The isinstance guard
    # fixes a latent TypeError: `"error" in results[0]` would fail if the
    # first element were not a dict/str.
    if results and isinstance(results, list) and isinstance(results[0], dict):
        if "error" in results[0]:
            return results[0]["error"]

        # Format valid papers with readable Markdown sections.
        sections = []
        for paper in results:
            abstract = paper["abstract"]
            # Bug fix: only truncate (and say so) when the abstract is
            # actually longer than 500 characters — the original appended
            # the "truncated" marker unconditionally.
            if len(abstract) > 500:
                abstract = abstract[:500] + "... *(truncated for readability)*"
            sections.append(
                "---\n\n"
                f"📌 **Title:** {paper['title']}\n\n"
                f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
                f"📅 **Year:** {paper['year']}\n\n"
                f"📖 **Abstract:** {abstract}\n\n"
                f"[🔗 Read Full Paper]({paper['link']})\n\n"
            )
        return "\n\n".join(sections)

    print("DEBUG: No results found.")
    return "No results found. Try different keywords."
# Assemble the Gradio front-end: a keyword textbox, a search button, and a
# Markdown panel that receives the formatted results from search_papers.
with gr.Blocks() as demo:
    gr.Markdown("# ScholarAgent")
    query_box = gr.Textbox(
        label="Enter keywords (comma-separated)",
        placeholder="e.g., deep learning, reinforcement learning",
    )
    results_panel = gr.Markdown()
    run_button = gr.Button("Search")

    # Wire the button click to the search handler.
    run_button.click(search_papers, inputs=[query_box], outputs=[results_panel])

    print("DEBUG: Gradio UI is running. Waiting for user input...")

# Start serving the application.
demo.launch()