# # import feedparser
# # import urllib.parse
# # import yaml
# # import gradio as gr
# # from smolagents import CodeAgent, HfApiModel, tool
# # from tools.final_answer import FinalAnswerTool

# # @tool
# # def fetch_latest_arxiv_papers(keywords: list, num_results: int = 3) -> list:
# #     """Fetches the latest research papers from arXiv based on provided keywords.
# #
# #     Args:
# #         keywords: A list of keywords to search for relevant papers.
# #         num_results: The number of papers to fetch (default is 3).
# #
# #     Returns:
# #         A list of dictionaries containing:
# #             - "title": The title of the research paper.
# #             - "authors": The authors of the paper.
# #             - "year": The publication year.
# #             - "abstract": A summary of the research paper.
# #             - "link": A direct link to the paper on arXiv.
# #     """
# #     try:
# #         print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")  # Debug input
# #
# #         # Properly format query with +AND+ for multiple keywords
# #         query = "+AND+".join([f"all:{kw}" for kw in keywords])
# #         query_encoded = urllib.parse.quote(query)  # Encode spaces and special characters
# #
# #         url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results={num_results}&sortBy=submittedDate&sortOrder=descending"
# #         print(f"DEBUG: Query URL - {url}")  # Debug URL
# #
# #         feed = feedparser.parse(url)
# #
# #         papers = []
# #         for entry in feed.entries:
# #             papers.append({
# #                 "title": entry.title,
# #                 "authors": ", ".join(author.name for author in entry.authors),
# #                 "year": entry.published[:4],  # Extract year
# #                 "abstract": entry.summary,
# #                 "link": entry.link
# #             })
# #
# #         return papers
# #
# #     except Exception as e:
# #         print(f"ERROR: {str(e)}")  # Debug errors
# #         return [f"Error fetching research papers: {str(e)}"]


# # """------Applied BM25 search for paper retrieval------"""
# # from rank_bm25 import BM25Okapi
# # import nltk
# # import os
# # import shutil

# # nltk_data_path = os.path.join(nltk.data.path[0], "tokenizers", "punkt")
# # if os.path.exists(nltk_data_path):
# #     shutil.rmtree(nltk_data_path)  # Remove corrupted version
# #     print("Removed old NLTK 'punkt' data. Reinstalling...")

# # # Step 2: Download the correct 'punkt' tokenizer
# # nltk.download("punkt_tab")
# # print("Successfully installed 'punkt'!")

# # @tool  # Register the function properly as a SmolAgents tool
# # def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
# #     """Fetches and ranks arXiv papers using BM25 keyword relevance.
# #
# #     Args:
# #         keywords: List of keywords for search.
# #         num_results: Number of results to return.
# #
# #     Returns:
# #         List of the most relevant papers based on BM25 ranking.
# #     """
# #     try:
# #         print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
# #
# #         # Use a general keyword search (without `ti:` and `abs:`)
# #         query = "+AND+".join([f"all:{kw}" for kw in keywords])
# #         query_encoded = urllib.parse.quote(query)
# #         url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
# #         print(f"DEBUG: Query URL - {url}")
# #
# #         feed = feedparser.parse(url)
# #         papers = []
# #
# #         # Extract papers from arXiv
# #         for entry in feed.entries:
# #             papers.append({
# #                 "title": entry.title,
# #                 "authors": ", ".join(author.name for author in entry.authors),
# #                 "year": entry.published[:4],
# #                 "abstract": entry.summary,
# #                 "link": entry.link
# #             })
# #
# #         if not papers:
# #             return [{"error": "No results found. Try different keywords."}]
# #
# #         # Apply BM25 ranking
# #         tokenized_corpus = [nltk.word_tokenize(paper["title"].lower() + " " + paper["abstract"].lower()) for paper in papers]
# #         bm25 = BM25Okapi(tokenized_corpus)
# #         tokenized_query = nltk.word_tokenize(" ".join(keywords).lower())
# #         scores = bm25.get_scores(tokenized_query)
# #
# #         # Sort papers based on BM25 score
# #         ranked_papers = sorted(zip(papers, scores), key=lambda x: x[1], reverse=True)
# #
# #         # Return the most relevant ones
# #         return [paper[0] for paper in ranked_papers[:num_results]]
# #
# #     except Exception as e:
# #         print(f"ERROR: {str(e)}")
# #         return [{"error": f"Error fetching research papers: {str(e)}"}]


# """------Applied TF-IDF for better semantic search------"""
# import feedparser
# import urllib.parse
# import yaml
# from tools.final_answer import FinalAnswerTool
# import numpy as np
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# import gradio as gr
# from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool
# import nltk
# import datetime
# import requests
# import pytz
# from tools.final_answer import FinalAnswerTool
# from Gradio_UI import GradioUI

# nltk.download("stopwords")
# from nltk.corpus import stopwords

# @tool  # ✅ Register the function properly as a SmolAgents tool
# def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
#     """Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity.
#
#     Args:
#         keywords: List of keywords for search.
#         num_results: Number of results to return.
#
#     Returns:
#         List of the most relevant papers based on TF-IDF ranking.
#     """
#     try:
#         print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
#
#         # Use a general keyword search
#         query = "+AND+".join([f"all:{kw}" for kw in keywords])
#         query_encoded = urllib.parse.quote(query)
#         url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
#         print(f"DEBUG: Query URL - {url}")
#
#         feed = feedparser.parse(url)
#         papers = []
#
#         # Extract papers from arXiv
#         for entry in feed.entries:
#             papers.append({
#                 "title": entry.title,
#                 "authors": ", ".join(author.name for author in entry.authors),
#                 "year": entry.published[:4],
#                 "abstract": entry.summary,
#                 "link": entry.link
#             })
#
#         if not papers:
#             return [{"error": "No results found. Try different keywords."}]
#
#         # Prepare TF-IDF vectorization
#         corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
#         vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))  # Remove stopwords
#         tfidf_matrix = vectorizer.fit_transform(corpus)
#
#         # Transform query into a TF-IDF vector
#         query_str = " ".join(keywords)
#         query_vec = vectorizer.transform([query_str])
#
#         # Compute cosine similarity
#         similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
#
#         # Sort papers based on similarity score
#         ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)
#
#         # Return the most relevant papers
#         return [paper[0] for paper in ranked_papers[:num_results]]
#
#     except Exception as e:
#         print(f"ERROR: {str(e)}")
#         return [{"error": f"Error fetching research papers: {str(e)}"}]


# @tool
# def get_current_time_in_timezone(timezone: str) -> str:
#     """A tool that fetches the current local time in a specified timezone.
#
#     Args:
#         timezone: A string representing a valid timezone (e.g., 'America/New_York').
#     """
#     try:
#         # Create timezone object
#         tz = pytz.timezone(timezone)
#         # Get current time in that timezone
#         local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
#         return f"The current local time in {timezone} is: {local_time}"
#     except Exception as e:
#         return f"Error fetching time for timezone '{timezone}': {str(e)}"


# final_answer = FinalAnswerTool()

# # AI Model
# model = HfApiModel(
#     max_tokens=2096,
#     temperature=0.5,
#     model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
#     custom_role_conversions=None,
# )

# # Import tool from Hub
# image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)

# # Load prompt templates
# with open("prompts.yaml", 'r') as stream:
#     prompt_templates = yaml.safe_load(stream)

# # Create the AI Agent
# agent = CodeAgent(
#     model=model,
#     tools=[final_answer, fetch_latest_arxiv_papers],  # Add your tools here
#     max_steps=6,
#     verbosity_level=1,
#     grammar=None,
#     planning_interval=None,
#     name="ScholarAgent",
#     description="An AI agent that fetches the latest research papers from arXiv based on user-defined keywords and filters.",
#     prompt_templates=prompt_templates
# )

# # Search papers
# def search_papers(user_input):
#     keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()]  # Ensure valid keywords
#     print(f"DEBUG: Received input keywords - {keywords}")  # Debug user input
#
#     if not keywords:
#         print("DEBUG: No valid keywords provided.")
#         return "Error: Please enter at least one valid keyword."
#
#     results = fetch_latest_arxiv_papers(keywords, num_results=3)  # Fetch 3 results
#     print(f"DEBUG: Results received - {results}")  # Debug function output
#
#     # Check if the API returned an error
#     if isinstance(results, list) and len(results) > 0 and "error" in results[0]:
#         return results[0]["error"]  # Return the error message directly
#
#     # Format results only if valid papers exist
#     if isinstance(results, list) and results and isinstance(results[0], dict):
#         formatted_results = "\n\n".join([
#             f"---\n\n"
#             f"📌 **Title:** {paper['title']}\n\n"
#             f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
#             f"📅 **Year:** {paper['year']}\n\n"
#             f"📖 **Abstract:** {paper['abstract'][:500]}... *(truncated for readability)*\n\n"
#             f"[🔗 Read Full Paper]({paper['link']})\n\n"
#             for paper in results
#         ])
#         return formatted_results
#
#     print("DEBUG: No results found.")
#     return "No results found. Try different keywords."

# # Create Gradio UI
# with gr.Blocks() as demo:
#     gr.Markdown("# ScholarAgent")
#     keyword_input = gr.Textbox(label="Enter keywords (comma-separated) or even full sentences", placeholder="e.g., deep learning, reinforcement learning or NLP in finance or deep learning in medicine")
#     output_display = gr.Markdown()
#     search_button = gr.Button("Search")
#     search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])
#     print("DEBUG: Gradio UI is running. Waiting for user input...")

# # Launch Gradio App
# demo.launch()


"""------Enhanced ScholarAgent with Fixes and Features------"""
import feedparser
import urllib.parse
import yaml
import requests
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
from smolagents import CodeAgent, HfApiModel, tool
import nltk
from transformers import pipeline

# ✅ Ensure necessary NLTK data is downloaded
nltk.download("stopwords")
nltk.download("punkt")
from nltk.corpus import stopwords

# ✅ Summarization pipeline (loaded for later use; not called in the code below)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")


@tool
def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
    """
    Fetches and ranks arXiv papers using optimized TF-IDF and Cosine Similarity.

    Args:
        keywords: List of keywords for search.
        num_results: Number of results to return.

    Returns:
        List of the most relevant papers based on TF-IDF ranking.
    """
    try:
        # ✅ Correct URL encoding for spaces and special characters
        query = "+AND+".join([f"all:{kw}" for kw in keywords])
        query_encoded = urllib.parse.quote_plus(query)  # ✅ FIXED: Correct encoding
        url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=5&sortBy=submittedDate&sortOrder=descending"
        print(f"DEBUG: Query URL - {url}")  # ✅ Debugging

        feed = feedparser.parse(url)
        print(f"DEBUG: API Response - {feed.entries}")

        papers = []
        for entry in feed.entries:
            papers.append({
                "title": entry.title,
                "authors": ", ".join(author.name for author in entry.authors),
                "year": entry.published[:4],
                "abstract": entry.summary,
                "link": entry.link
            })

        if not papers:
            print("DEBUG: No results from arXiv API")
            return [{"error": "No results found. Try different keywords."}]

        # ✅ Debug corpus before TF-IDF
        corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
        print(f"DEBUG: Corpus - {corpus}")

        vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=2000)
        tfidf_matrix = vectorizer.fit_transform(corpus)
        print(f"DEBUG: TF-IDF Matrix Shape - {tfidf_matrix.shape}")

        query_vec = vectorizer.transform([" ".join(keywords)])
        print(f"DEBUG: Query Vector Shape - {query_vec.shape}")

        similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
        print(f"DEBUG: Similarity Scores - {similarity_scores}")

        # ✅ Rank papers by similarity score
        ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)

        return [paper[0] for paper in ranked_papers[:num_results]]

    except Exception as e:
        print(f"ERROR: {str(e)}")
        return [{"error": f"Error fetching research papers: {str(e)}"}]


@tool
def get_citation_count(paper_title: str) -> int:
    """
    Fetches citation count from Semantic Scholar API.

    Args:
        paper_title (str): Title of the research paper.

    Returns:
        int: Citation count (default 0 if not found).
    """
    try:
        base_url = "https://api.semanticscholar.org/graph/v1/paper/search"
        params = {"query": paper_title, "fields": "citationCount"}
        response = requests.get(base_url, params=params, timeout=10).json()  # timeout keeps the tool from hanging

        if "data" in response and response["data"]:
            return response["data"][0].get("citationCount", 0)
        return 0  # Default to 0 if no data found

    except Exception as e:
        print(f"ERROR fetching citation count: {e}")
        return 0


@tool
def rank_papers_by_citations(papers: list) -> list:
    """
    Ranks papers based on citation count and TF-IDF similarity.

    Args:
        papers (list): List of research papers.

    Returns:
        list: Papers sorted by citation count and TF-IDF score.
    """
    for paper in papers:
        paper["citations"] = get_citation_count(paper["title"])
    return sorted(papers, key=lambda x: (x["citations"], x.get("tfidf_score", 0)), reverse=True)


# ✅ AI Model
model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
    custom_role_conversions=None,
)

# ✅ Load prompt templates
with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)

# ✅ Create the AI Agent
agent = CodeAgent(
    model=model,
    tools=[fetch_latest_arxiv_papers, get_citation_count, rank_papers_by_citations],
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name="ScholarAgent",
    description="An AI agent that fetches and ranks the latest research papers based on citations and relevance.",
    prompt_templates=prompt_templates
)

# ✅ Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# ScholarAgent")
    keyword_input = gr.Textbox(label="Enter keywords or full sentences", placeholder="e.g., deep learning, reinforcement learning")
    output_display = gr.Markdown()
    search_button = gr.Button("Search")

    def search_papers(user_input):
        keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()]
        results = fetch_latest_arxiv_papers(keywords, num_results=3)

        if isinstance(results, list) and results and "error" in results[0]:
            return results[0]["error"]

        # Citations default to 0 here; they are only filled in when rank_papers_by_citations is applied.
        return "\n\n".join([
            f"---\n\n"
            f"📌 **Title:** {paper['title']}\n\n"
            f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
            f"📅 **Year:** {paper['year']}\n\n"
            f"📖 **Abstract:** {paper.get('abstract', 'No abstract available')[:500]}... *(truncated)*\n\n"
            f"🔢 **Citations:** {paper.get('citations', 0)}\n\n"
            f"[🔗 Read Full Paper]({paper['link']})\n\n"
            for paper in results
        ])

    search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])

print("DEBUG: Gradio UI is running. Waiting for user input...")

# ✅ Launch Gradio App
demo.launch()