pdx97 committed
Commit bde3c06 · verified · 1 Parent(s): acce4ba

Update app.py

Files changed (1)
  1. app.py +81 -273
app.py CHANGED
@@ -126,209 +126,31 @@
  # # return [{"error": f"Error fetching research papers: {str(e)}"}]


- # """------Applied TF-IDF for better semantic search------"""
- # import feedparser
- # import urllib.parse
- # import yaml
- # from tools.final_answer import FinalAnswerTool
- # import numpy as np
- # from sklearn.feature_extraction.text import TfidfVectorizer
- # from sklearn.metrics.pairwise import cosine_similarity
- # import gradio as gr
- # from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
- # import nltk
-
- # import datetime
- # import requests
- # import pytz
- # from tools.final_answer import FinalAnswerTool
-
- # from Gradio_UI import GradioUI
-
- # nltk.download("stopwords")
- # from nltk.corpus import stopwords
-
- # @tool # ✅ Register the function properly as a SmolAgents tool
- # def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
- # """Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity.
-
- # Args:
- # keywords: List of keywords for search.
- # num_results: Number of results to return.
-
- # Returns:
- # List of the most relevant papers based on TF-IDF ranking.
- # """
- # try:
- # print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
-
- # # Use a general keyword search
- # query = "+AND+".join([f"all:{kw}" for kw in keywords])
- # query_encoded = urllib.parse.quote(query)
- # url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
-
- # print(f"DEBUG: Query URL - {url}")
-
- # feed = feedparser.parse(url)
- # papers = []
-
- # # Extract papers from arXiv
- # for entry in feed.entries:
- # papers.append({
- # "title": entry.title,
- # "authors": ", ".join(author.name for author in entry.authors),
- # "year": entry.published[:4],
- # "abstract": entry.summary,
- # "link": entry.link
- # })
-
- # if not papers:
- # return [{"error": "No results found. Try different keywords."}]
-
- # # Prepare TF-IDF Vectorization
- # corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
- # vectorizer = TfidfVectorizer(stop_words=stopwords.words('english')) # Remove stopwords
- # tfidf_matrix = vectorizer.fit_transform(corpus)
-
- # # Transform Query into TF-IDF Vector
- # query_str = " ".join(keywords)
- # query_vec = vectorizer.transform([query_str])
-
- # #Compute Cosine Similarity
- # similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
-
- # #Sort papers based on similarity score
- # ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)
-
- # # Return the most relevant papers
- # return [paper[0] for paper in ranked_papers[:num_results]]
-
- # except Exception as e:
- # print(f"ERROR: {str(e)}")
- # return [{"error": f"Error fetching research papers: {str(e)}"}]
- # @tool
- # def get_current_time_in_timezone(timezone: str) -> str:
- # """A tool that fetches the current local time in a specified timezone.
- # Args:
- # timezone: A string representing a valid timezone (e.g., 'America/New_York').
- # """
- # try:
- # # Create timezone object
- # tz = pytz.timezone(timezone)
- # # Get current time in that timezone
- # local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
- # return f"The current local time in {timezone} is: {local_time}"
- # except Exception as e:
- # return f"Error fetching time for timezone '{timezone}': {str(e)}"
-
-
- # final_answer = FinalAnswerTool()
-
-
- # # AI Model
- # model = HfApiModel(
- # max_tokens=2096,
- # temperature=0.5,
- # model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
- # custom_role_conversions=None,
- # )
-
- # # Import tool from Hub
- # image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
-
-
- # # Load prompt templates
- # with open("prompts.yaml", 'r') as stream:
- # prompt_templates = yaml.safe_load(stream)
-
- # # Create the AI Agent
- # agent = CodeAgent(
- # model=model,
- # tools=[final_answer,fetch_latest_arxiv_papers], # Add your tools here
- # max_steps=6,
- # verbosity_level=1,
- # grammar=None,
- # planning_interval=None,
- # name="ScholarAgent",
- # description="An AI agent that fetches the latest research papers from arXiv based on user-defined keywords and filters.",
- # prompt_templates=prompt_templates
- # )
-
-
-
- # #Search Papers
- # def search_papers(user_input):
- # keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()] # Ensure valid keywords
- # print(f"DEBUG: Received input keywords - {keywords}") # Debug user input
-
- # if not keywords:
- # print("DEBUG: No valid keywords provided.")
- # return "Error: Please enter at least one valid keyword."
-
- # results = fetch_latest_arxiv_papers(keywords, num_results=3) # Fetch 3 results
- # print(f"DEBUG: Results received - {results}") # Debug function output
-
- # # Check if the API returned an error
- # if isinstance(results, list) and len(results) > 0 and "error" in results[0]:
- # return results[0]["error"] # Return the error message directly
-
- # # Format results only if valid papers exist
- # if isinstance(results, list) and results and isinstance(results[0], dict):
- # formatted_results = "\n\n".join([
- # f"---\n\n"
- # f"📌 **Title:** {paper['title']}\n\n"
- # f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
- # f"📅 **Year:** {paper['year']}\n\n"
- # f"📖 **Abstract:** {paper['abstract'][:500]}... *(truncated for readability)*\n\n"
- # f"[🔗 Read Full Paper]({paper['link']})\n\n"
- # for paper in results
- # ])
- # return formatted_results
-
- # print("DEBUG: No results found.")
- # return "No results found. Try different keywords."
-
-
-
- # # Create Gradio UI
- # with gr.Blocks() as demo:
- # gr.Markdown("# ScholarAgent")
- # keyword_input = gr.Textbox(label="Enter keywords(comma-separated) or even full sentences ", placeholder="e.g., deep learning, reinforcement learning or NLP in finance or Deep learning in Medicine")
- # output_display = gr.Markdown()
- # search_button = gr.Button("Search")
-
- # search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])
-
- # print("DEBUG: Gradio UI is running. Waiting for user input...")
-
- # # Launch Gradio App
- # demo.launch()
-
- """------Enhanced ScholarAgent with Fixes and Features-----"""
  import feedparser
  import urllib.parse
  import yaml
- import requests
  import numpy as np
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity
  import gradio as gr
- from smolagents import CodeAgent, HfApiModel, tool
  import nltk
- from transformers import pipeline

- # ✅ Ensure necessary NLTK data is downloaded
  nltk.download("stopwords")
- nltk.download("punkt")
  from nltk.corpus import stopwords

- # ✅ GPT Summarization Pipeline
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-
- @tool
  def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
-     """
-     Fetches and ranks arXiv papers using optimized TF-IDF and Cosine Similarity.

      Args:
          keywords: List of keywords for search.
@@ -338,16 +160,19 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
          List of the most relevant papers based on TF-IDF ranking.
      """
      try:
-         # Encode query properly
-         query = "+AND+".join([f"all:{kw}" for kw in keywords])
-         query_encoded = urllib.parse.quote_plus(query)

-         url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=5&sortBy=submittedDate&sortOrder=descending"
-         print(f"DEBUG: Query URL - {url}")

          feed = feedparser.parse(url)
          papers = []

          for entry in feed.entries:
              papers.append({
                  "title": entry.title,
@@ -358,77 +183,49 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
              })

          if not papers:
-             print("DEBUG: No results from ArXiv API")
              return [{"error": "No results found. Try different keywords."}]

-         # TF-IDF Vectorization
          corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
-         vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=2000)
          tfidf_matrix = vectorizer.fit_transform(corpus)

-         query_vec = vectorizer.transform([" ".join(keywords)])
          similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

          ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)

-         # Apply GPT Summarization with Fallback
-         for paper, _ in ranked_papers:
-             try:
-                 paper["summary"] = summarizer(paper["abstract"], max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
-             except:
-                 paper["summary"] = paper["abstract"][:300] + "..." # ✅ Fallback: First 300 characters of abstract
-
          return [paper[0] for paper in ranked_papers[:num_results]]

      except Exception as e:
          print(f"ERROR: {str(e)}")
          return [{"error": f"Error fetching research papers: {str(e)}"}]
-
-
-
-
  @tool
- def get_citation_count(paper_title: str) -> int:
-     """
-     Fetches citation count from Semantic Scholar API.
-
      Args:
-         paper_title (str): Title of the research paper.
-
-     Returns:
-         int: Citation count (default 0 if not found).
      """
      try:
-         base_url = "https://api.semanticscholar.org/graph/v1/paper/search"
-         params = {"query": paper_title, "fields": "citationCount"}
-         response = requests.get(base_url, params=params).json()
-
-         if "data" in response and response["data"]:
-             return response["data"][0].get("citationCount", 0)
-         return 0 # Default to 0 if no data found
-
      except Exception as e:
-         print(f"ERROR fetching citation count: {e}")
-         return 0


- @tool
- def rank_papers_by_citations(papers: list) -> list:
-     """
-     Ranks papers based on citation count and TF-IDF similarity.
-
-     Args:
-         papers (list): List of research papers.

-     Returns:
-         list: Papers sorted by citation count and TF-IDF score.
-     """
-     for paper in papers:
-         paper["citations"] = get_citation_count(paper["title"])
-     return sorted(papers, key=lambda x: (x["citations"], x.get("tfidf_score", 0)), reverse=True)

-
- # ✅ AI Model
  model = HfApiModel(
      max_tokens=2096,
      temperature=0.5,
@@ -436,65 +233,76 @@ model = HfApiModel(
      custom_role_conversions=None,
  )

- # Load prompt templates
  with open("prompts.yaml", 'r') as stream:
      prompt_templates = yaml.safe_load(stream)

- # Create the AI Agent
  agent = CodeAgent(
      model=model,
-     tools=[fetch_latest_arxiv_papers, get_citation_count, rank_papers_by_citations],
      max_steps=6,
      verbosity_level=1,
      grammar=None,
      planning_interval=None,
      name="ScholarAgent",
-     description="An AI agent that fetches and ranks the latest research papers based on citations and relevance.",
      prompt_templates=prompt_templates
  )


- # ✅ Gradio UI
- with gr.Blocks() as demo:
-     gr.Markdown("# ScholarAgent")
-     keyword_input = gr.Textbox(label="Enter keywords or full sentences", placeholder="e.g., deep learning, reinforcement learning")
-     output_display = gr.Markdown()
-     search_button = gr.Button("Search")

-     def search_papers(user_input, year_range, min_citations):
-         keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()]
-         print(f"DEBUG: Received input keywords - {keywords}")

-         if not keywords:
-             print("DEBUG: No valid keywords provided.")
-             return "Error: Please enter at least one valid keyword."
-
-         results = fetch_latest_arxiv_papers(keywords, num_results=5, year_range=year_range, min_citations=int(min_citations))
-         print(f"DEBUG: Results received - {results}")
-
-         # If results are empty or an error occurred, display an error message
-         if not results or isinstance(results, list) and "error" in results[0]:
-             print(f"DEBUG: Error in fetching results - {results[0]['error']}")
-             return results[0]["error"] if results else "No results found. Try different keywords."

-         # Format output
          formatted_results = "\n\n".join([
              f"📌 **Title:** {paper['title']}\n\n"
              f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
              f"📅 **Year:** {paper['year']}\n\n"
-             f"📖 **Summary:** {paper['summary'] if 'summary' in paper else 'No summary available'}\n\n"
-             f"🔢 **Citations:** {paper['citations']}\n\n"
              f"[🔗 Read Full Paper]({paper['link']})\n\n"
              for paper in results
          ])
-         print(f"DEBUG: Formatted Results - {formatted_results}")
          return formatted_results

-
      search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])
      print("DEBUG: Gradio UI is running. Waiting for user input...")

- # Launch Gradio App
  demo.launch()


@@ -126,209 +126,31 @@
  # # return [{"error": f"Error fetching research papers: {str(e)}"}]


+ """------Applied TF-IDF for better semantic search------"""
  import feedparser
  import urllib.parse
  import yaml
+ from tools.final_answer import FinalAnswerTool
  import numpy as np
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity
  import gradio as gr
+ from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
  import nltk

+ import datetime
+ import requests
+ import pytz
+ from tools.final_answer import FinalAnswerTool
+
+ from Gradio_UI import GradioUI
+
  nltk.download("stopwords")
  from nltk.corpus import stopwords

+ @tool # ✅ Register the function properly as a SmolAgents tool
  def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
+     """Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity.

      Args:
          keywords: List of keywords for search.
@@ -338,16 +160,19 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
          List of the most relevant papers based on TF-IDF ranking.
      """
      try:
+         print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")

+         # Use a general keyword search
+         query = "+AND+".join([f"all:{kw}" for kw in keywords])
+         query_encoded = urllib.parse.quote(query)
+         url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
+
+         print(f"DEBUG: Query URL - {url}")

          feed = feedparser.parse(url)
          papers = []

+         # Extract papers from arXiv
          for entry in feed.entries:
              papers.append({
                  "title": entry.title,
@@ -358,77 +183,49 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
              })

          if not papers:
              return [{"error": "No results found. Try different keywords."}]

+         # Prepare TF-IDF Vectorization
          corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
+         vectorizer = TfidfVectorizer(stop_words=stopwords.words('english')) # Remove stopwords
          tfidf_matrix = vectorizer.fit_transform(corpus)

+         # Transform Query into TF-IDF Vector
+         query_str = " ".join(keywords)
+         query_vec = vectorizer.transform([query_str])
+
+         #Compute Cosine Similarity
          similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

+         #Sort papers based on similarity score
          ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)

+         # Return the most relevant papers
          return [paper[0] for paper in ranked_papers[:num_results]]

      except Exception as e:
          print(f"ERROR: {str(e)}")
          return [{"error": f"Error fetching research papers: {str(e)}"}]
  @tool
+ def get_current_time_in_timezone(timezone: str) -> str:
+     """A tool that fetches the current local time in a specified timezone.
      Args:
+         timezone: A string representing a valid timezone (e.g., 'America/New_York').
      """
      try:
+         # Create timezone object
+         tz = pytz.timezone(timezone)
+         # Get current time in that timezone
+         local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
+         return f"The current local time in {timezone} is: {local_time}"
      except Exception as e:
+         return f"Error fetching time for timezone '{timezone}': {str(e)}"


+ final_answer = FinalAnswerTool()


+ # AI Model
  model = HfApiModel(
      max_tokens=2096,
      temperature=0.5,
@@ -436,65 +233,76 @@ model = HfApiModel(
      custom_role_conversions=None,
  )

+ # Import tool from Hub
+ image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
+
+
+ # Load prompt templates
  with open("prompts.yaml", 'r') as stream:
      prompt_templates = yaml.safe_load(stream)

+ # Create the AI Agent
  agent = CodeAgent(
      model=model,
+     tools=[final_answer,fetch_latest_arxiv_papers], # Add your tools here
      max_steps=6,
      verbosity_level=1,
      grammar=None,
      planning_interval=None,
      name="ScholarAgent",
+     description="An AI agent that fetches the latest research papers from arXiv based on user-defined keywords and filters.",
      prompt_templates=prompt_templates
  )



+ #Search Papers
+ def search_papers(user_input):
+     keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()] # Ensure valid keywords
+     print(f"DEBUG: Received input keywords - {keywords}") # Debug user input

+     if not keywords:
+         print("DEBUG: No valid keywords provided.")
+         return "Error: Please enter at least one valid keyword."

+     results = fetch_latest_arxiv_papers(keywords, num_results=3) # Fetch 3 results
+     print(f"DEBUG: Results received - {results}") # Debug function output
+
+     # Check if the API returned an error
+     if isinstance(results, list) and len(results) > 0 and "error" in results[0]:
+         return results[0]["error"] # Return the error message directly
+
+     # Format results only if valid papers exist
+     if isinstance(results, list) and results and isinstance(results[0], dict):
          formatted_results = "\n\n".join([
+             f"---\n\n"
              f"📌 **Title:** {paper['title']}\n\n"
              f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
              f"📅 **Year:** {paper['year']}\n\n"
+             f"📖 **Abstract:** {paper['abstract'][:500]}... *(truncated for readability)*\n\n"
              f"[🔗 Read Full Paper]({paper['link']})\n\n"
              for paper in results
          ])
          return formatted_results

+     print("DEBUG: No results found.")
+     return "No results found. Try different keywords."
+
+
+
+ # Create Gradio UI
+ with gr.Blocks() as demo:
+     gr.Markdown("# ScholarAgent")
+     keyword_input = gr.Textbox(label="Enter keywords(comma-separated) or even full sentences ", placeholder="e.g., deep learning, reinforcement learning or NLP in finance or Deep learning in Medicine")
+     output_display = gr.Markdown()
+     search_button = gr.Button("Search")
+
      search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])
+
      print("DEBUG: Gradio UI is running. Waiting for user input...")

+ # Launch Gradio App
  demo.launch()


+