Update app.py
app.py CHANGED
@@ -303,7 +303,8 @@
 
 # # Launch Gradio App
 # demo.launch()
-
+
+"""------Enhanced ScholarAgent with Fixes and Features-----"""
 import feedparser
 import urllib.parse
 import yaml
@@ -314,13 +315,14 @@ from sklearn.metrics.pairwise import cosine_similarity
 import gradio as gr
 from smolagents import CodeAgent, HfApiModel, tool
 import nltk
+from transformers import pipeline
 
+# ✅ Ensure necessary NLTK data is downloaded
 nltk.download("stopwords")
 nltk.download("punkt")
 from nltk.corpus import stopwords
-from transformers import pipeline
 
-# GPT Summarization Pipeline
+# ✅ GPT Summarization Pipeline
 summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
 
 @tool
@@ -339,63 +341,100 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
             - "year" (str): The year of publication.
             - "abstract" (str): A summary of the paper.
             - "link" (str): A URL to the full paper.
+            - "citations" (int): Number of citations (from Semantic Scholar).
+            - "summary" (str): A GPT-generated summary of the abstract.
     """
     try:
+        # ✅ Construct the query for ArXiv API
+        query = "+AND+".join([f"all:{kw}" for kw in keywords])
         query_encoded = urllib.parse.quote(query)
         url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
 
+        # ✅ Fetch papers from ArXiv
        feed = feedparser.parse(url)
        papers = []
+
+        # ✅ Extract papers
        for entry in feed.entries:
+            paper = {
                 "title": entry.title,
                 "authors": ", ".join(author.name for author in entry.authors),
                 "year": entry.published[:4],
                 "abstract": entry.summary,
-                "link": entry.link
-            }
+                "link": entry.link,
+            }
+            paper["citations"] = get_citation_count(paper["title"])  # ✅ Fetch citation count
+            papers.append(paper)
+
        if not papers:
            return [{"error": "No results found. Try different keywords."}]
 
+        # ✅ TF-IDF Vectorization
        corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
        vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
        tfidf_matrix = vectorizer.fit_transform(corpus)
 
+        # ✅ Transform Query into TF-IDF Vector
        query_str = " ".join(keywords)
        query_vec = vectorizer.transform([query_str])
        similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
 
+        # ✅ Sort papers based on similarity score
        ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)
+
+        # ✅ Assign TF-IDF scores and generate summaries
+        for paper, score in ranked_papers:
+            paper["tfidf_score"] = score
             paper["summary"] = summarizer(paper["abstract"], max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
+
        return [paper for paper, _ in ranked_papers[:num_results]]
 
    except Exception as e:
        return [{"error": f"Error fetching research papers: {str(e)}"}]
 
+
 @tool
 def get_citation_count(paper_title: str) -> int:
-    """
+    """
+    Fetches citation count from Semantic Scholar API.
+
+    Args:
+        paper_title (str): Title of the research paper.
+
+    Returns:
+        int: Citation count (default 0 if not found).
+    """
     try:
+        base_url = "https://api.semanticscholar.org/graph/v1/paper/search"
+        params = {"query": paper_title, "fields": "citationCount"}
+        response = requests.get(base_url, params=params).json()
+
+        if "data" in response and response["data"]:
+            return response["data"][0].get("citationCount", 0)
+        return 0  # Default to 0 if no data found
+
+    except Exception as e:
+        print(f"ERROR fetching citation count: {e}")
         return 0
 
+
 @tool
 def rank_papers_by_citations(papers: list) -> list:
-    """
+    """
+    Ranks papers based on citation count and TF-IDF similarity.
+
+    Args:
+        papers (list): List of research papers.
+
+    Returns:
+        list: Papers sorted by citation count and TF-IDF score.
+    """
     for paper in papers:
         paper["citations"] = get_citation_count(paper["title"])
-    return sorted(papers, key=lambda x: (x["citations"], x
+    return sorted(papers, key=lambda x: (x["citations"], x.get("tfidf_score", 0)), reverse=True)
+
 
-# AI Model
+# ✅ AI Model
 model = HfApiModel(
     max_tokens=2096,
     temperature=0.5,
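
Note on the hunk above: the updated fetch_latest_arxiv_papers now builds the ArXiv search query explicitly from the keywords before URL-encoding it. A minimal standalone sketch of that query construction, using illustrative keyword values that are not part of the commit:

import urllib.parse

# Example keywords -- illustrative values only
keywords = ["deep learning", "reinforcement learning"]

# Same joining and encoding steps as the updated tool
query = "+AND+".join([f"all:{kw}" for kw in keywords])
query_encoded = urllib.parse.quote(query)
url = (
    "http://export.arxiv.org/api/query?"
    f"search_query={query_encoded}&start=0&max_results=50"
    "&sortBy=submittedDate&sortOrder=descending"
)

print(query)   # all:deep learning+AND+all:reinforcement learning
print(url)
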
@@ -403,11 +442,11 @@ model = HfApiModel(
     custom_role_conversions=None,
 )
 
-# Load prompt templates
+# ✅ Load prompt templates
 with open("prompts.yaml", 'r') as stream:
     prompt_templates = yaml.safe_load(stream)
 
-# Create the AI Agent
+# ✅ Create the AI Agent
 agent = CodeAgent(
     model=model,
     tools=[fetch_latest_arxiv_papers, get_citation_count, rank_papers_by_citations],
@@ -420,7 +459,8 @@ agent = CodeAgent(
     prompt_templates=prompt_templates
 )
 
-
+
+# ✅ Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("# ScholarAgent")
     keyword_input = gr.Textbox(label="Enter keywords or full sentences", placeholder="e.g., deep learning, reinforcement learning")
@@ -430,15 +470,17 @@ with gr.Blocks() as demo:
     def search_papers(user_input):
         keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()]
         results = fetch_latest_arxiv_papers(keywords, num_results=3)
+
+        if isinstance(results, list) and results and "error" in results[0]:
             return results[0]["error"]
+
         return "\n\n".join([
             f"---\n\n"
             f"📌 **Title:** {paper['title']}\n\n"
             f"👨🔬 **Authors:** {paper['authors']}\n\n"
             f"📅 **Year:** {paper['year']}\n\n"
-            f"📖 **Summary:** {paper
-            f"🔢 **Citations:** {paper
+            f"📖 **Summary:** {paper.get('summary', 'No summary available')[:500]}... *(truncated)*\n\n"
+            f"🔢 **Citations:** {paper.get('citations', 0)}\n\n"
             f"[🔗 Read Full Paper]({paper['link']})\n\n"
             for paper in results
         ])
@@ -446,6 +488,7 @@ with gr.Blocks() as demo:
     search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])
     print("DEBUG: Gradio UI is running. Waiting for user input...")
 
-# Launch Gradio App
+# ✅ Launch Gradio App
 demo.launch()
 
+
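
For reference, a minimal standalone sketch of the Semantic Scholar lookup that the updated get_citation_count tool performs, handy for checking the endpoint outside the Space. The paper title is an example value, the timeout is an extra safeguard not in the commit, and the sketch assumes the requests package is available (the updated tool calls requests.get; the import is not shown in these hunks):

import requests

def citation_count(paper_title: str) -> int:
    # Same endpoint and fields as the updated get_citation_count tool
    base_url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {"query": paper_title, "fields": "citationCount"}
    try:
        response = requests.get(base_url, params=params, timeout=10).json()
        if "data" in response and response["data"]:
            return response["data"][0].get("citationCount", 0)
        return 0  # default when the search returns no match
    except Exception as exc:
        print(f"ERROR fetching citation count: {exc}")
        return 0

# Example call with an illustrative title
print(citation_count("Attention Is All You Need"))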