pdx97 committed on
Commit
ebd9098
·
verified ·
1 Parent(s): 193f6e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -30
app.py CHANGED
@@ -49,54 +49,59 @@ from smolagents import CodeAgent, HfApiModel, tool
49
  # print(f"ERROR: {str(e)}") # Debug errors
50
  # return [f"Error fetching research papers: {str(e)}"]
51
 
52
- @tool
 
 
 
 
53
  def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
54
- """Fetches the latest research papers from arXiv based on provided keywords.
55
 
56
  Args:
57
- keywords: A list of keywords to search for relevant papers.
58
- num_results: The number of papers to fetch (default is 5).
59
 
60
  Returns:
61
- A list of dictionaries containing:
62
- - "title": The title of the research paper.
63
- - "authors": The authors of the paper.
64
- - "year": The publication year.
65
- - "abstract": A summary of the research paper.
66
- - "link": A direct link to the paper on arXiv.
67
  """
68
  try:
69
  print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
70
 
71
- # Format query using "AND" to enforce strict keyword presence
72
- query = "+AND+".join([f"ti:{kw}+OR+abs:{kw}" for kw in keywords])
73
- query_encoded = urllib.parse.quote(query) # Encode spaces and special characters
74
-
75
- url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=20&sortBy=submittedDate&sortOrder=descending"
76
 
77
  print(f"DEBUG: Query URL - {url}")
78
 
79
  feed = feedparser.parse(url)
80
  papers = []
81
 
 
82
  for entry in feed.entries:
83
- title = entry.title.lower()
84
- abstract = entry.summary.lower()
 
 
 
 
 
85
 
86
- # Ensure at least one keyword appears in the title or abstract
87
- if any(kw.lower() in title or kw.lower() in abstract for kw in keywords):
88
- papers.append({
89
- "title": entry.title,
90
- "authors": ", ".join(author.name for author in entry.authors),
91
- "year": entry.published[:4], # Extract year
92
- "abstract": entry.summary,
93
- "link": entry.link
94
- })
95
 
96
- # Sort papers: First prioritize keyword in title, then abstract
97
- papers.sort(key=lambda x: sum(kw.lower() in x["title"].lower() for kw in keywords), reverse=True)
 
98
 
99
- return papers[:num_results] # Return top-matching papers
 
 
 
 
 
 
 
100
 
101
  except Exception as e:
102
  print(f"ERROR: {str(e)}")
@@ -104,7 +109,6 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
104
 
105
 
106
 
107
-
108
  # AI Model
109
  model = HfApiModel(
110
  max_tokens=2096,
 
49
  # print(f"ERROR: {str(e)}") # Debug errors
50
  # return [f"Error fetching research papers: {str(e)}"]
51
 
52
+ from rank_bm25 import BM25Okapi
53
+ import nltk
54
+
55
+ nltk.download('punkt')
56
+
57
  def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
58
+ """Fetches and ranks arXiv papers using BM25 keyword relevance.
59
 
60
  Args:
61
+ keywords: List of keywords for search.
62
+ num_results: Number of results to return.
63
 
64
  Returns:
65
+ List of the most relevant papers based on BM25 ranking.
 
 
 
 
 
66
  """
67
  try:
68
  print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
69
 
70
+ # Broadly search keywords in title and abstract
71
+ query = "+OR+".join([f"(ti:\"{kw}\"+OR+abs:\"{kw}\")" for kw in keywords])
72
+ query_encoded = urllib.parse.quote(query)
73
+ url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
 
74
 
75
  print(f"DEBUG: Query URL - {url}")
76
 
77
  feed = feedparser.parse(url)
78
  papers = []
79
 
80
+ # Extract papers from arXiv
81
  for entry in feed.entries:
82
+ papers.append({
83
+ "title": entry.title,
84
+ "authors": ", ".join(author.name for author in entry.authors),
85
+ "year": entry.published[:4],
86
+ "abstract": entry.summary,
87
+ "link": entry.link
88
+ })
89
 
90
+ if not papers:
91
+ return [{"error": "No results found. Try different keywords."}]
 
 
 
 
 
 
 
92
 
93
+ # Apply BM25 ranking
94
+ tokenized_corpus = [nltk.word_tokenize(paper["title"].lower() + " " + paper["abstract"].lower()) for paper in papers]
95
+ bm25 = BM25Okapi(tokenized_corpus)
96
 
97
+ tokenized_query = nltk.word_tokenize(" ".join(keywords).lower())
98
+ scores = bm25.get_scores(tokenized_query)
99
+
100
+ # Sort papers based on BM25 score
101
+ ranked_papers = sorted(zip(papers, scores), key=lambda x: x[1], reverse=True)
102
+
103
+ # Return the most relevant ones
104
+ return [paper[0] for paper in ranked_papers[:num_results]]
105
 
106
  except Exception as e:
107
  print(f"ERROR: {str(e)}")
 
109
 
110
 
111
 
 
112
  # AI Model
113
  model = HfApiModel(
114
  max_tokens=2096,