Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -49,54 +49,59 @@ from smolagents import CodeAgent, HfApiModel, tool
|
|
49 |
# print(f"ERROR: {str(e)}") # Debug errors
|
50 |
# return [f"Error fetching research papers: {str(e)}"]
|
51 |
|
52 |
-
|
|
|
|
|
|
|
|
|
53 |
def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
|
54 |
-
"""Fetches
|
55 |
|
56 |
Args:
|
57 |
-
keywords:
|
58 |
-
num_results:
|
59 |
|
60 |
Returns:
|
61 |
-
|
62 |
-
- "title": The title of the research paper.
|
63 |
-
- "authors": The authors of the paper.
|
64 |
-
- "year": The publication year.
|
65 |
-
- "abstract": A summary of the research paper.
|
66 |
-
- "link": A direct link to the paper on arXiv.
|
67 |
"""
|
68 |
try:
|
69 |
print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
|
70 |
|
71 |
-
#
|
72 |
-
query = "+
|
73 |
-
query_encoded = urllib.parse.quote(query)
|
74 |
-
|
75 |
-
url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=20&sortBy=submittedDate&sortOrder=descending"
|
76 |
|
77 |
print(f"DEBUG: Query URL - {url}")
|
78 |
|
79 |
feed = feedparser.parse(url)
|
80 |
papers = []
|
81 |
|
|
|
82 |
for entry in feed.entries:
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
|
|
|
|
85 |
|
86 |
-
|
87 |
-
|
88 |
-
papers.append({
|
89 |
-
"title": entry.title,
|
90 |
-
"authors": ", ".join(author.name for author in entry.authors),
|
91 |
-
"year": entry.published[:4], # Extract year
|
92 |
-
"abstract": entry.summary,
|
93 |
-
"link": entry.link
|
94 |
-
})
|
95 |
|
96 |
-
#
|
97 |
-
|
|
|
98 |
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
except Exception as e:
|
102 |
print(f"ERROR: {str(e)}")
|
@@ -104,7 +109,6 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
|
|
104 |
|
105 |
|
106 |
|
107 |
-
|
108 |
# AI Model
|
109 |
model = HfApiModel(
|
110 |
max_tokens=2096,
|
|
|
49 |
# print(f"ERROR: {str(e)}") # Debug errors
|
50 |
# return [f"Error fetching research papers: {str(e)}"]
|
51 |
|
52 |
# BM25 ranking support: rank_bm25 provides the BM25Okapi scorer used to
# re-rank arXiv results locally by keyword relevance.
from rank_bm25 import BM25Okapi
import nltk

# One-time fetch of the NLTK "punkt" tokenizer models needed by
# nltk.word_tokenize(); executed at module import, before any request.
nltk.download('punkt')
|
56 |
+
|
57 |
def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
|
58 |
+
"""Fetches and ranks arXiv papers using BM25 keyword relevance.
|
59 |
|
60 |
Args:
|
61 |
+
keywords: List of keywords for search.
|
62 |
+
num_results: Number of results to return.
|
63 |
|
64 |
Returns:
|
65 |
+
List of the most relevant papers based on BM25 ranking.
|
|
|
|
|
|
|
|
|
|
|
66 |
"""
|
67 |
try:
|
68 |
print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
|
69 |
|
70 |
+
# Broadly search keywords in title and abstract
|
71 |
+
query = "+OR+".join([f"(ti:\"{kw}\"+OR+abs:\"{kw}\")" for kw in keywords])
|
72 |
+
query_encoded = urllib.parse.quote(query)
|
73 |
+
url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
|
|
|
74 |
|
75 |
print(f"DEBUG: Query URL - {url}")
|
76 |
|
77 |
feed = feedparser.parse(url)
|
78 |
papers = []
|
79 |
|
80 |
+
# Extract papers from arXiv
|
81 |
for entry in feed.entries:
|
82 |
+
papers.append({
|
83 |
+
"title": entry.title,
|
84 |
+
"authors": ", ".join(author.name for author in entry.authors),
|
85 |
+
"year": entry.published[:4],
|
86 |
+
"abstract": entry.summary,
|
87 |
+
"link": entry.link
|
88 |
+
})
|
89 |
|
90 |
+
if not papers:
|
91 |
+
return [{"error": "No results found. Try different keywords."}]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
+
# Apply BM25 ranking
|
94 |
+
tokenized_corpus = [nltk.word_tokenize(paper["title"].lower() + " " + paper["abstract"].lower()) for paper in papers]
|
95 |
+
bm25 = BM25Okapi(tokenized_corpus)
|
96 |
|
97 |
+
tokenized_query = nltk.word_tokenize(" ".join(keywords).lower())
|
98 |
+
scores = bm25.get_scores(tokenized_query)
|
99 |
+
|
100 |
+
# Sort papers based on BM25 score
|
101 |
+
ranked_papers = sorted(zip(papers, scores), key=lambda x: x[1], reverse=True)
|
102 |
+
|
103 |
+
# Return the most relevant ones
|
104 |
+
return [paper[0] for paper in ranked_papers[:num_results]]
|
105 |
|
106 |
except Exception as e:
|
107 |
print(f"ERROR: {str(e)}")
|
|
|
109 |
|
110 |
|
111 |
|
|
|
112 |
# AI Model
|
113 |
model = HfApiModel(
|
114 |
max_tokens=2096,
|