pdx97 committed on
Commit
ebd9098
·
verified ·
1 Parent(s): 193f6e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -30
app.py CHANGED
@@ -49,54 +49,59 @@ from smolagents import CodeAgent, HfApiModel, tool
49
  # print(f"ERROR: {str(e)}") # Debug errors
50
  # return [f"Error fetching research papers: {str(e)}"]
51
 
52
- @tool
 
 
 
 
53
  def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
54
- """Fetches the latest research papers from arXiv based on provided keywords.
55
 
56
  Args:
57
- keywords: A list of keywords to search for relevant papers.
58
- num_results: The number of papers to fetch (default is 5).
59
 
60
  Returns:
61
- A list of dictionaries containing:
62
- - "title": The title of the research paper.
63
- - "authors": The authors of the paper.
64
- - "year": The publication year.
65
- - "abstract": A summary of the research paper.
66
- - "link": A direct link to the paper on arXiv.
67
  """
68
  try:
69
  print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
70
 
71
- # Format query using "AND" to enforce strict keyword presence
72
- query = "+AND+".join([f"ti:{kw}+OR+abs:{kw}" for kw in keywords])
73
- query_encoded = urllib.parse.quote(query) # Encode spaces and special characters
74
-
75
- url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=20&sortBy=submittedDate&sortOrder=descending"
76
 
77
  print(f"DEBUG: Query URL - {url}")
78
 
79
  feed = feedparser.parse(url)
80
  papers = []
81
 
 
82
  for entry in feed.entries:
83
- title = entry.title.lower()
84
- abstract = entry.summary.lower()
 
 
 
 
 
85
 
86
- # Ensure at least one keyword appears in the title or abstract
87
- if any(kw.lower() in title or kw.lower() in abstract for kw in keywords):
88
- papers.append({
89
- "title": entry.title,
90
- "authors": ", ".join(author.name for author in entry.authors),
91
- "year": entry.published[:4], # Extract year
92
- "abstract": entry.summary,
93
- "link": entry.link
94
- })
95
 
96
- # Sort papers: First prioritize keyword in title, then abstract
97
- papers.sort(key=lambda x: sum(kw.lower() in x["title"].lower() for kw in keywords), reverse=True)
 
98
 
99
- return papers[:num_results] # Return top-matching papers
 
 
 
 
 
 
 
100
 
101
  except Exception as e:
102
  print(f"ERROR: {str(e)}")
@@ -104,7 +109,6 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
104
 
105
 
106
 
107
-
108
  # AI Model
109
  model = HfApiModel(
110
  max_tokens=2096,
 
49
  # print(f"ERROR: {str(e)}") # Debug errors
50
  # return [f"Error fetching research papers: {str(e)}"]
51
 
52
+ from rank_bm25 import BM25Okapi
53
+ import nltk
54
+
55
+ nltk.download('punkt')
56
+
57
  def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
58
+ """Fetches and ranks arXiv papers using BM25 keyword relevance.
59
 
60
  Args:
61
+ keywords: List of keywords for search.
62
+ num_results: Number of results to return.
63
 
64
  Returns:
65
+ List of the most relevant papers based on BM25 ranking.
 
 
 
 
 
66
  """
67
  try:
68
  print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")
69
 
70
+ # Broadly search keywords in title and abstract
71
+ query = "+OR+".join([f"(ti:\"{kw}\"+OR+abs:\"{kw}\")" for kw in keywords])
72
+ query_encoded = urllib.parse.quote(query)
73
+ url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"
 
74
 
75
  print(f"DEBUG: Query URL - {url}")
76
 
77
  feed = feedparser.parse(url)
78
  papers = []
79
 
80
+ # Extract papers from arXiv
81
  for entry in feed.entries:
82
+ papers.append({
83
+ "title": entry.title,
84
+ "authors": ", ".join(author.name for author in entry.authors),
85
+ "year": entry.published[:4],
86
+ "abstract": entry.summary,
87
+ "link": entry.link
88
+ })
89
 
90
+ if not papers:
91
+ return [{"error": "No results found. Try different keywords."}]
 
 
 
 
 
 
 
92
 
93
+ # Apply BM25 ranking
94
+ tokenized_corpus = [nltk.word_tokenize(paper["title"].lower() + " " + paper["abstract"].lower()) for paper in papers]
95
+ bm25 = BM25Okapi(tokenized_corpus)
96
 
97
+ tokenized_query = nltk.word_tokenize(" ".join(keywords).lower())
98
+ scores = bm25.get_scores(tokenized_query)
99
+
100
+ # Sort papers based on BM25 score
101
+ ranked_papers = sorted(zip(papers, scores), key=lambda x: x[1], reverse=True)
102
+
103
+ # Return the most relevant ones
104
+ return [paper[0] for paper in ranked_papers[:num_results]]
105
 
106
  except Exception as e:
107
  print(f"ERROR: {str(e)}")
 
109
 
110
 
111
 
 
112
  # AI Model
113
  model = HfApiModel(
114
  max_tokens=2096,