Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -327,10 +327,8 @@ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
|
327 |
|
328 |
@tool
|
329 |
def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
|
330 |
-
|
331 |
-
|
332 |
"""
|
333 |
-
Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity.
|
334 |
|
335 |
Args:
|
336 |
keywords: List of keywords for search.
|
@@ -339,59 +337,54 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
|
|
339 |
Returns:
|
340 |
List of the most relevant papers based on TF-IDF ranking.
|
341 |
"""
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
try:
|
347 |
-
# ✅
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
# ✅ Fetch papers from ArXiv
|
353 |
feed = feedparser.parse(url)
|
354 |
-
|
355 |
|
356 |
-
|
|
|
357 |
for entry in feed.entries:
|
358 |
-
|
359 |
"title": entry.title,
|
360 |
"authors": ", ".join(author.name for author in entry.authors),
|
361 |
"year": entry.published[:4],
|
362 |
"abstract": entry.summary,
|
363 |
-
"link": entry.link
|
364 |
-
}
|
365 |
-
paper["citations"] = get_citation_count(paper["title"]) # ✅ Fetch citation count
|
366 |
-
papers.append(paper)
|
367 |
|
368 |
if not papers:
|
|
|
369 |
return [{"error": "No results found. Try different keywords."}]
|
370 |
|
371 |
-
# ✅ TF-IDF
|
372 |
corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
|
373 |
-
|
|
|
|
|
374 |
tfidf_matrix = vectorizer.fit_transform(corpus)
|
|
|
|
|
|
|
|
|
375 |
|
376 |
-
# ✅ Transform Query into TF-IDF Vector
|
377 |
-
query_str = " ".join(keywords)
|
378 |
-
query_vec = vectorizer.transform([query_str])
|
379 |
similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
|
|
|
380 |
|
381 |
-
# ✅
|
382 |
ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)
|
383 |
-
|
384 |
-
|
385 |
-
for paper, score in ranked_papers:
|
386 |
-
paper["tfidf_score"] = score
|
387 |
-
paper["summary"] = summarizer(paper["abstract"], max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
|
388 |
-
|
389 |
-
return [paper for paper, _ in ranked_papers[:num_results]]
|
390 |
|
391 |
except Exception as e:
|
|
|
392 |
return [{"error": f"Error fetching research papers: {str(e)}"}]
|
393 |
|
394 |
|
|
|
395 |
@tool
|
396 |
def get_citation_count(paper_title: str) -> int:
|
397 |
"""
|
|
|
@tool
def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
    """
    Fetch recent arXiv papers for the given keywords and rank them by
    TF-IDF cosine similarity against the keyword query.

    Args:
        keywords: List of keywords for search.
        num_results: Maximum number of ranked papers to return.

    Returns:
        List of paper dicts (title, authors, year, abstract, link) sorted by
        descending relevance, or a single-element list containing an "error"
        key when the search fails or returns nothing.
    """
    import urllib.parse  # stdlib; local import avoids touching module-level imports

    try:
        # Fetch a candidate pool at least as large as num_results so the
        # TF-IDF ranking has something to choose from. (Previously
        # max_results was hard-coded to 5, so num_results > 5 could never
        # be honored.)
        pool_size = max(num_results, 25)
        # URL-encode the query so multi-word or special-character keywords
        # cannot produce a malformed request URL.
        query = urllib.parse.quote(" ".join(keywords))
        url = (
            "http://export.arxiv.org/api/query"
            f"?search_query={query}&start=0&max_results={pool_size}"
            "&sortBy=submittedDate&sortOrder=descending"
        )

        feed = feedparser.parse(url)

        papers = [
            {
                "title": entry.title,
                "authors": ", ".join(author.name for author in entry.authors),
                "year": entry.published[:4],
                "abstract": entry.summary,
                "link": entry.link,
            }
            for entry in feed.entries
        ]

        if not papers:
            return [{"error": "No results found. Try different keywords."}]

        # Rank: fit TF-IDF over title+abstract, score each paper against the
        # keyword query, keep the top num_results.
        corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
        vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=2000)
        tfidf_matrix = vectorizer.fit_transform(corpus)

        query_vec = vectorizer.transform([" ".join(keywords)])
        similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

        # key= prevents dict-vs-dict comparison on tied scores; sort is stable.
        ranked_papers = sorted(zip(papers, similarity_scores), key=lambda pair: pair[1], reverse=True)

        return [paper for paper, _score in ranked_papers[:num_results]]

    except Exception as e:
        # Network/parsing failures surface as a structured error entry rather
        # than propagating to the caller (tool-call convention of this app).
        return [{"error": f"Error fetching research papers: {str(e)}"}]
388 |
@tool
|
389 |
def get_citation_count(paper_title: str) -> int:
|
390 |
"""
|