pdx97 committed on
Commit
d54a2bd
·
verified ·
1 Parent(s): 1fea399

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -33
app.py CHANGED
@@ -327,10 +327,8 @@ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
327
 
328
  @tool
329
  def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
330
-
331
-
332
  """
333
- Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity.
334
 
335
  Args:
336
  keywords: List of keywords for search.
@@ -339,59 +337,54 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
339
  Returns:
340
  List of the most relevant papers based on TF-IDF ranking.
341
  """
342
-
343
-
344
-
345
-
346
  try:
347
- # ✅ Construct the query for ArXiv API
348
- query = "+AND+".join([f"all:{kw}" for kw in keywords])
349
- query_encoded = urllib.parse.quote(query)
350
- url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=5&sortBy=submittedDate&sortOrder=descending"
351
-
352
- # ✅ Fetch papers from ArXiv
353
  feed = feedparser.parse(url)
354
- papers = []
355
 
356
- # Extract papers
 
357
  for entry in feed.entries:
358
- paper = {
359
  "title": entry.title,
360
  "authors": ", ".join(author.name for author in entry.authors),
361
  "year": entry.published[:4],
362
  "abstract": entry.summary,
363
- "link": entry.link,
364
- }
365
- paper["citations"] = get_citation_count(paper["title"]) # ✅ Fetch citation count
366
- papers.append(paper)
367
 
368
  if not papers:
 
369
  return [{"error": "No results found. Try different keywords."}]
370
 
371
- # ✅ TF-IDF Vectorization
372
  corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
373
- vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'),max_features=3000)
 
 
374
  tfidf_matrix = vectorizer.fit_transform(corpus)
 
 
 
 
375
 
376
- # ✅ Transform Query into TF-IDF Vector
377
- query_str = " ".join(keywords)
378
- query_vec = vectorizer.transform([query_str])
379
  similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
 
380
 
381
- # ✅ Sort papers based on similarity score
382
  ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)
383
-
384
- # Assign TF-IDF scores and generate summaries
385
- for paper, score in ranked_papers:
386
- paper["tfidf_score"] = score
387
- paper["summary"] = summarizer(paper["abstract"], max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
388
-
389
- return [paper for paper, _ in ranked_papers[:num_results]]
390
 
391
  except Exception as e:
 
392
  return [{"error": f"Error fetching research papers: {str(e)}"}]
393
 
394
 
 
395
  @tool
396
  def get_citation_count(paper_title: str) -> int:
397
  """
 
327
 
328
  @tool
329
  def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
 
 
330
  """
331
+ Fetches and ranks arXiv papers using optimized TF-IDF and Cosine Similarity.
332
 
333
  Args:
334
  keywords: List of keywords for search.
 
337
  Returns:
338
  List of the most relevant papers based on TF-IDF ranking.
339
  """
 
 
 
 
340
  try:
341
+ # ✅ Fetch only 5 papers
342
+ url = f"http://export.arxiv.org/api/query?search_query={'%20'.join(keywords)}&start=0&max_results=5&sortBy=submittedDate&sortOrder=descending"
343
+ print(f"DEBUG: Query URL - {url}")
344
+
 
 
345
  feed = feedparser.parse(url)
346
+ print(f"DEBUG: API Response - {feed.entries}")
347
 
348
+ papers = []
349
+
350
  for entry in feed.entries:
351
+ papers.append({
352
  "title": entry.title,
353
  "authors": ", ".join(author.name for author in entry.authors),
354
  "year": entry.published[:4],
355
  "abstract": entry.summary,
356
+ "link": entry.link
357
+ })
 
 
358
 
359
  if not papers:
360
+ print("DEBUG: No results from ArXiv API")
361
  return [{"error": "No results found. Try different keywords."}]
362
 
363
+ # ✅ Debug Corpus before TF-IDF
364
  corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
365
+ print(f"DEBUG: Corpus - {corpus}")
366
+
367
+ vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=2000)
368
  tfidf_matrix = vectorizer.fit_transform(corpus)
369
+ print(f"DEBUG: TF-IDF Matrix Shape - {tfidf_matrix.shape}")
370
+
371
+ query_vec = vectorizer.transform([" ".join(keywords)])
372
+ print(f"DEBUG: Query Vector Shape - {query_vec.shape}")
373
 
 
 
 
374
  similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
375
+ print(f"DEBUG: Similarity Scores - {similarity_scores}")
376
 
377
+ # ✅ Rank papers by similarity score
378
  ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)
379
+
380
+ return [paper[0] for paper in ranked_papers[:num_results]]
 
 
 
 
 
381
 
382
  except Exception as e:
383
+ print(f"ERROR: {str(e)}")
384
  return [{"error": f"Error fetching research papers: {str(e)}"}]
385
 
386
 
387
+
388
  @tool
389
  def get_citation_count(paper_title: str) -> int:
390
  """