Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -327,10 +327,8 @@ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
|
327 |
|
328 |
@tool
|
329 |
def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
|
330 |
-
|
331 |
-
|
332 |
"""
|
333 |
-
Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity.
|
334 |
|
335 |
Args:
|
336 |
keywords: List of keywords for search.
|
@@ -339,59 +337,54 @@ def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
|
|
339 |
Returns:
|
340 |
List of the most relevant papers based on TF-IDF ranking.
|
341 |
"""
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
try:
|
347 |
-
# ✅
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
# ✅ Fetch papers from ArXiv
|
353 |
feed = feedparser.parse(url)
|
354 |
-
|
355 |
|
356 |
-
|
|
|
357 |
for entry in feed.entries:
|
358 |
-
|
359 |
"title": entry.title,
|
360 |
"authors": ", ".join(author.name for author in entry.authors),
|
361 |
"year": entry.published[:4],
|
362 |
"abstract": entry.summary,
|
363 |
-
"link": entry.link
|
364 |
-
}
|
365 |
-
paper["citations"] = get_citation_count(paper["title"]) # ✅ Fetch citation count
|
366 |
-
papers.append(paper)
|
367 |
|
368 |
if not papers:
|
|
|
369 |
return [{"error": "No results found. Try different keywords."}]
|
370 |
|
371 |
-
# ✅ TF-IDF
|
372 |
corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
|
373 |
-
|
|
|
|
|
374 |
tfidf_matrix = vectorizer.fit_transform(corpus)
|
|
|
|
|
|
|
|
|
375 |
|
376 |
-
# ✅ Transform Query into TF-IDF Vector
|
377 |
-
query_str = " ".join(keywords)
|
378 |
-
query_vec = vectorizer.transform([query_str])
|
379 |
similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
|
|
|
380 |
|
381 |
-
# ✅
|
382 |
ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)
|
383 |
-
|
384 |
-
|
385 |
-
for paper, score in ranked_papers:
|
386 |
-
paper["tfidf_score"] = score
|
387 |
-
paper["summary"] = summarizer(paper["abstract"], max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
|
388 |
-
|
389 |
-
return [paper for paper, _ in ranked_papers[:num_results]]
|
390 |
|
391 |
except Exception as e:
|
|
|
392 |
return [{"error": f"Error fetching research papers: {str(e)}"}]
|
393 |
|
394 |
|
|
|
395 |
@tool
|
396 |
def get_citation_count(paper_title: str) -> int:
|
397 |
"""
|
|
|
@tool
def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
    """
    Fetch recent arXiv papers for the given keywords and rank them by
    TF-IDF cosine similarity against the keyword query.

    Args:
        keywords: List of keywords for search.
        num_results: Maximum number of ranked papers to return.

    Returns:
        List of paper dicts (title, authors, year, abstract, link) sorted by
        descending relevance, or a single-element list containing an "error"
        key when the search fails or returns nothing.
    """
    import urllib.parse  # stdlib; local import avoids touching module-level imports

    try:
        # Fetch a candidate pool at least as large as num_results so the
        # TF-IDF ranking has something to choose from. (Previously
        # max_results was hard-coded to 5, so num_results > 5 could never
        # be honored.)
        pool_size = max(num_results, 25)
        # URL-encode the query so multi-word or special-character keywords
        # cannot produce a malformed request URL.
        query = urllib.parse.quote(" ".join(keywords))
        url = (
            "http://export.arxiv.org/api/query"
            f"?search_query={query}&start=0&max_results={pool_size}"
            "&sortBy=submittedDate&sortOrder=descending"
        )

        feed = feedparser.parse(url)

        papers = [
            {
                "title": entry.title,
                "authors": ", ".join(author.name for author in entry.authors),
                "year": entry.published[:4],
                "abstract": entry.summary,
                "link": entry.link,
            }
            for entry in feed.entries
        ]

        if not papers:
            return [{"error": "No results found. Try different keywords."}]

        # Rank: fit TF-IDF over title+abstract, score each paper against the
        # keyword query, keep the top num_results.
        corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
        vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=2000)
        tfidf_matrix = vectorizer.fit_transform(corpus)

        query_vec = vectorizer.transform([" ".join(keywords)])
        similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

        # key= prevents dict-vs-dict comparison on tied scores; sort is stable.
        ranked_papers = sorted(zip(papers, similarity_scores), key=lambda pair: pair[1], reverse=True)

        return [paper for paper, _score in ranked_papers[:num_results]]

    except Exception as e:
        # Network/parsing failures surface as a structured error entry rather
        # than propagating to the caller (tool-call convention of this app).
        return [{"error": f"Error fetching research papers: {str(e)}"}]
388 |
@tool
|
389 |
def get_citation_count(paper_title: str) -> int:
|
390 |
"""
|