File size: 2,811 Bytes
b4c04de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import requests
import xml.etree.ElementTree as ET
from scholarly import scholarly

class DataLoader:
    """Fetch research-paper metadata from arXiv and Google Scholar.

    Every fetch method returns a list of dictionaries with the keys
    ``"title"``, ``"summary"`` and ``"link"``.
    """

    ARXIV_API_URL = "http://export.arxiv.org/api/query"
    ATOM_NS = "{http://www.w3.org/2005/Atom}"
    MAX_PAPERS = 5  # upper bound on papers returned by any single fetch
    REQUEST_TIMEOUT = 10  # seconds; keeps a stalled connection from hanging forever

    # Class-level default so ``self.search_agent`` always resolves, even when
    # no agent was supplied. Instances (or subclasses) may override it.
    search_agent = None

    def __init__(self, search_agent=None):
        """Create a loader.

        Args:
            search_agent: Optional object exposing
                ``generate_reply(messages=[...])``. When set, it is asked for
                related topics if the initial arXiv search comes up short.
        """
        print("DataLoader Init")
        if search_agent is not None:
            self.search_agent = search_agent

    @staticmethod
    def _parse_arxiv_feed(xml_text):
        """Convert an Atom feed string into a list of paper dictionaries."""
        # NOTE(review): xml.etree is not hardened against malicious XML;
        # the feed is assumed to come from the arXiv API only.
        ns = DataLoader.ATOM_NS
        root = ET.fromstring(xml_text)
        return [
            {
                # findtext() yields the default instead of raising when an
                # element is missing from an entry.
                "title": entry.findtext(f"{ns}title", default=""),
                "summary": entry.findtext(f"{ns}summary", default=""),
                "link": entry.findtext(f"{ns}id", default=""),
            }
            for entry in root.findall(f"{ns}entry")
        ]

    @staticmethod
    def _extract_topics(reply):
        """Return the non-empty, stripped lines of an agent reply.

        Accepts a dict with a ``"content"`` key or a plain string; anything
        else (including ``None``) yields an empty list.
        """
        if isinstance(reply, dict):
            reply = reply.get("content")
        if not isinstance(reply, str):
            return []
        return [line.strip() for line in reply.splitlines() if line.strip()]

    def _search_arxiv(self, query):
        """Query the arXiv API; return parsed papers, or ``[]`` on a non-200 reply."""
        # Passing ``params`` lets requests percent-encode the query, so
        # characters such as '&' or '#' cannot corrupt the URL.
        response = requests.get(
            self.ARXIV_API_URL,
            params={
                "search_query": f"all:{query}",
                "start": 0,
                "max_results": self.MAX_PAPERS,
            },
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            return []
        return self._parse_arxiv_feed(response.text)

    def fetch_arxiv_papers(self, query, limit=None):
        """Fetch up to ``MAX_PAPERS`` (5) papers from arXiv for ``query``.

        If fewer than 5 papers are found and a ``search_agent`` is configured,
        the agent is asked for related topics and each topic is searched in
        turn until 5 papers have been collected.

        Args:
            query: Free-text search string.
            limit: Optional cap on the number of papers returned. ``None``
                means no extra cap; negative values are treated as 0.

        Returns:
            list: Dictionaries containing paper details (title, summary, link).

        Raises:
            requests.RequestException: If an HTTP request fails or times out.
        """
        papers = self._search_arxiv(query)[: self.MAX_PAPERS]

        agent = self.search_agent
        if len(papers) < self.MAX_PAPERS and agent:
            reply = agent.generate_reply(
                messages=[{"role": "user", "content": f"Suggest 3 related research topics for '{query}'"}]
            )
            seen_links = {paper["link"] for paper in papers}
            for topic in self._extract_topics(reply):
                if len(papers) >= self.MAX_PAPERS:
                    break
                for paper in self._search_arxiv(topic):
                    if len(papers) >= self.MAX_PAPERS:
                        break
                    # Related topics often overlap; skip papers already collected.
                    if paper["link"] in seen_links:
                        continue
                    seen_links.add(paper["link"])
                    papers.append(paper)

        if limit is not None:
            papers = papers[: max(limit, 0)]

        return papers

    def fetch_google_scholar_papers(self, query):
        """Fetch up to ``MAX_PAPERS`` (5) papers from Google Scholar.

        Args:
            query: Free-text search string.

        Returns:
            list: Dictionaries containing paper details (title, summary, link).
        """
        from itertools import islice

        search_results = scholarly.search_pubs(query)

        papers = []
        # islice stops after MAX_PAPERS items without pulling an extra
        # result from the iterator.
        for paper in islice(search_results, self.MAX_PAPERS):
            bib = paper.get("bib", {})
            papers.append({
                "title": bib.get("title", "No title available"),
                "summary": bib.get("abstract", "No summary available"),
                "link": paper.get("pub_url", "No link available"),
            })
        return papers