File size: 3,666 Bytes
7b2e5db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
from datetime import datetime

import requests
import xmltodict

from research_assistant.app_logging import app_logger
from research_assistant.constants import ARXIV_API_ACCESS_POINT
from research_assistant.entity import ArticleSearchConfig


class ArxivApiWrap:
    """Query the arXiv API for articles and download their PDFs.

    The search is driven by the ``config`` object handed to the constructor.
    This class reads ``search_terms``, ``date_range`` (``start_date`` /
    ``end_date``), ``num_results``, ``sort_by`` and ``sort_order`` from it.
    """

    # requests applies no timeout by default; bound every HTTP call so a
    # stalled connection cannot hang the caller forever.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, config: "ArticleSearchConfig"):
        """Store the search configuration used by later API calls."""
        self.config = config

    def convert_link_to_pdflink(self, link):
        """Turn an arXiv abstract URL (``.../abs/<id>``) into its PDF URL.

        Example: ``http://arxiv.org/abs/2101.00001v1`` becomes
        ``http://arxiv.org/pdf/2101.00001v1.pdf``.
        """
        return link.replace("/abs/", "/pdf/") + ".pdf"

    def convert_date(self, date):
        """Reformat a ``YYYY-MM-DD`` date string as ``YYYYMMDD``.

        Raises:
            ValueError: If ``date`` does not match the ``%Y-%m-%d`` format.
        """
        return datetime.strptime(date, "%Y-%m-%d").strftime("%Y%m%d")

    @staticmethod
    def _as_list(value):
        """Normalise a parsed XML value to a list.

        xmltodict yields a list for a repeated element, the bare item for an
        element that occurs once, and no key at all (``None`` via ``.get``)
        when the element is absent.
        """
        if value is None:
            return []
        if isinstance(value, list):
            return value
        return [value]

    @staticmethod
    def _pdf_basename(pdf_url):
        """Return the last path segment of ``pdf_url`` without a ``.pdf`` suffix."""
        name = pdf_url.rstrip("/").split("/")[-1]
        if name.endswith(".pdf"):
            name = name[: -len(".pdf")]
        return name

    def _build_search_query(self):
        """Build the ``search_query`` string from the configured terms and dates.

        Every search term is AND-ed together under the ``all:`` field. When
        ``date_range.start_date`` is set, a ``submittedDate`` range is
        appended; ``date_range.end_date`` is then parsed unconditionally, so
        it must be set as well.
        """
        # NOTE(review): the arXiv manual shows double quotes for phrase
        # search; the single quotes are kept as-is - confirm they behave as
        # intended.
        keyword_query = " AND all:".join(f"'{kw}'" for kw in self.config.search_terms)
        query = f" all:{keyword_query}"
        date_range = self.config.date_range
        if date_range.start_date:
            start = self.convert_date(date_range.start_date)
            end = self.convert_date(date_range.end_date)
            query = f"{query}  AND submittedDate:[{start} TO {end}]"
        return query

    def get_arxiv_api_response(self):
        """Send the configured search to the arXiv API.

        Request parameters taken from ``self.config``:
            search_terms: Keywords combined into the search query.
            date_range: Optional start/end dates restricting ``submittedDate``.
            num_results: Maximum number of articles to retrieve.
            sort_by: Field the results are sorted by.
            sort_order: Direction of the sort.

        Returns:
            requests.Response: The HTTP response returned by the arXiv API.
        """
        params = {
            "search_query": self._build_search_query(),
            "start": 0,  # offset of the first result to return
            "max_results": self.config.num_results,
            "sortBy": self.config.sort_by,
            "sortOrder": self.config.sort_order,
        }
        return requests.get(
            ARXIV_API_ACCESS_POINT,
            params=params,
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        )

    def get_article_search_result(self):
        """Run the search, log each article found and return the article links.

        The XML response is parsed with xmltodict; the title, authors,
        abstract, page link, PDF link and paper id of every entry are logged.

        Returns:
            list of str: The ``id`` link of every entry in the response. The
            list is empty when the request fails or the feed has no entries.
        """
        response = self.get_arxiv_api_response()
        if response.status_code != 200:
            app_logger.info(f"Failed to retrieve papers: {response.status_code}")
            return []

        # The arXiv API answers with an Atom (XML) feed.
        data = xmltodict.parse(response.content)
        feed = data.get("feed") or {}
        separator = "-" * 80
        article_links = []
        # A feed with one result holds a single dict under "entry", and a feed
        # with no results has no "entry" key, so normalise before iterating.
        for entry in self._as_list(feed.get("entry")):
            title = entry["title"]
            summary = entry["summary"]
            link = entry["id"]
            # Same normalisation for single-author papers.
            authors = [author["name"] for author in self._as_list(entry.get("author"))]
            pdf_link = self.convert_link_to_pdflink(link)
            paper_id = link.split("/")[-1]
            app_logger.info(
                f"Title: {title}\n Authors: {authors} \n,Abstract: {summary}\n Page Link: {link}\n PDF Link: {pdf_link}\n Paper Id: {paper_id}\n {separator}"
            )
            article_links.append(link)
        return article_links

    def download_pdf(self, pdf_url):
        """Download ``pdf_url`` into ``data/<name>.pdf``.

        ``<name>`` is the last path segment of the URL with any ``.pdf``
        suffix removed. The ``data`` directory is created when missing. A
        non-200 response is logged and nothing is written.
        """
        response = requests.get(pdf_url, timeout=self.REQUEST_TIMEOUT_SECONDS)
        if response.status_code != 200:
            # Do not save an error page under a .pdf name.
            app_logger.info(f"Failed to download {pdf_url}: {response.status_code}")
            return
        title = self._pdf_basename(pdf_url)
        os.makedirs("data", exist_ok=True)
        with open(f"data/{title}.pdf", "wb") as f:
            f.write(response.content)
        app_logger.info(f"Downloaded: {title}.pdf")