"""Thin wrapper around the arXiv query API: search, log results, download PDFs."""

import os
from datetime import datetime

import requests
import xmltodict

from research_assistant.app_logging import app_logger
from research_assistant.constants import ARXIV_API_ACCESS_POINT
from research_assistant.entity import ArticleSearchConfig


class ArxivApiWrap:
    """Query the arXiv API using the settings held in an ``ArticleSearchConfig``."""

    # Seconds to wait for arXiv before giving up; without a timeout
    # ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT_SECONDS = 30

    # Directory (relative to the working directory) that PDFs are saved into.
    DOWNLOAD_DIR = "data"

    def __init__(self, config: ArticleSearchConfig):
        """Store the search configuration used by every request."""
        self.config = config

    def convert_link_to_pdflink(self, link: str) -> str:
        """Turn an arXiv abstract link (``/abs/``) into its PDF link."""
        return link.replace("/abs/", "/pdf/") + ".pdf"

    def convert_date(self, date: str) -> str:
        """Convert a ``YYYY-MM-DD`` string to the ``YYYYMMDD`` form used in the query.

        Raises:
            ValueError: If ``date`` is not in ``YYYY-MM-DD`` format.
        """
        return datetime.strptime(date, "%Y-%m-%d").strftime("%Y%m%d")

    @staticmethod
    def _as_list(value):
        """Normalise an xmltodict child value to a list.

        xmltodict returns a list only when an element is repeated; a single
        child comes back as a bare dict and a missing child as ``None``.
        """
        if value is None:
            return []
        if isinstance(value, list):
            return value
        return [value]

    @staticmethod
    def _pdf_file_stem(pdf_url: str) -> str:
        """Return the last path segment of ``pdf_url`` without a ``.pdf`` suffix."""
        stem = pdf_url.rstrip("/").rsplit("/", 1)[-1]
        if stem.endswith(".pdf"):
            stem = stem[: -len(".pdf")]
        return stem

    def get_arxiv_api_response(self):
        """Fetch the response from the arXiv API for the configured search.

        Values read from ``self.config``:
            search_terms (list of str): Keywords, AND-ed together.
            num_results (int): Maximum number of articles to retrieve.
            date_range: Object with ``start_date`` / ``end_date`` strings in
                ``YYYY-MM-DD`` format; the date filter is applied only when
                ``start_date`` is set.
            sort_by: Field to sort on (e.g. submittedDate).
            sort_order (str): Sort direction (e.g. ascending, descending).

        Returns:
            requests.Response: The HTTP response object returned by the arXiv API.
        """
        keyword_query = " AND all:".join(
            f"'{kw}'" for kw in self.config.search_terms
        )
        query = f" all:{keyword_query}"

        date_range = self.config.date_range
        if date_range.start_date:
            start = self.convert_date(date_range.start_date)
            # An open-ended range previously crashed in convert_date(None);
            # treat a missing end date as "up to today".
            if date_range.end_date:
                end = self.convert_date(date_range.end_date)
            else:
                end = datetime.now().strftime("%Y%m%d")
            query += f" AND submittedDate:[{start} TO {end}]"

        params = {
            "search_query": query,
            "start": 0,  # Zero-based offset of the first result to return
            "max_results": self.config.num_results,
            "sortBy": self.config.sort_by,
            "sortOrder": self.config.sort_order,
        }
        return requests.get(
            ARXIV_API_ACCESS_POINT,
            params=params,
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        )

    def get_article_search_result(self):
        """Retrieve article search results from the arXiv API and log each one.

        Fetches the API response, parses the XML into a dict, and logs the
        title, authors, summary, page link, PDF link and paper id of every
        entry. A non-200 response is logged and yields an empty list.

        Returns:
            list of str: The article (abstract page) links that were found.
        """
        response = self.get_arxiv_api_response()
        article_links = []

        if response.status_code != 200:
            app_logger.info(f"Failed to retrieve papers: {response.status_code}")
            return article_links

        # The arXiv API answers with an Atom XML feed.
        data = xmltodict.parse(response.content)
        feed = data.get("feed") or {}

        # "entry" is absent for zero hits and a bare dict for exactly one hit.
        for entry in self._as_list(feed.get("entry")):
            title = entry["title"]
            summary = entry["summary"]
            link = entry["id"]
            # A single-author paper yields a dict rather than a list of dicts.
            authors = [
                author["name"] for author in self._as_list(entry.get("author"))
            ]
            app_logger.info(
                f"Title: {title}\n Authors: {authors} \n,Abstract: {summary}\n Page Link: {link}\n PDF Link: {self.convert_link_to_pdflink(link)}\n Paper Id: {link.split('/')[-1]}\n {'-'*80}"
            )
            article_links.append(link)

        return article_links

    def download_pdf(self, pdf_url):
        """Download ``pdf_url`` into ``data/<name>.pdf``.

        ``<name>`` is the last path segment of the URL with any trailing
        ``.pdf`` removed. The target directory is created when missing. A
        non-200 response is logged and nothing is written.
        """
        response = requests.get(pdf_url, timeout=self.REQUEST_TIMEOUT_SECONDS)
        if response.status_code != 200:
            app_logger.info(
                f"Failed to download {pdf_url}: {response.status_code}"
            )
            return

        # Taking element [0] of the "/"-split URL returned the scheme
        # ("https:"), so use the final path segment as the file name instead.
        title = self._pdf_file_stem(pdf_url)

        os.makedirs(self.DOWNLOAD_DIR, exist_ok=True)
        with open(f"{self.DOWNLOAD_DIR}/{title}.pdf", "wb") as f:
            f.write(response.content)
        print(f"Downloaded: {title}.pdf")