Spaces:
Runtime error
Runtime error
from datetime import datetime | |
import requests | |
import xmltodict | |
from research_assistant.app_logging import app_logger | |
from research_assistant.constants import ARXIV_API_ACCESS_POINT | |
from research_assistant.entity import ArticleSearchConfig | |
class ArxivApiWrap:
    """Small client around the arXiv query API.

    The search is driven by an ``ArticleSearchConfig``. The attributes read
    from it here are ``search_terms``, ``date_range.start_date``,
    ``date_range.end_date``, ``num_results``, ``sort_by`` and ``sort_order``.
    """

    # Seconds to wait on any HTTP call before giving up instead of hanging.
    REQUEST_TIMEOUT = 30

    def __init__(self, config: "ArticleSearchConfig"):
        self.config = config

    @staticmethod
    def _as_list(value):
        """Normalise an xmltodict child value to a list.

        xmltodict yields a list only when an element repeats; a single child
        comes back as a bare dict and a missing child is absent (``None``
        when fetched with ``.get``).
        """
        if value is None:
            return []
        if isinstance(value, list):
            return value
        return [value]

    @staticmethod
    def _pdf_title(pdf_url):
        """Return the file stem for ``pdf_url``: its last path segment without ``.pdf``."""
        name = pdf_url.rstrip("/").split("/")[-1]
        if name.endswith(".pdf"):
            name = name[: -len(".pdf")]
        return name

    def convert_link_to_pdflink(self, link):
        """Turn an ``/abs/`` page link into the matching ``/pdf/`` link ending in ``.pdf``."""
        return link.replace("/abs/", "/pdf/") + ".pdf"

    def convert_date(self, date):
        """Convert a ``YYYY-MM-DD`` string to ``YYYYMMDD``.

        Raises:
            ValueError: If ``date`` is not in ``YYYY-MM-DD`` format.
        """
        return datetime.strptime(date, "%Y-%m-%d").strftime("%Y%m%d")

    def _build_search_query(self):
        """Build the ``search_query`` string from the configured terms and date range."""
        keyword_query = " AND all:".join([f"'{kw}'" for kw in self.config.search_terms])
        date_range = self.config.date_range
        if not date_range.start_date:
            return f" all:{keyword_query}"
        # NOTE(review): only start_date is checked; a missing end_date will
        # make convert_date fail -- confirm the config always sets both.
        start = self.convert_date(date_range.start_date)
        end = self.convert_date(date_range.end_date)
        return f" all:{keyword_query} AND submittedDate:[{start} TO {end}]"

    def get_arxiv_api_response(self):
        """Fetch the raw arXiv API response for the configured search.

        Values taken from the config:
            search_terms: Keywords, AND-ed together over all fields.
            date_range: Optional start/end dates applied to ``submittedDate``.
            num_results: Maximum number of articles to retrieve.
            sort_by: Field the results are sorted by.
            sort_order: Sort direction for the results.

        Returns:
            requests.Response: The HTTP response object returned by the arXiv API.
        """
        params = {
            "search_query": self._build_search_query(),
            "start": 0,  # Offset of the first result (start from the beginning)
            "max_results": self.config.num_results,  # Upper bound on returned articles
            "sortBy": self.config.sort_by,  # Sort field, as configured
            "sortOrder": self.config.sort_order,  # Sort direction, as configured
        }
        return requests.get(
            ARXIV_API_ACCESS_POINT, params=params, timeout=self.REQUEST_TIMEOUT
        )

    def get_article_search_result(self):
        """Run the search, log every article found and return the article links.

        The XML response is parsed with xmltodict; for each entry the title,
        authors, abstract, page link, PDF link and paper id are logged.

        Returns:
            list of str: The ``id`` link of every entry; empty when the request
            fails or the feed contains no entries.
        """
        response = self.get_arxiv_api_response()
        if response.status_code != 200:
            app_logger.info(f"Failed to retrieve papers: {response.status_code}")
            return []

        # The arXiv API answers with an Atom XML feed.
        data = xmltodict.parse(response.content)
        article_links = []
        # "entry" is missing for zero hits and a bare dict for exactly one hit.
        for entry in self._as_list(data["feed"].get("entry")):
            title = entry["title"]
            summary = entry["summary"]
            link = entry["id"]
            # Single-author papers expose "author" as a dict, not a list.
            authors = [author["name"] for author in self._as_list(entry.get("author"))]
            app_logger.info(
                f"Title: {title}\n Authors: {authors} \n,Abstract: {summary}\n Page Link: {link}\n PDF Link: {self.convert_link_to_pdflink(link)}\n Paper Id: {link.split('/')[-1]}\n {'-'*80}"
            )
            article_links.append(link)
        return article_links

    def download_pdf(self, pdf_url):
        """Download ``pdf_url`` into ``data/<last-url-segment>.pdf``.

        The ``data`` directory is created when missing. A non-200 response is
        logged and nothing is written.
        """
        import os  # local import: only this method needs it

        title = self._pdf_title(pdf_url)
        response = requests.get(pdf_url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            app_logger.info(f"Failed to download PDF: {response.status_code}")
            return

        os.makedirs("data", exist_ok=True)
        with open(f"data/{title}.pdf", "wb") as f:
            f.write(response.content)
        print(f"Downloaded: {title}.pdf")