coderpotter's picture
Upload folder using huggingface_hub
7b2e5db verified
import os
from datetime import datetime

import requests
import xmltodict

from research_assistant.app_logging import app_logger
from research_assistant.constants import ARXIV_API_ACCESS_POINT
from research_assistant.entity import ArticleSearchConfig
class ArxivApiWrap:
    """Thin client for searching arXiv and downloading article PDFs.

    The search is driven by the configuration object passed to the
    constructor; this class reads ``search_terms``, ``date_range``
    (``start_date`` / ``end_date``), ``num_results``, ``sort_by`` and
    ``sort_order`` from it.
    """

    # Seconds to wait for arXiv before giving up; without a timeout
    # ``requests`` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, config: "ArticleSearchConfig"):
        """Store the search configuration used by the other methods."""
        self.config = config

    @staticmethod
    def _as_list(value):
        """Return *value* as a list (``None`` -> ``[]``, scalar -> ``[value]``).

        xmltodict represents a single child element as a dict and repeated
        children as a list, and omits the key entirely when there are none.
        """
        if value is None:
            return []
        if isinstance(value, list):
            return value
        return [value]

    @staticmethod
    def _pdf_title_from_url(pdf_url):
        """Return the last path segment of *pdf_url* without a ``.pdf`` suffix."""
        title = pdf_url.rstrip("/").split("/")[-1]
        if title.endswith(".pdf"):
            title = title[: -len(".pdf")]
        return title

    def _build_search_query(self):
        """Build the ``search_query`` string from the configured terms and dates."""
        keyword_query = " AND all:".join(
            f"'{kw}'" for kw in self.config.search_terms
        )
        query = f" all:{keyword_query}"
        date_range = self.config.date_range
        if date_range.start_date:
            start = self.convert_date(date_range.start_date)
            end = self.convert_date(date_range.end_date)
            query += f" AND submittedDate:[{start} TO {end}]"
        return query

    def convert_link_to_pdflink(self, link):
        """Turn an arXiv abstract link (``.../abs/<id>``) into its PDF link."""
        return link.replace("/abs/", "/pdf/") + ".pdf"

    def convert_date(self, date):
        """Convert a ``YYYY-MM-DD`` string to ``YYYYMMDD``.

        Raises:
            ValueError: If *date* does not match ``YYYY-MM-DD``.
        """
        return datetime.strptime(date, "%Y-%m-%d").strftime("%Y%m%d")

    def get_arxiv_api_response(self):
        """Query the arXiv API with the configured search parameters.

        The query combines every search term with ``AND`` and, when a start
        date is configured, restricts results to the configured submission
        date range.

        Returns:
            requests.Response: The HTTP response returned by the arXiv API.
        """
        params = {
            "search_query": self._build_search_query(),
            "start": 0,  # Offset of the first result to return
            "max_results": self.config.num_results,
            "sortBy": self.config.sort_by,
            "sortOrder": self.config.sort_order,
        }
        return requests.get(
            ARXIV_API_ACCESS_POINT, params=params, timeout=self.REQUEST_TIMEOUT
        )

    def get_article_search_result(self):
        """Fetch search results from arXiv and log details of each article.

        The XML response is parsed with xmltodict; for every entry the title,
        authors, abstract, page link, PDF link and paper id are logged.

        Returns:
            list of str: The article page links; empty if the request failed
            or the search returned no entries.
        """
        response = self.get_arxiv_api_response()
        if response.status_code != 200:
            app_logger.info(f"Failed to retrieve papers: {response.status_code}")
            return []

        # arXiv answers with an Atom (XML) feed.
        data = xmltodict.parse(response.content)
        article_links = []
        # "entry" is missing for zero results and a plain dict for exactly one.
        for entry in self._as_list(data["feed"].get("entry")):
            title = entry["title"]
            summary = entry["summary"]
            link = entry["id"]
            # A single-author paper yields one dict rather than a list.
            authors = [
                author["name"] for author in self._as_list(entry.get("author"))
            ]
            app_logger.info(
                f"Title: {title}\n Authors: {authors} \n,Abstract: {summary}\n Page Link: {link}\n PDF Link: {self.convert_link_to_pdflink(link)}\n Paper Id: {link.split('/')[-1]}\n {'-'*80}"
            )
            article_links.append(link)
        return article_links

    def download_pdf(self, pdf_url):
        """Download *pdf_url* into ``data/<name>.pdf``.

        ``<name>`` is the last path segment of the URL (without a trailing
        ``.pdf``). Nothing is written when the server does not answer with
        HTTP 200; the failure is logged instead.
        """
        title = self._pdf_title_from_url(pdf_url)
        response = requests.get(pdf_url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            # Avoid saving an error page under a .pdf name.
            app_logger.info(f"Failed to download PDF: {response.status_code}")
            return
        os.makedirs("data", exist_ok=True)
        with open(f"data/{title}.pdf", "wb") as f:
            f.write(response.content)
        print(f"Downloaded: {title}.pdf")