import json
from pathlib import Path
from typing import List, Optional, Dict, Any

from phi.tools import Toolkit
from phi.utils.log import logger

try:
    import arxiv
except ImportError:
    raise ImportError("`arxiv` not installed. Please install using `pip install arxiv`")

try:
    from pypdf import PdfReader
except ImportError:
    raise ImportError("`pypdf` not installed. Please install using `pip install pypdf`")
class ArxivToolkit(Toolkit):
    def __init__(self, search_arxiv: bool = True, read_arxiv_papers: bool = True, download_dir: Optional[Path] = None):
        super().__init__(name="arxiv_tools")

        self.client: arxiv.Client = arxiv.Client()
        # PDFs are downloaded to this directory; defaults to an `arxiv_pdfs` folder next to this file.
        self.download_dir: Path = download_dir or Path(__file__).parent.joinpath("arxiv_pdfs")

        # Register only the tools enabled via the constructor flags.
        if search_arxiv:
            self.register(self.search_arxiv_and_return_articles)
        if read_arxiv_papers:
            self.register(self.read_arxiv_papers)
    def search_arxiv_and_return_articles(self, query: str, num_articles: int = 10) -> str:
        """Use this function to search arXiv for a query and return the top articles.

        Args:
            query (str): The query to search arXiv for.
            num_articles (int, optional): The number of articles to return. Defaults to 10.

        Returns:
            str: A JSON of the articles with title, id, authors, pdf_url and summary.
        """

        articles = []
        logger.info(f"Searching arxiv for: {query}")
        for result in self.client.results(
            search=arxiv.Search(
                query=query,
                max_results=num_articles,
                sort_by=arxiv.SortCriterion.Relevance,
                sort_order=arxiv.SortOrder.Descending,
            )
        ):
            try:
                article = {
                    "title": result.title,
                    "id": result.get_short_id(),
                    "entry_id": result.entry_id,
                    "authors": [author.name for author in result.authors],
                    "primary_category": result.primary_category,
                    "categories": result.categories,
                    "published": result.published.isoformat() if result.published else None,
                    "pdf_url": result.pdf_url,
                    "links": [link.href for link in result.links],
                    "summary": result.summary,
                    "comment": result.comment,
                }
                articles.append(article)
            except Exception as e:
                logger.error(f"Error processing article: {e}")
        return json.dumps(articles, indent=4)
    def read_arxiv_papers(self, id_list: List[str], pages_to_read: Optional[int] = None) -> str:
        """Use this function to read a list of arXiv papers and return the content.

        Args:
            id_list (List[str]): The `id`s of the papers to read.
                Should be of the format: ["2103.03404v1", "2103.03404v2"]
            pages_to_read (int, optional): The number of pages to read from each paper.
                None means read all pages. Defaults to None.

        Returns:
            str: JSON of the papers.
        """

        download_dir = self.download_dir
        download_dir.mkdir(parents=True, exist_ok=True)

        articles = []
        logger.info(f"Searching arxiv for: {id_list}")
        for result in self.client.results(search=arxiv.Search(id_list=id_list)):
            try:
                article: Dict[str, Any] = {
                    "title": result.title,
                    "id": result.get_short_id(),
                    "entry_id": result.entry_id,
                    "authors": [author.name for author in result.authors],
                    "primary_category": result.primary_category,
                    "categories": result.categories,
                    "published": result.published.isoformat() if result.published else None,
                    "pdf_url": result.pdf_url,
                    "links": [link.href for link in result.links],
                    "summary": result.summary,
                    "comment": result.comment,
                }
                if result.pdf_url:
                    logger.info(f"Downloading: {result.pdf_url}")
                    pdf_path = result.download_pdf(dirpath=str(download_dir))
                    logger.info(f"To: {pdf_path}")
                    pdf_reader = PdfReader(pdf_path)
                    article["content"] = []
                    for page_number, page in enumerate(pdf_reader.pages, start=1):
                        if pages_to_read and page_number > pages_to_read:
                            break
                        content = {
                            "page": page_number,
                            "text": page.extract_text(),
                        }
                        article["content"].append(content)
                articles.append(article)
            except Exception as e:
                logger.error(f"Error processing article: {e}")
        return json.dumps(articles, indent=4)
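

# Example usage: a minimal sketch, assuming the `phi`, `arxiv`, and `pypdf` packages are
# installed and that arXiv is reachable over the network. The query string and download
# directory are illustrative placeholders; the paper id is taken from the docstring example.
if __name__ == "__main__":
    toolkit = ArxivToolkit(download_dir=Path("/tmp/arxiv_pdfs"))

    # Search arXiv and print a JSON list of matching articles.
    print(toolkit.search_arxiv_and_return_articles("attention is all you need", num_articles=3))

    # Download a paper by id and print the extracted text of its first two pages.
    print(toolkit.read_arxiv_papers(["2103.03404v1"], pages_to_read=2))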