Spaces:
Runtime error
Runtime error
File size: 2,538 Bytes
105b369 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import json
from typing import Any, Dict, Optional
from phi.tools import Toolkit
from phi.utils.log import logger
try:
import newspaper
except ImportError:
raise ImportError("`newspaper4k` not installed. Please run `pip install newspaper4k lxml_html_clean`.")
class Newspaper4k(Toolkit):
    """Toolkit that reads web articles through the ``newspaper`` package.

    The toolkit is created under the name ``"newspaper_tools"`` and, when
    enabled, registers :meth:`read_article` via ``self.register``.
    """

    def __init__(
        self,
        read_article: bool = True,
        include_summary: bool = False,
        article_length: Optional[int] = None,
    ):
        """Configure the toolkit.

        Args:
            read_article (bool): Register ``read_article`` as a tool when True.
            include_summary (bool): Add a ``"summary"`` entry to the article
                data when one can be produced.
            article_length (Optional[int]): Maximum number of characters of
                article text returned by ``read_article``. A falsy value
                (``None`` or ``0``) means no truncation.
        """
        super().__init__(name="newspaper_tools")

        self.include_summary: bool = include_summary
        self.article_length: Optional[int] = article_length

        if read_article:
            self.register(self.read_article)

    def get_article_data(self, url: str) -> Optional[Dict[str, Any]]:
        """Read and get article data from a URL.

        Only fields the article actually provides are included, so the result
        may contain any subset of ``title``, ``authors``, ``text``,
        ``summary`` (only when ``include_summary`` is set) and
        ``publish_date`` (as a string).

        Args:
            url (str): The URL of the article.

        Returns:
            Optional[Dict[str, Any]]: The article data, or None when the
            article could not be read. Errors are logged, never raised.
        """
        try:
            article = newspaper.article(url)
            article_data: Dict[str, Any] = {}

            if article.title:
                article_data["title"] = article.title
            if article.authors:
                article_data["authors"] = article.authors
            if article.text:
                article_data["text"] = article.text

            if self.include_summary:
                if not article.summary:
                    # newspaper4k fills `summary` during the NLP step, which
                    # is not part of download/parse. Run it best-effort: a
                    # failure here must not discard the rest of the article.
                    try:
                        article.nlp()
                    except Exception as e:
                        logger.warning(f"Error generating summary for {url}: {e}")
                if article.summary:
                    article_data["summary"] = article.summary

            # The publish date is optional metadata: keep it best-effort, but
            # report problems instead of swallowing them silently.
            try:
                publish_date = article.publish_date
                if publish_date:
                    if hasattr(publish_date, "isoformat"):
                        article_data["publish_date"] = publish_date.isoformat()
                    else:
                        # Not a date/datetime object; keep its text form so
                        # the value stays JSON-serializable.
                        article_data["publish_date"] = str(publish_date)
            except Exception as e:
                logger.warning(f"Error reading publish date from {url}: {e}")

            return article_data
        except Exception as e:
            logger.warning(f"Error reading article from {url}: {e}")
            return None

    def read_article(self, url: str) -> str:
        """Use this function to read an article from a URL.

        Args:
            url (str): The URL of the article.

        Returns:
            str: JSON containing the available article fields (title, authors,
            text, publish date and, if enabled, summary). If the article
            cannot be read, a plain-text error message is returned instead.
        """
        try:
            article_data = self.get_article_data(url)
            # Covers both a failed read (None) and an article with no usable
            # fields (empty dict).
            if not article_data:
                return f"Error reading article from {url}: No data found."

            if self.article_length and "text" in article_data:
                article_data["text"] = article_data["text"][: self.article_length]

            return json.dumps(article_data, indent=2)
        except Exception as e:
            return f"Error reading article from {url}: {e}"
|