File size: 2,538 Bytes
105b369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import json
from typing import Any, Dict, Optional

from phi.tools import Toolkit
from phi.utils.log import logger

try:
    import newspaper
except ImportError:
    raise ImportError("`newspaper4k` not installed. Please run `pip install newspaper4k lxml_html_clean`.")


class Newspaper4k(Toolkit):
    """Toolkit that reads an article from a URL using the `newspaper` package.

    The toolkit is created under the name "newspaper_tools" and, when enabled,
    registers `read_article` as its tool function.
    """

    def __init__(
        self,
        read_article: bool = True,
        include_summary: bool = False,
        article_length: Optional[int] = None,
    ):
        """Configure the toolkit.

        Args:
            read_article (bool): When True, register `read_article` on this toolkit.
            include_summary (bool): When True, add the article summary (if non-empty)
                to the extracted data.
            article_length (Optional[int]): When set to a non-zero value, the article
                text returned by `read_article` is sliced to `text[:article_length]`.
                None or 0 leaves the text untouched.
        """
        super().__init__(name="newspaper_tools")

        self.include_summary: bool = include_summary
        self.article_length: Optional[int] = article_length
        if read_article:
            self.register(self.read_article)

    def get_article_data(self, url: str) -> Optional[Dict[str, Any]]:
        """Read and get article data from a URL.

        Only fields that the parsed article actually provides are included, so
        the returned dict may be empty.

        Args:
            url (str): The URL of the article.

        Returns:
            Optional[Dict[str, Any]]: A dict with any of the keys "title",
            "authors", "text", "summary" and "publish_date" (ISO 8601 string),
            or None if loading the article raised an exception.
        """

        try:
            article = newspaper.article(url)
            article_data: Dict[str, Any] = {}
            if article.title:
                article_data["title"] = article.title
            if article.authors:
                article_data["authors"] = article.authors
            if article.text:
                article_data["text"] = article.text
            # NOTE(review): `summary` is presumably only filled in after newspaper's
            # NLP step has run -- confirm `newspaper.article` does that, otherwise
            # this branch never adds anything.
            if self.include_summary and article.summary:
                article_data["summary"] = article.summary

            # The publish date is best-effort: a value that cannot be formatted must
            # not discard the rest of the article, but the failure is logged instead
            # of being silently dropped.
            try:
                publish_date = article.publish_date
                if publish_date:
                    article_data["publish_date"] = publish_date.isoformat()
            except Exception as e:
                logger.debug(f"Could not format publish date for {url}: {e}")

            return article_data
        except Exception as e:
            logger.warning(f"Error reading article from {url}: {e}")
            return None

    def read_article(self, url: str) -> str:
        """Use this function to read an article from a URL.

        Args:
            url (str): The URL of the article.

        Returns:
            str: JSON containing the article title, authors, text, publish date and
            (when enabled) summary, or an error message if the article could not be read.
        """

        try:
            article_data = self.get_article_data(url)
            # Both None (read failed) and an empty dict (nothing extracted) are
            # reported as "no data".
            if not article_data:
                return f"Error reading article from {url}: No data found."

            if self.article_length and "text" in article_data:
                article_data["text"] = article_data["text"][: self.article_length]

            return json.dumps(article_data, indent=2)
        except Exception as e:
            # Covers e.g. values that json cannot serialize; the tool always
            # returns a string rather than raising.
            return f"Error reading article from {url}: {e}"