import json
import logging
from abc import abstractmethod
from typing import List, Optional, Dict, Any

import mmh3

from obsei.payload import TextPayload
from obsei.source.base_source import BaseSource, BaseSourceConfig

logger = logging.getLogger(__name__)


class BaseCrawlerConfig(BaseSourceConfig):
    TYPE: str = "BaseCrawler"

    @abstractmethod
    def extract_url(self, url: str, url_id: Optional[str] = None) -> Dict[str, Any]:
        pass

    @abstractmethod
    def find_urls(self, url: str) -> List[str]:
        pass


class TrafilaturaCrawlerConfig(BaseCrawlerConfig):
    # For details on these configuration params, refer to:
    # https://trafilatura.readthedocs.io/
    _output_format: str = "json"
    TYPE: str = "Crawler"
    urls: List[str]
    include_comments: bool = False
    include_tables: bool = True
    no_fallback: bool = False
    include_images: bool = False
    include_formatting: bool = False
    deduplicate: bool = True
    no_ssl: bool = False
    is_feed: bool = False
    is_sitemap: bool = False
    include_links: bool = True
    target_language: Optional[str] = None
    url_blacklist: Optional[List[str]] = None

    def extract_url(self, url: str, url_id: Optional[str] = None) -> Dict[str, Any]:
        try:
            from trafilatura import extract, fetch_url
        except ImportError:
            logger.error("Trafilatura is not installed; install it with: pip install trafilatura")
            return {}

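        # No explicit id given: derive a stable one from an unsigned murmur3 hash of the URL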
        url_id = url_id or "{:02x}".format(mmh3.hash(url, signed=False))
        url_content = fetch_url(
            url=url,
            no_ssl=self.no_ssl,
        )
        extracted_dict: Dict[str, Any] = {}
        if url_content is not None:
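            # Extract the main content (plus optional comments/tables/links) as a JSON string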
            extracted_data = extract(
                filecontent=url_content,
                record_id=url_id,
                no_fallback=self.no_fallback,
                output_format=self._output_format,
                include_comments=self.include_comments,
                include_tables=self.include_tables,
                include_images=self.include_images,
                include_formatting=self.include_formatting,
                include_links=self.include_links,
                deduplicate=self.deduplicate,
                url_blacklist=self.url_blacklist,
                target_language=self.target_language
            )

            if extracted_data:
                extracted_dict = json.loads(extracted_data)
                if "raw-text" in extracted_dict:
                    del extracted_dict["raw-text"]

        return extracted_dict

    def find_urls(self, url: str) -> List[str]:
        try:
            from trafilatura import feeds, sitemaps
        except ImportError:
            logger.error("Trafilatura is not installed; install it with: pip install trafilatura")
            return []

        urls: List[str] = []
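        # Discover article URLs from a sitemap or an RSS/Atom feed, depending on the config flags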
        if self.is_sitemap:
            urls = sitemaps.sitemap_search(url=url, target_lang=self.target_language)
        elif self.is_feed:
            urls = feeds.find_feed_urls(url=url, target_lang=self.target_language)

        return urls


class TrafilaturaCrawlerSource(BaseSource):
    NAME: Optional[str] = "Crawler"

    def lookup(  # type: ignore[override]
        self, config: TrafilaturaCrawlerConfig, **kwargs: Any
    ) -> List[TextPayload]:
        source_responses: List[TextPayload] = []

        final_urls: List[str] = []
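        # Seed URLs that point at sitemaps or feeds are expanded into individual article URLs first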
        if config.is_sitemap or config.is_feed:
            for url in config.urls:
                final_urls.extend(config.find_urls(url=url))
        else:
            final_urls = config.urls

        for url in final_urls:
            extracted_data = config.extract_url(url=url)
            if not extracted_data:
                logger.warning(f"Unable to crawl {url}; skipping it")
                continue
            comments = extracted_data.get("comments", "")
            source_responses.append(
                TextPayload(
                    processed_text=f"{extracted_data['text']}. {comments}",
                    meta=extracted_data,
                    source_name=self.NAME,
                )
            )

        return source_responses
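

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): configure the
    # crawler against a placeholder URL and print a snippet of each payload.
    # The URL below is an assumption; substitute any real article page.
    example_config = TrafilaturaCrawlerConfig(
        urls=["https://example.com/blog/post"],  # placeholder URL
        include_comments=False,
        target_language="en",
    )
    crawler_source = TrafilaturaCrawlerSource()
    for payload in crawler_source.lookup(example_config):
        print(payload.processed_text[:200])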