import json
import logging
from abc import abstractmethod
from typing import List, Optional, Dict, Any

import mmh3

from obsei.payload import TextPayload
from obsei.source.base_source import BaseSource, BaseSourceConfig

logger = logging.getLogger(__name__)

class BaseCrawlerConfig(BaseSourceConfig):
    TYPE: str = "BaseCrawler"

    @abstractmethod
    def extract_url(self, url: str, url_id: Optional[str] = None) -> Dict[str, Any]:
        pass

    @abstractmethod
    def find_urls(self, url: str) -> List[str]:
        pass

class TrafilaturaCrawlerConfig(BaseCrawlerConfig):
    # For details on these configuration parameters, see:
    # https://trafilatura.readthedocs.io/
    _output_format: str = "json"
    TYPE: str = "Crawler"
    urls: List[str]
    include_comments: bool = False
    include_tables: bool = True
    no_fallback: bool = False
    include_images: bool = False
    include_formatting: bool = False
    deduplicate: bool = True
    no_ssl: bool = False
    is_feed: bool = False
    is_sitemap: bool = False
    include_links: bool = True
    target_language: Optional[str] = None
    url_blacklist: Optional[List[str]] = None
    def extract_url(self, url: str, url_id: Optional[str] = None) -> Dict[str, Any]:
        try:
            from trafilatura import extract, fetch_url
        except ImportError:
            logger.error(
                "Trafilatura is not installed, install it via `pip install trafilatura`"
            )
            return {}

        # Derive a stable identifier for the URL when none is supplied
        url_id = url_id or "{:02x}".format(mmh3.hash(url, signed=False))

        url_content = fetch_url(
            url=url,
            no_ssl=self.no_ssl,
        )

        extracted_dict: Dict[str, Any] = {}
        if url_content is not None:
            extracted_data = extract(
                filecontent=url_content,
                record_id=url_id,
                no_fallback=self.no_fallback,
                output_format=self._output_format,
                include_comments=self.include_comments,
                include_tables=self.include_tables,
                include_images=self.include_images,
                include_formatting=self.include_formatting,
                include_links=self.include_links,
                deduplicate=self.deduplicate,
                url_blacklist=self.url_blacklist,
                target_language=self.target_language,
            )

            if extracted_data:
                extracted_dict = json.loads(extracted_data)
                # Drop the bulky raw text; the cleaned "text" field is kept
                if "raw-text" in extracted_dict:
                    del extracted_dict["raw-text"]

        return extracted_dict

    def find_urls(self, url: str) -> List[str]:
        try:
            from trafilatura import feeds, sitemaps
        except ImportError:
            logger.error(
                "Trafilatura is not installed, install it via `pip install trafilatura`"
            )
            return []

        # Expand a sitemap or feed URL into the individual page URLs it lists
        urls: List[str] = []
        if self.is_sitemap:
            urls = sitemaps.sitemap_search(url=url, target_lang=self.target_language)
        elif self.is_feed:
            urls = feeds.find_feed_urls(url=url, target_lang=self.target_language)

        return urls

class TrafilaturaCrawlerSource(BaseSource):
    NAME: Optional[str] = "Crawler"

    def lookup(  # type: ignore[override]
        self, config: TrafilaturaCrawlerConfig, **kwargs: Any
    ) -> List[TextPayload]:
        source_responses: List[TextPayload] = []

        # When crawling a sitemap or feed, resolve it into individual page URLs first
        final_urls = []
        if config.is_sitemap or config.is_feed:
            for url in config.urls:
                final_urls.extend(config.find_urls(url=url))
        else:
            final_urls = config.urls

        for url in final_urls:
            extracted_data = config.extract_url(url=url)
            # extract_url returns an empty dict when fetching or extraction fails
            if not extracted_data:
                logger.warning(f"Unable to crawl {url}, hence skipping it")
                continue

            comments = extracted_data.get("comments", "")
            source_responses.append(
                TextPayload(
                    processed_text=f"{extracted_data['text']}. {comments}",
                    meta=extracted_data,
                    source_name=self.NAME,
                )
            )

        return source_responses
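

# --- Usage sketch (illustrative only, not part of the upstream module) ---
# A minimal example of wiring the config and source together; the URL below is a
# placeholder, and printing `processed_text` is just one way to consume the
# resulting TextPayload objects.
if __name__ == "__main__":
    example_config = TrafilaturaCrawlerConfig(urls=["https://example.com/article"])
    crawler_source = TrafilaturaCrawlerSource()
    for payload in crawler_source.lookup(config=example_config):
        print(payload.processed_text[:200])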