# demo_obsei/obsei_module/obsei/source/website_crawler_source.py
import json
import logging
from abc import abstractmethod
from typing import List, Optional, Dict, Any

import mmh3

from obsei.payload import TextPayload
from obsei.source.base_source import BaseSource, BaseSourceConfig

logger = logging.getLogger(__name__)


class BaseCrawlerConfig(BaseSourceConfig):
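    """Abstract crawler config; subclasses implement page extraction and URL discovery."""
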
TYPE: str = "BaseCrawler"
@abstractmethod
def extract_url(self, url: str, url_id: Optional[str] = None) -> Dict[str, Any]:
        pass

@abstractmethod
def find_urls(self, url: str) -> List[str]:
        pass


class TrafilaturaCrawlerConfig(BaseCrawlerConfig):
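    """Crawler config backed by trafilatura; most fields map one-to-one onto trafilatura's extract() parameters."""
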
    # For details on these configuration params, see:
    # https://trafilatura.readthedocs.io/
_output_format: str = "json"
TYPE: str = "Crawler"
urls: List[str]
include_comments: bool = False
include_tables: bool = True
no_fallback: bool = False
include_images: bool = False
include_formatting: bool = False
deduplicate: bool = True
no_ssl: bool = False
is_feed: bool = False
is_sitemap: bool = False
include_links: bool = True
target_language: Optional[str] = None
    url_blacklist: Optional[List[str]] = None

def extract_url(self, url: str, url_id: Optional[str] = None) -> Dict[str, Any]:
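        """Fetch a single URL and return trafilatura's extraction as a dict; empty dict on failure."""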
try:
from trafilatura import extract, fetch_url
        except ImportError:
            logger.error("trafilatura is not installed; install it with: pip install trafilatura")
            return {}

        # Derive a stable id from the URL when the caller does not supply one
url_id = url_id or "{:02x}".format(mmh3.hash(url, signed=False))
url_content = fetch_url(
url=url,
no_ssl=self.no_ssl,
)
extracted_dict: Dict[str, Any] = {}
if url_content is not None:
extracted_data = extract(
filecontent=url_content,
record_id=url_id,
no_fallback=self.no_fallback,
output_format=self._output_format,
include_comments=self.include_comments,
include_tables=self.include_tables,
include_images=self.include_images,
include_formatting=self.include_formatting,
include_links=self.include_links,
deduplicate=self.deduplicate,
url_blacklist=self.url_blacklist,
target_language=self.target_language
)
if extracted_data:
extracted_dict = json.loads(extracted_data)
if "raw-text" in extracted_dict:
del extracted_dict["raw-text"]
        return extracted_dict

def find_urls(self, url: str) -> List[str]:
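        """Discover crawlable URLs from a sitemap or feed, depending on the config flags."""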
try:
from trafilatura import feeds, sitemaps
        except ImportError:
            logger.error("trafilatura is not installed; install it with: pip install trafilatura")
            return []
urls: List[str] = []
if self.is_sitemap:
urls = sitemaps.sitemap_search(url=url, target_lang=self.target_language)
elif self.is_feed:
urls = feeds.find_feed_urls(url=url, target_lang=self.target_language)
        return urls


class TrafilaturaCrawlerSource(BaseSource):
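    """Source that crawls web pages via trafilatura and emits one TextPayload per page."""
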
    NAME: Optional[str] = "Crawler"

def lookup( # type: ignore[override]
self, config: TrafilaturaCrawlerConfig, **kwargs: Any
) -> List[TextPayload]:
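        """Crawl every configured URL (expanding sitemaps/feeds first) and wrap each page in a TextPayload."""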
source_responses: List[TextPayload] = []
        final_urls: List[str] = []
if config.is_sitemap or config.is_feed:
for url in config.urls:
final_urls.extend(config.find_urls(url=url))
else:
final_urls = config.urls
for url in final_urls:
extracted_data = config.extract_url(url=url)
            # extract_url returns an empty dict when fetching or extraction fails,
            # so test for falsiness rather than None
            if not extracted_data:
                logger.warning(f"Unable to crawl {url}, hence skipping it")
                continue
            comments = extracted_data.get("comments", "")
source_responses.append(
TextPayload(
processed_text=f"{extracted_data['text']}. {comments}",
meta=extracted_data,
source_name=self.NAME,
)
)
return source_responses
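

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): shows how the config
# and source above wire together. Assumes trafilatura is installed; the URL
# below is a hypothetical placeholder, not an endpoint used by this project.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    config = TrafilaturaCrawlerConfig(
        urls=["https://example.com/some-article"],  # hypothetical URL
        target_language="en",
    )
    source = TrafilaturaCrawlerSource()

    # lookup() expands sitemaps/feeds when is_sitemap/is_feed is set, then
    # crawls each URL and wraps the extracted text in a TextPayload.
    for payload in source.lookup(config):
        print(payload.source_name, payload.processed_text[:80])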