# demo_obsei/obsei_module/obsei/source/website_crawler_source.py
import json
import logging
from abc import abstractmethod
from typing import List, Optional, Dict, Any

import mmh3

from obsei.payload import TextPayload
from obsei.source.base_source import BaseSource, BaseSourceConfig

logger = logging.getLogger(__name__)


class BaseCrawlerConfig(BaseSourceConfig):
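    """Abstract crawler config; subclasses implement page extraction and URL discovery."""
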
TYPE: str = "BaseCrawler"
@abstractmethod
def extract_url(self, url: str, url_id: Optional[str] = None) -> Dict[str, Any]:
        pass

@abstractmethod
def find_urls(self, url: str) -> List[str]:
        pass


class TrafilaturaCrawlerConfig(BaseCrawlerConfig):
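    """Crawler config backed by trafilatura; most fields map one-to-one onto trafilatura's extract() parameters."""
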
    # For details on these configuration params, see:
    # https://trafilatura.readthedocs.io/
_output_format: str = "json"
TYPE: str = "Crawler"
urls: List[str]
include_comments: bool = False
include_tables: bool = True
no_fallback: bool = False
include_images: bool = False
include_formatting: bool = False
deduplicate: bool = True
no_ssl: bool = False
is_feed: bool = False
is_sitemap: bool = False
include_links: bool = True
target_language: Optional[str] = None
    url_blacklist: Optional[List[str]] = None

def extract_url(self, url: str, url_id: Optional[str] = None) -> Dict[str, Any]:
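        """Fetch a single URL and return trafilatura's extraction as a dict; empty dict on failure."""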
try:
from trafilatura import extract, fetch_url
        except ImportError:
            logger.error("trafilatura is not installed; install it with: pip install trafilatura")
            return {}

        # Derive a stable id from the URL when the caller does not supply one
url_id = url_id or "{:02x}".format(mmh3.hash(url, signed=False))
url_content = fetch_url(
url=url,
no_ssl=self.no_ssl,
)
extracted_dict: Dict[str, Any] = {}
if url_content is not None:
extracted_data = extract(
filecontent=url_content,
record_id=url_id,
no_fallback=self.no_fallback,
output_format=self._output_format,
include_comments=self.include_comments,
include_tables=self.include_tables,
include_images=self.include_images,
include_formatting=self.include_formatting,
include_links=self.include_links,
deduplicate=self.deduplicate,
url_blacklist=self.url_blacklist,
target_language=self.target_language
)
if extracted_data:
extracted_dict = json.loads(extracted_data)
if "raw-text" in extracted_dict:
del extracted_dict["raw-text"]
        return extracted_dict

def find_urls(self, url: str) -> List[str]:
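        """Discover crawlable URLs from a sitemap or feed, depending on the config flags."""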
try:
from trafilatura import feeds, sitemaps
        except ImportError:
            logger.error("trafilatura is not installed; install it with: pip install trafilatura")
            return []
urls: List[str] = []
if self.is_sitemap:
urls = sitemaps.sitemap_search(url=url, target_lang=self.target_language)
elif self.is_feed:
urls = feeds.find_feed_urls(url=url, target_lang=self.target_language)
        return urls


class TrafilaturaCrawlerSource(BaseSource):
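    """Source that crawls web pages via trafilatura and emits one TextPayload per page."""
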
    NAME: Optional[str] = "Crawler"

def lookup( # type: ignore[override]
self, config: TrafilaturaCrawlerConfig, **kwargs: Any
) -> List[TextPayload]:
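        """Crawl every configured URL (expanding sitemaps/feeds first) and wrap each page in a TextPayload."""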
source_responses: List[TextPayload] = []
        final_urls: List[str] = []
if config.is_sitemap or config.is_feed:
for url in config.urls:
final_urls.extend(config.find_urls(url=url))
else:
final_urls = config.urls
for url in final_urls:
extracted_data = config.extract_url(url=url)
            # extract_url returns an empty dict when fetching or extraction fails,
            # so test for falsiness rather than None
            if not extracted_data:
                logger.warning(f"Unable to crawl {url}, hence skipping it")
                continue
            comments = extracted_data.get("comments", "")
source_responses.append(
TextPayload(
processed_text=f"{extracted_data['text']}. {comments}",
meta=extracted_data,
source_name=self.NAME,
)
)
return source_responses
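

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): shows how the config
# and source above wire together. Assumes trafilatura is installed; the URL
# below is a hypothetical placeholder, not an endpoint used by this project.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    config = TrafilaturaCrawlerConfig(
        urls=["https://example.com/some-article"],  # hypothetical URL
        target_language="en",
    )
    source = TrafilaturaCrawlerSource()

    # lookup() expands sitemaps/feeds when is_sitemap/is_feed is set, then
    # crawls each URL and wraps the extracted text in a TextPayload.
    for payload in source.lookup(config):
        print(payload.source_name, payload.processed_text[:80])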