SkazuHD's picture
init space
d660b02
import re
from urllib.parse import urlparse
from loguru import logger
from .base import BaseCrawler
from .custom_article import CustomArticleCrawler
from .github import GithubCrawler
class CrawlerDispatcher:
    """Pick a crawler class for a URL based on registered domains.

    Each registered domain is stored as a regular-expression pattern mapped
    to a crawler class. ``get_crawler`` instantiates the first crawler whose
    pattern matches the URL, and falls back to ``CustomArticleCrawler`` when
    no pattern matches.
    """

    def __init__(self) -> None:
        # Maps regex pattern string -> crawler class, in registration order.
        self._crawlers = {}

    @classmethod
    def build(cls) -> "CrawlerDispatcher":
        """Return a new dispatcher with no crawlers registered."""
        return cls()

    def register_github(self) -> "CrawlerDispatcher":
        """Register ``GithubCrawler`` for github.com and return ``self`` for chaining."""
        self.register("https://github.com", GithubCrawler)
        return self

    def register(self, domain: str, crawler: "type[BaseCrawler]") -> None:
        """Associate ``crawler`` with every https URL hosted on ``domain``.

        Args:
            domain: A URL or bare host such as ``"https://github.com"`` or
                ``"github.com"``. Only the network location is used; the
                scheme and path are ignored.
            crawler: Class that ``get_crawler`` calls with no arguments to
                create the crawler for matching URLs.

        Raises:
            ValueError: If no host can be extracted from ``domain``.
        """
        host = urlparse(domain).netloc
        if not host:
            # urlparse only fills netloc after "//", so a bare "github.com"
            # ends up in .path. Re-parse it as a network location rather than
            # registering an empty host, which would match every https URL.
            host = urlparse(f"//{domain}").netloc
        if not host:
            raise ValueError(f"Cannot extract a host from domain {domain!r}")

        # Require a host boundary ("/", ":", "?", "#" or end of string) after
        # the host so that "github.com" does not also match
        # "github.community" or "github.com.evil.test".
        pattern = r"https://(www\.)?{}(?:[/:?#]|$)".format(re.escape(host))
        self._crawlers[pattern] = crawler

    def get_crawler(self, url: str) -> "BaseCrawler":
        """Return a new crawler instance suited to ``url``.

        Patterns are tried in registration order and the first match wins.
        When nothing matches, a warning is logged and a
        ``CustomArticleCrawler`` instance is returned.
        """
        for pattern, crawler in self._crawlers.items():
            if re.match(pattern, url):
                return crawler()

        logger.warning(f"No crawler found for {url}. Defaulting to CustomArticleCrawler.")
        return CustomArticleCrawler()