Spaces:
Build error
Build error
File size: 1,132 Bytes
d660b02 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import re
from urllib.parse import urlparse
from loguru import logger
from .base import BaseCrawler
from .custom_article import CustomArticleCrawler
from .github import GithubCrawler
class CrawlerDispatcher:
    """Pick a crawler class for a URL based on the URL's host.

    Crawlers are registered per domain. Each registration is stored in
    ``self._crawlers`` as a regex pattern string mapped to a crawler class.
    ``get_crawler`` instantiates the first crawler whose pattern matches
    and falls back to ``CustomArticleCrawler`` when none does.
    """

    def __init__(self) -> None:
        # Maps a regex pattern string to the crawler class registered for it.
        self._crawlers = {}

    @classmethod
    def build(cls) -> "CrawlerDispatcher":
        """Return a new dispatcher with no crawlers registered."""
        return cls()

    def register_github(self) -> "CrawlerDispatcher":
        """Register ``GithubCrawler`` for github.com and return ``self``."""
        self.register("https://github.com", GithubCrawler)
        return self

    def register(self, domain: str, crawler: type[BaseCrawler]) -> None:
        """Associate ``crawler`` with every https URL hosted on ``domain``.

        Args:
            domain: Base URL (``"https://github.com"``) or bare host
                (``"github.com"``) the crawler is responsible for.
            crawler: Crawler class to instantiate for matching URLs.

        Raises:
            ValueError: If no host can be extracted from ``domain``.
        """
        # urlparse only fills ``netloc`` when the input contains "//", so a
        # bare host such as "github.com" is re-parsed as a network-path
        # reference instead of silently producing an empty host.
        netloc = urlparse(domain).netloc or urlparse(f"//{domain}").netloc
        if not netloc:
            # An empty host would yield a pattern matching every https URL.
            raise ValueError(f"Cannot extract a domain from {domain!r}")

        # The suffix requires the host to end where the registered domain
        # ends (optionally followed by a numeric port), so look-alike hosts
        # such as "github.com.evil.com" or "github.com@evil.com" are rejected.
        pattern = r"https://(www\.)?{}(?::\d+)?(?:[/?#]|$)".format(re.escape(netloc))
        self._crawlers[pattern] = crawler

    def get_crawler(self, url: str) -> BaseCrawler:
        """Return a crawler instance suited to ``url``.

        Patterns are tried in registration order and the first match wins.
        When nothing matches, a warning is logged and a
        ``CustomArticleCrawler`` instance is returned.
        """
        for pattern, crawler in self._crawlers.items():
            if re.match(pattern, url):
                return crawler()

        logger.warning(f"No crawler found for {url}. Defaulting to CustomArticleCrawler.")
        return CustomArticleCrawler()
|