from urllib.parse import urlparse

from loguru import logger
from tqdm import tqdm
from typing_extensions import Annotated
from clearml import PipelineDecorator

from llm_engineering.application.crawlers.dispatcher import CrawlerDispatcher


@PipelineDecorator.component(name="Crawl Links")
def crawl_links(links: list[str]) -> Annotated[list[str], "crawled_links"]:
    def _crawl_link(dispatcher: CrawlerDispatcher, link: str) -> tuple[bool, str]:
        # Pick the crawler registered for this link's domain and extract its content.
        crawler = dispatcher.get_crawler(link)
        crawler_domain = urlparse(link).netloc

        try:
            crawler.extract(link=link)

            return (True, crawler_domain)
        except Exception as e:
            logger.error(f"An error occurred while crawling: {e!s}")

            return (False, crawler_domain)

    def _add_to_metadata(metadata: dict, domain: str, successful_crawl: bool) -> dict:
        # Track per-domain success and total crawl counts.
        if domain not in metadata:
            metadata[domain] = {}
        metadata[domain]["successful"] = metadata.get(domain, {}).get("successful", 0) + successful_crawl
        metadata[domain]["total"] = metadata.get(domain, {}).get("total", 0) + 1

        return metadata

    dispatcher = CrawlerDispatcher.build().register_github()

    logger.info(f"Starting to crawl {len(links)} link(s).")

    metadata = {}
    successful_crawls = 0
    for link in tqdm(links):
        successful_crawl, crawled_domain = _crawl_link(dispatcher, link)
        successful_crawls += successful_crawl

        metadata = _add_to_metadata(metadata, crawled_domain, successful_crawl)

    logger.info(f"Successfully crawled {successful_crawls} / {len(links)} links.")

    return links
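
Because crawl_links is declared as a ClearML pipeline component, it is meant to be called from a function decorated with @PipelineDecorator.pipeline rather than invoked directly. A minimal driver sketch follows; the pipeline name, project, version, and the example link are illustrative assumptions, not part of the file above.

from clearml import PipelineDecorator

# Hypothetical driver: wires the crawl_links component into a ClearML pipeline.
@PipelineDecorator.pipeline(name="digital_data_etl", project="llm-engineering", version="0.1")
def run_pipeline(links: list[str]) -> None:
    crawled_links = crawl_links(links=links)


if __name__ == "__main__":
    # Execute all components in the local process for debugging,
    # instead of dispatching them to a ClearML agent (assumed workflow).
    PipelineDecorator.run_locally()
    run_pipeline(links=["https://github.com/example/repo"])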