File size: 1,750 Bytes
d660b02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from urllib.parse import urlparse

from loguru import logger
from tqdm import tqdm
from typing_extensions import Annotated
from clearml import PipelineDecorator

from llm_engineering.application.crawlers.dispatcher import CrawlerDispatcher


@PipelineDecorator.component(name="Crawl Links")
def crawl_links(links: list[str]) -> Annotated[list[str], "crawled_links"]:
    """Crawl every link with its matching registered crawler.

    Args:
        links: URLs to crawl.

    Returns:
        The input list of links unchanged (annotated as "crawled_links" for
        the pipeline). Per-domain success/total counts are logged, not
        returned.
    """

    def _crawl_link(dispatcher: CrawlerDispatcher, link: str) -> tuple[bool, str]:
        # Resolve the crawler for this URL and run the extraction.
        crawler = dispatcher.get_crawler(link)
        crawler_domain = urlparse(link).netloc

        try:
            crawler.extract(link=link)
            return (True, crawler_domain)
        except Exception as e:
            # Best-effort crawl: one failing link must not abort the whole run.
            logger.error(f"An error occurred while crawling: {e!s}")
            return (False, crawler_domain)

    def _add_to_metadata(metadata: dict, domain: str, successful_crawl: bool) -> dict:
        # Accumulate per-domain success/total counts.
        # setdefault avoids the redundant membership check + repeated lookups.
        stats = metadata.setdefault(domain, {})
        stats["successful"] = stats.get("successful", 0) + successful_crawl
        stats["total"] = stats.get("total", 0) + 1

        return metadata

    dispatcher = CrawlerDispatcher.build().register_github()
    logger.info(f"Starting to crawl {len(links)} link(s).")

    metadata = {}
    successful_crawls = 0
    for link in tqdm(links):
        successful_crawl, crawled_domain = _crawl_link(dispatcher, link)
        # bool is an int subclass, so this counts successes directly.
        successful_crawls += successful_crawl

        metadata = _add_to_metadata(metadata, crawled_domain, successful_crawl)

    logger.info(f"Successfully crawled {successful_crawls} / {len(links)} links.")
    return links