from urllib.parse import urlparse

from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers.html2text import Html2TextTransformer
from loguru import logger

from llm_engineering.domain.documents import ArticleDocument

from .base import BaseCrawler


class CustomArticleCrawler(BaseCrawler):
    model = ArticleDocument

    def __init__(self) -> None:
        super().__init__()

    def extract(self, link: str, **kwargs) -> None:
        # Skip links that were already crawled and persisted.
        old_model = self.model.find(link=link)
        if old_model is not None:
            logger.info(f"Article already exists in the database: {link}")

            return

        logger.info(f"Starting scraping article: {link}")

        # Fetch the raw HTML, then convert it to plain text.
        loader = AsyncHtmlLoader([link])
        docs = loader.load()

        html2text = Html2TextTransformer()
        docs_transformed = html2text.transform_documents(docs)
        doc_transformed = docs_transformed[0]

        content = {
            "Title": doc_transformed.metadata.get("title"),
            "Subtitle": doc_transformed.metadata.get("description"),
            "Content": doc_transformed.page_content,
            "language": doc_transformed.metadata.get("language"),
        }

        # Use the URL's domain as the platform identifier.
        parsed_url = urlparse(link)
        platform = parsed_url.netloc

        user = kwargs["user"]
        instance = self.model(
            content=content,
            link=link,
            platform=platform,
            author_id=user.id,
            author_full_name=user.full_name,
        )
        instance.save()

        logger.info(f"Finished scraping custom article: {link}")
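
# Minimal usage sketch (an assumption, not part of the module above): the
# crawler only requires a `user` object exposing `id` and `full_name`, which
# are the attributes `extract` reads, so a SimpleNamespace stand-in suffices
# for a quick local test. The URL below is purely illustrative.
if __name__ == "__main__":
    from types import SimpleNamespace

    demo_user = SimpleNamespace(id="demo-user-id", full_name="Jane Doe")  # hypothetical user

    crawler = CustomArticleCrawler()
    crawler.extract("https://example.com/some-article", user=demo_user)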