Spaces:
Build error
Build error
from urllib.parse import urlparse | |
from langchain_community.document_loaders import AsyncHtmlLoader | |
from langchain_community.document_transformers.html2text import Html2TextTransformer | |
from loguru import logger | |
from llm_engineering.domain.documents import ArticleDocument | |
from .base import BaseCrawler | |
class CustomArticleCrawler(BaseCrawler):
    """Crawler for arbitrary article URLs, persisted through ``ArticleDocument``.

    The page is downloaded with ``AsyncHtmlLoader``, passed through
    ``Html2TextTransformer``, and saved together with the requesting
    user's id and full name.
    """

    model = ArticleDocument

    def __init__(self) -> None:
        super().__init__()

    def extract(self, link: str, **kwargs) -> None:
        """Scrape ``link`` and save it as a new ``self.model`` instance.

        If ``self.model.find(link=link)`` already returns something, the
        method only logs that fact and returns without scraping.

        Args:
            link: URL of the article to scrape. Its network location
                (``urlparse(link).netloc``) is stored as the platform.
            **kwargs: Must contain ``user``, an object exposing ``id`` and
                ``full_name``; these are stored as the article's author.

        Raises:
            KeyError: If the article is not stored yet and ``user`` is
                missing from ``kwargs``.
        """
        old_model = self.model.find(link=link)
        if old_model is not None:
            logger.info(f"Article already exists in the database: {link}")
            return

        # Resolve the required author before any network work so a missing
        # "user" fails immediately instead of after a wasted download.
        user = kwargs["user"]

        logger.info(f"Starting scrapping article: {link}")

        loader = AsyncHtmlLoader([link])
        docs = loader.load()

        html2text = Html2TextTransformer()
        docs_transformed = html2text.transform_documents(docs)
        # Exactly one URL was requested, so only the first document is used.
        doc_transformed = docs_transformed[0]

        metadata = doc_transformed.metadata
        content = {
            "Title": metadata.get("title"),
            "Subtitle": metadata.get("description"),
            "Content": doc_transformed.page_content,
            "language": metadata.get("language"),
        }

        platform = urlparse(link).netloc

        instance = self.model(
            content=content,
            link=link,
            platform=platform,
            author_id=user.id,
            author_full_name=user.full_name,
        )
        instance.save()

        logger.info(f"Finished scrapping custom article: {link}")