from urllib.parse import urlparse

from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers.html2text import Html2TextTransformer
from loguru import logger

from llm_engineering.domain.documents import ArticleDocument

from .base import BaseCrawler


class CustomArticleCrawler(BaseCrawler):
    """Scrapes a generic article URL with LangChain's HTML tooling and stores it as an ArticleDocument."""

    model = ArticleDocument

    def __init__(self) -> None:
        super().__init__()

    def extract(self, link: str, **kwargs) -> None:
        # Skip the scrape if the article is already stored in the database.
        old_model = self.model.find(link=link)
        if old_model is not None:
            logger.info(f"Article already exists in the database: {link}")

            return

        logger.info(f"Starting scraping article: {link}")

        # Fetch the raw HTML and convert it to plain text.
        loader = AsyncHtmlLoader([link])
        docs = loader.load()

        html2text = Html2TextTransformer()
        docs_transformed = html2text.transform_documents(docs)
        doc_transformed = docs_transformed[0]

        content = {
            "Title": doc_transformed.metadata.get("title"),
            "Subtitle": doc_transformed.metadata.get("description"),
            "Content": doc_transformed.page_content,
            "language": doc_transformed.metadata.get("language"),
        }

        # Use the domain of the URL as the platform identifier.
        parsed_url = urlparse(link)
        platform = parsed_url.netloc

        # Persist the article, attributing it to the user passed in by the caller.
        user = kwargs["user"]
        instance = self.model(
            content=content,
            link=link,
            platform=platform,
            author_id=user.id,
            author_full_name=user.full_name,
        )
        instance.save()

        logger.info(f"Finished scraping custom article: {link}")