Spaces:
Build error
Build error
File size: 2,421 Bytes
d660b02 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
from abc import ABC, abstractmethod
from typing import Generic, TypeVar
from llm_engineering.domain.cleaned_documents import (
CleanedArticleDocument,
CleanedDocument,
CleanedPostDocument,
CleanedRepositoryDocument,
)
from llm_engineering.domain.documents import (
ArticleDocument,
Document,
PostDocument,
RepositoryDocument,
)
from .operations import clean_text
# Type variable for the raw input model accepted by a handler's ``clean``;
# bounded by ``Document``.
DocumentT = TypeVar("DocumentT", bound=Document)
# Type variable for the cleaned model returned by a handler's ``clean``;
# bounded by ``CleanedDocument``.
CleanedDocumentT = TypeVar("CleanedDocumentT", bound=CleanedDocument)
class CleaningDataHandler(ABC, Generic[DocumentT, CleanedDocumentT]):
    """
    Common abstract interface implemented by every cleaning data handler.

    The data transformation logic of the cleaning step lives in the
    ``clean`` method of the concrete subclasses.
    """

    @abstractmethod
    def clean(self, data_model: DocumentT) -> CleanedDocumentT:
        """Build the cleaned counterpart of ``data_model``; subclasses must override this."""
        ...
class PostCleaningHandler(CleaningDataHandler):
    """Cleaning handler that converts a ``PostDocument`` into a ``CleanedPostDocument``."""

    def clean(self, data_model: PostDocument) -> CleanedPostDocument:
        """
        Build a ``CleanedPostDocument`` from ``data_model``.

        The truthy values of ``data_model.content`` are joined with
        ``" #### "`` and the result is passed through ``clean_text``. The
        remaining fields are copied over unchanged, except ``image``, where
        any falsy value is normalized to ``None``.
        """
        # Drop empty/None sections before joining: ``str.join`` raises
        # ``TypeError`` on ``None`` items, and empty strings would only leave
        # dangling separators in the merged text.
        valid_content = [content for content in data_model.content.values() if content]

        return CleanedPostDocument(
            id=data_model.id,
            content=clean_text(" #### ".join(valid_content)),
            platform=data_model.platform,
            author_id=data_model.author_id,
            author_full_name=data_model.author_full_name,
            image=data_model.image if data_model.image else None,
        )
class ArticleCleaningHandler(CleaningDataHandler):
    """Cleaning handler that converts an ``ArticleDocument`` into a ``CleanedArticleDocument``."""

    def clean(self, data_model: ArticleDocument) -> CleanedArticleDocument:
        """
        Build a ``CleanedArticleDocument`` from ``data_model``.

        Only the truthy values of ``data_model.content`` are kept; they are
        joined with ``" #### "`` and the result is passed through
        ``clean_text``. All other fields are copied over unchanged.
        """
        # ``filter(None, ...)`` keeps exactly the truthy sections.
        non_empty_sections = list(filter(None, data_model.content.values()))

        return CleanedArticleDocument(
            id=data_model.id,
            content=clean_text(" #### ".join(non_empty_sections)),
            platform=data_model.platform,
            link=data_model.link,
            author_id=data_model.author_id,
            author_full_name=data_model.author_full_name,
        )
class RepositoryCleaningHandler(CleaningDataHandler):
    """Cleaning handler that converts a ``RepositoryDocument`` into a ``CleanedRepositoryDocument``."""

    def clean(self, data_model: RepositoryDocument) -> CleanedRepositoryDocument:
        """
        Build a ``CleanedRepositoryDocument`` from ``data_model``.

        The truthy values of ``data_model.content`` are joined with
        ``" #### "`` and the result is passed through ``clean_text``. All
        other fields are copied over unchanged.
        """
        # Drop empty/None sections before joining: ``str.join`` raises
        # ``TypeError`` on ``None`` items, and empty strings would only leave
        # dangling separators in the merged text.
        valid_content = [content for content in data_model.content.values() if content]

        return CleanedRepositoryDocument(
            id=data_model.id,
            content=clean_text(" #### ".join(valid_content)),
            platform=data_model.platform,
            name=data_model.name,
            link=data_model.link,
            author_id=data_model.author_id,
            author_full_name=data_model.author_full_name,
        )
|