SkazuHD's picture
init space
d660b02
import hashlib
from abc import ABC, abstractmethod
from typing import Generic, TypeVar
from uuid import UUID
from llm_engineering.domain.chunks import ArticleChunk, Chunk, PostChunk, RepositoryChunk
from llm_engineering.domain.cleaned_documents import (
CleanedArticleDocument,
CleanedDocument,
CleanedPostDocument,
CleanedRepositoryDocument,
)
from .operations import chunk_article, chunk_text
CleanedDocumentT = TypeVar("CleanedDocumentT", bound=CleanedDocument)
ChunkT = TypeVar("ChunkT", bound=Chunk)
class ChunkingDataHandler(ABC, Generic[CleanedDocumentT, ChunkT]):
"""
Abstract class for all Chunking data handlers.
All data transformations logic for the chunking step is done here
"""
@property
def metadata(self) -> dict:
return {
"chunk_size": 500,
"chunk_overlap": 50,
}
@abstractmethod
def chunk(self, data_model: CleanedDocumentT) -> list[ChunkT]:
pass
class PostChunkingHandler(ChunkingDataHandler):
@property
def metadata(self) -> dict:
return {
"chunk_size": 250,
"chunk_overlap": 25,
}
def chunk(self, data_model: CleanedPostDocument) -> list[PostChunk]:
data_models_list = []
cleaned_content = data_model.content
chunks = chunk_text(
cleaned_content, chunk_size=self.metadata["chunk_size"], chunk_overlap=self.metadata["chunk_overlap"]
)
for chunk in chunks:
chunk_id = hashlib.md5(chunk.encode()).hexdigest()
model = PostChunk(
id=UUID(chunk_id, version=4),
content=chunk,
platform=data_model.platform,
document_id=data_model.id,
author_id=data_model.author_id,
author_full_name=data_model.author_full_name,
image=data_model.image if data_model.image else None,
metadata=self.metadata,
)
data_models_list.append(model)
return data_models_list
class ArticleChunkingHandler(ChunkingDataHandler):
@property
def metadata(self) -> dict:
return {
"min_length": 1000,
"max_length": 2000,
}
def chunk(self, data_model: CleanedArticleDocument) -> list[ArticleChunk]:
data_models_list = []
cleaned_content = data_model.content
chunks = chunk_article(
cleaned_content, min_length=self.metadata["min_length"], max_length=self.metadata["max_length"]
)
for chunk in chunks:
chunk_id = hashlib.md5(chunk.encode()).hexdigest()
model = ArticleChunk(
id=UUID(chunk_id, version=4),
content=chunk,
platform=data_model.platform,
link=data_model.link,
document_id=data_model.id,
author_id=data_model.author_id,
author_full_name=data_model.author_full_name,
metadata=self.metadata,
)
data_models_list.append(model)
return data_models_list
class RepositoryChunkingHandler(ChunkingDataHandler):
@property
def metadata(self) -> dict:
return {
"chunk_size": 1500,
"chunk_overlap": 100,
}
def chunk(self, data_model: CleanedRepositoryDocument) -> list[RepositoryChunk]:
data_models_list = []
cleaned_content = data_model.content
chunks = chunk_text(
cleaned_content, chunk_size=self.metadata["chunk_size"], chunk_overlap=self.metadata["chunk_overlap"]
)
for chunk in chunks:
chunk_id = hashlib.md5(chunk.encode()).hexdigest()
model = RepositoryChunk(
id=UUID(chunk_id, version=4),
content=chunk,
platform=data_model.platform,
name=data_model.name,
link=data_model.link,
document_id=data_model.id,
author_id=data_model.author_id,
author_full_name=data_model.author_full_name,
metadata=self.metadata,
)
data_models_list.append(model)
return data_models_list