Spaces:
Build error
Build error
File size: 4,383 Bytes
d660b02 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import hashlib
from abc import ABC, abstractmethod
from typing import Generic, TypeVar
from uuid import UUID
from llm_engineering.domain.chunks import ArticleChunk, Chunk, PostChunk, RepositoryChunk
from llm_engineering.domain.cleaned_documents import (
CleanedArticleDocument,
CleanedDocument,
CleanedPostDocument,
CleanedRepositoryDocument,
)
from .operations import chunk_article, chunk_text
# Type variable for the cleaned-document type a handler accepts;
# constrained to CleanedDocument or one of its subclasses.
CleanedDocumentT = TypeVar("CleanedDocumentT", bound=CleanedDocument)
# Type variable for the chunk type a handler produces;
# constrained to Chunk or one of its subclasses.
ChunkT = TypeVar("ChunkT", bound=Chunk)
class ChunkingDataHandler(ABC, Generic[CleanedDocumentT, ChunkT]):
    """Base interface shared by every chunking handler.

    A concrete handler receives one cleaned document and returns the list of
    chunk models derived from it. The handler is generic over the cleaned
    document type it accepts and the chunk type it produces.
    """

    @property
    def metadata(self) -> dict:
        """Return the default chunking settings.

        Subclasses override this property to supply their own settings.
        A new dict is built on every access.
        """
        return {"chunk_size": 500, "chunk_overlap": 50}

    @abstractmethod
    def chunk(self, data_model: CleanedDocumentT) -> list[ChunkT]:
        """Split ``data_model`` into chunk models; implemented by subclasses."""
class PostChunkingHandler(ChunkingDataHandler):
    """Chunking handler that turns a cleaned post into ``PostChunk`` models."""

    @property
    def metadata(self) -> dict:
        """Return the chunking settings used for posts.

        The same settings are attached to every chunk this handler creates.
        """
        return {"chunk_size": 250, "chunk_overlap": 25}

    def chunk(self, data_model: CleanedPostDocument) -> list[PostChunk]:
        """Split the post's content and wrap each piece in a ``PostChunk``.

        Args:
            data_model: The cleaned post whose ``content`` is split.

        Returns:
            One ``PostChunk`` per piece returned by ``chunk_text``, in the
            same order. Each chunk copies the platform, document id, author
            fields and image from ``data_model``.
        """
        text = data_model.content
        settings = self.metadata
        pieces = chunk_text(
            text,
            chunk_size=settings["chunk_size"],
            chunk_overlap=settings["chunk_overlap"],
        )

        return [
            PostChunk(
                # The id is derived from the MD5 digest of the chunk text, so
                # identical text always yields the same id.
                id=UUID(hashlib.md5(piece.encode()).hexdigest(), version=4),
                content=piece,
                platform=data_model.platform,
                document_id=data_model.id,
                author_id=data_model.author_id,
                author_full_name=data_model.author_full_name,
                # Any falsy image value is normalised to None.
                image=data_model.image or None,
                metadata=self.metadata,
            )
            for piece in pieces
        ]
class ArticleChunkingHandler(ChunkingDataHandler):
    """Chunking handler that turns a cleaned article into ``ArticleChunk`` models."""

    @property
    def metadata(self) -> dict:
        """Return the length bounds passed to ``chunk_article``.

        The same settings are attached to every chunk this handler creates.
        """
        return {"min_length": 1000, "max_length": 2000}

    def chunk(self, data_model: CleanedArticleDocument) -> list[ArticleChunk]:
        """Split the article's content and wrap each piece in an ``ArticleChunk``.

        Args:
            data_model: The cleaned article whose ``content`` is split.

        Returns:
            One ``ArticleChunk`` per piece returned by ``chunk_article``, in
            the same order. Each chunk copies the platform, link, document id
            and author fields from ``data_model``.
        """
        text = data_model.content
        bounds = self.metadata
        pieces = chunk_article(
            text,
            min_length=bounds["min_length"],
            max_length=bounds["max_length"],
        )

        return [
            ArticleChunk(
                # The id is derived from the MD5 digest of the chunk text, so
                # identical text always yields the same id.
                id=UUID(hashlib.md5(piece.encode()).hexdigest(), version=4),
                content=piece,
                platform=data_model.platform,
                link=data_model.link,
                document_id=data_model.id,
                author_id=data_model.author_id,
                author_full_name=data_model.author_full_name,
                metadata=self.metadata,
            )
            for piece in pieces
        ]
class RepositoryChunkingHandler(ChunkingDataHandler):
    """Chunking handler that turns a cleaned repository into ``RepositoryChunk`` models."""

    @property
    def metadata(self) -> dict:
        """Return the chunking settings used for repositories.

        The same settings are attached to every chunk this handler creates.
        """
        return {"chunk_size": 1500, "chunk_overlap": 100}

    def chunk(self, data_model: CleanedRepositoryDocument) -> list[RepositoryChunk]:
        """Split the repository's content and wrap each piece in a ``RepositoryChunk``.

        Args:
            data_model: The cleaned repository whose ``content`` is split.

        Returns:
            One ``RepositoryChunk`` per piece returned by ``chunk_text``, in
            the same order. Each chunk copies the platform, name, link,
            document id and author fields from ``data_model``.
        """
        text = data_model.content
        settings = self.metadata
        pieces = chunk_text(
            text,
            chunk_size=settings["chunk_size"],
            chunk_overlap=settings["chunk_overlap"],
        )

        return [
            RepositoryChunk(
                # The id is derived from the MD5 digest of the chunk text, so
                # identical text always yields the same id.
                id=UUID(hashlib.md5(piece.encode()).hexdigest(), version=4),
                content=piece,
                platform=data_model.platform,
                name=data_model.name,
                link=data_model.link,
                document_id=data_model.id,
                author_id=data_model.author_id,
                author_full_name=data_model.author_full_name,
                metadata=self.metadata,
            )
            for piece in pieces
        ]
|