File size: 2,832 Bytes
d660b02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from typing_extensions import Annotated
from clearml import PipelineDecorator

from llm_engineering.application import utils
from llm_engineering.application.preprocessing import ChunkingDispatcher, EmbeddingDispatcher
from llm_engineering.domain.chunks import Chunk
from llm_engineering.domain.embedded_chunks import EmbeddedChunk


@PipelineDecorator.component(name="chunk_and_embed")
def chunk_and_embed(
    cleaned_documents: Annotated[list, "cleaned_documents"],
) -> Annotated[list, "embedded_documents"]:
    """Chunk every cleaned document and embed the resulting chunks.

    Each document is passed to ``ChunkingDispatcher.dispatch``; the chunks it
    yields are grouped into fixed-size batches and each batch is passed to
    ``EmbeddingDispatcher.dispatch``. While doing so, a summary ``metadata``
    dict is assembled (per-category chunking/embedding info, distinct authors
    and counts).

    Args:
        cleaned_documents: The documents to chunk and embed.

    Returns:
        A flat list with the embedded chunks of all documents, in processing
        order.
    """
    # Number of chunks handed to the embedding dispatcher per call.
    embedding_batch_size = 10

    def _accumulate_category_metadata(items: list, metadata: dict, *, count_items: bool) -> dict:
        """Fold ``items`` into the per-category ``metadata`` summary.

        The first item seen for a category seeds that category's entry with a
        copy of its ``metadata``; every item contributes its author, and, when
        ``count_items`` is true, increments the category's ``"num_chunks"``.
        """
        for item in items:
            category = item.get_category()
            if category not in metadata:
                # Copy instead of aliasing: the bookkeeping keys written below
                # ("authors", "num_chunks") must not leak into the item's own
                # metadata, which belongs to the chunk that is returned/stored.
                entry = dict(item.metadata)
                entry["authors"] = list(entry.get("authors", ()))
                metadata[category] = entry

            entry = metadata[category]
            entry.setdefault("authors", [])
            if count_items:
                entry["num_chunks"] = entry.get("num_chunks", 0) + 1
            entry["authors"].append(item.author_full_name)

        for value in metadata.values():
            if isinstance(value, dict) and "authors" in value:
                # De-duplicate while keeping first-seen order, so the summary
                # is deterministic across runs.
                value["authors"] = list(dict.fromkeys(value["authors"]))

        return metadata

    def _add_chunks_metadata(chunks: list[Chunk], metadata: dict) -> dict:
        """Update the chunking summary (authors and chunk count per category)."""
        return _accumulate_category_metadata(chunks, metadata, count_items=True)

    def _add_embeddings_metadata(embedded_chunks: list[EmbeddedChunk], metadata: dict) -> dict:
        """Update the embedding summary (authors per category)."""
        return _accumulate_category_metadata(embedded_chunks, metadata, count_items=False)

    metadata = {"chunking": {}, "embedding": {}, "num_documents": len(cleaned_documents)}

    embedded_chunks = []
    for document in cleaned_documents:
        chunks = ChunkingDispatcher.dispatch(document)
        metadata["chunking"] = _add_chunks_metadata(chunks, metadata["chunking"])

        for batched_chunks in utils.misc.batch(chunks, embedding_batch_size):
            embedded_chunks.extend(EmbeddingDispatcher.dispatch(batched_chunks))

    metadata["embedding"] = _add_embeddings_metadata(embedded_chunks, metadata["embedding"])
    metadata["num_chunks"] = len(embedded_chunks)
    metadata["num_embedded_chunks"] = len(embedded_chunks)

    # NOTE: `metadata` is assembled but not published anywhere at the moment;
    # the former step-context call
    # (`add_output_metadata(output_name="embedded_documents", metadata=metadata)`)
    # is disabled. TODO: report it through the pipeline's own mechanism.

    return embedded_chunks