# File size: 2,678 Bytes
# Revision: 4304c6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from abc import ABC, abstractmethod
from collections.abc import Sequence
from typing import Any, Optional

from pydantic import BaseModel, Field


class Document(BaseModel):
    """Class for storing a piece of text and associated metadata."""

    # The text content of the document (required).
    page_content: str

    # Arbitrary metadata about the page content (e.g., source, relationships to
    # other documents, etc.). ``None`` is accepted; when omitted, each instance
    # gets its own fresh empty dict via ``default_factory``.
    metadata: Optional[dict] = Field(default_factory=dict)


class BaseDocumentTransformer(ABC):
    """Interface for components that turn Documents into other Documents.

    Implementations receive a sequence of ``Document`` objects and hand back
    a sequence of transformed ``Document`` objects. Both a synchronous and an
    asynchronous entry point are abstract, so a concrete subclass must define
    each of them before it can be instantiated.

    Example:
        .. code-block:: python

            class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
                embeddings: Embeddings
                similarity_fn: Callable = cosine_similarity
                similarity_threshold: float = 0.95

                class Config:
                    arbitrary_types_allowed = True

                def transform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    stateful_documents = get_stateful_documents(documents)
                    embedded_documents = _get_embeddings_from_stateful_docs(
                        self.embeddings, stateful_documents
                    )
                    included_idxs = _filter_similar_embeddings(
                        embedded_documents,
                        self.similarity_fn,
                        self.similarity_threshold,
                    )
                    return [stateful_documents[i] for i in sorted(included_idxs)]

                async def atransform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    raise NotImplementedError
    """

    @abstractmethod
    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Transform a sequence of documents.

        Args:
            documents: The Documents to transform.
            **kwargs: Extra keyword arguments; their meaning is left to the
                concrete implementation.

        Returns:
            A sequence of transformed Documents.
        """

    @abstractmethod
    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Asynchronously transform a sequence of documents.

        Args:
            documents: The Documents to transform.
            **kwargs: Extra keyword arguments; their meaning is left to the
                concrete implementation.

        Returns:
            A sequence of transformed Documents.
        """