File size: 4,191 Bytes
5cb3e42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
class Chroma:
    """
    Thin wrapper around a persistent ChromaDB collection of embedded PDF text,
    plus two image helpers (base64 encoding and transcription through OpenAI).
    """

    def __init__(self) -> None:
        """
        Open the persistent client at DB_PATH and fetch the collection,
        creating the collection when it does not exist yet.
        """
        import chromadb
        from chromadb.utils import embedding_functions
        import os

        self.DB_PATH = "./chromadb_linux/"
        self.MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1"  # ~ 0.5 gb
        self.COLLECTION_NAME: str = "scheme"
        self.EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=self.MODEL_NAME
        )

        if not os.path.exists(self.DB_PATH):
            print("Collection Missing, Creating New Collection")

        # Keep the client on the instance in every case (the create path used
        # to drop it), and use get_or_create so an existing directory without
        # the collection no longer raises.
        self.client = chromadb.PersistentClient(path=self.DB_PATH)
        self.schemer = self.client.get_or_create_collection(
            name=self.COLLECTION_NAME,
            embedding_function=self.EMBEDDING_FUNC,
        )

    def get_collection(self):
        """
        Return the collection object opened in __init__.
        """
        return self.schemer

    @staticmethod
    def _chunk_text(text: str, batch_size: int = 1024, padding_element: str = ".") -> list:
        """
        Split text into consecutive chunks of batch_size characters.

        A final chunk shorter than batch_size is right-padded with
        padding_element so every chunk has the same length. Empty text
        yields an empty list.
        """
        chunks = []
        for start in range(0, len(text), batch_size):
            chunk = text[start : start + batch_size]
            shortfall = batch_size - len(chunk)
            if shortfall > 0:
                chunk += padding_element * shortfall
            chunks.append(chunk)
        return chunks

    def add_materials(self, file_path: str) -> None:
        """
        Read the PDF at file_path, split its text into fixed-size chunks and
        upsert those chunks into the collection.

        Newlines in the extracted text are replaced by spaces before
        chunking. Nothing is upserted when the PDF yields no text.
        """
        from pypdf import PdfReader as reader

        doc = reader(file_path)
        # Join once instead of repeated +=; "or ''" guards against a page
        # that yields no text value.
        text_content: str = "".join(page.extract_text() or "" for page in doc.pages)
        # str.replace returns a new string; the result has to be re-bound.
        text_content = text_content.replace("\n", " ")

        batch_size = 1024
        padding_element = "."
        chunks = self._chunk_text(text_content, batch_size, padding_element)
        if not chunks:
            print("No extractable text found, nothing to upsert")
            return

        batch_documents = []
        batch_ids = []
        batch_metadata = []

        for index, batch in enumerate(chunks):
            offset = index * batch_size
            print(f"Batch {offset}/{len(text_content)}")
            # Store the chunk itself, not the whole document text.
            batch_documents.append(batch)
            # NOTE(review): ids depend only on the offset and the chunk's
            # first character, so chunks from different PDFs can overwrite
            # each other on upsert -- confirm whether that is intended.
            batch_ids.append(f"batch{offset}{batch[0]}")
            batch_metadata.append({"length": len(batch)})

        print("Upserting into collection")
        self.schemer.upsert(
            ids=batch_ids,
            metadatas=batch_metadata,
            documents=batch_documents,
        )

    @staticmethod
    def encode_image(image) -> str:
        """
        Serialize image as JPEG and return the bytes as a base64 string.

        image must provide save(file_obj, format=...); presumably a
        PIL.Image -- confirm against callers. Declared static so it works
        both as Chroma.encode_image(img) and on an instance.
        """
        import io
        import base64

        byte_arr = io.BytesIO()
        image.save(byte_arr, format="JPEG")
        encoded_image = base64.b64encode(byte_arr.getvalue()).decode("utf-8")
        return encoded_image

    async def image_to_text(self, image) -> object:
        """
        Ask the OpenAI chat API to transcribe an image and return the parsed
        JSON reply.

        image is interpolated into a base64 JPEG data URL, so it is expected
        to be a base64 string such as the one encode_image returns. The
        prompt asks for the keys 'content' and 'text'; the reply is returned
        as parsed by json.loads without further validation.
        """
        from openai import AsyncOpenAI
        import json

        # Async client so awaiting the request does not block the event loop.
        client = AsyncOpenAI()

        response = await client.chat.completions.create(
            model="gpt-4-turbo",
            response_format={"type": "json_object"},
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Transcribe the contents of this image and return a JSON object that contains the text. It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. Content will be a line describing what the content of text will be, and text will be a simple transcription of the image",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # Data URL syntax is "data:<mime>;base64,<data>";
                                # the previous "base64;," form was malformed.
                                "url": f"data:image/jpeg;base64,{image}",
                                "detail": "high",
                            },
                        },
                    ],
                }
            ],
        )
        return json.loads(response.choices[0].message.content)


if __name__ == "__main__":
    # Manual run: embed the PDF at the hard-coded path into the collection.
    Chroma().add_materials(
        "data/Essentials of Programming Languages 2001.pdf"
    )