Spaces:

LordFarquaad42
/

Groove-GPT

Sleeping

File size: 4,191 Bytes

5cb3e42

class Chroma:
    """Thin wrapper around a persistent ChromaDB collection of PDF text chunks.

    On construction it opens (or creates) the on-disk database at ``DB_PATH``
    and the collection named ``COLLECTION_NAME``, embedding documents with the
    SentenceTransformer model named by ``MODEL_NAME``.
    """

    def __init__(self) -> None:
        """
        Open the persistent client and the collection, creating them if missing.

        Sets ``self.client`` (the ChromaDB persistent client) and
        ``self.schemer`` (the collection) in every code path.
        """
        import os

        import chromadb
        from chromadb.utils import embedding_functions

        self.DB_PATH = "./chromadb_linux/"
        self.MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1"  # ~ 0.5 gb
        self.COLLECTION_NAME: str = "scheme"
        self.EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=self.MODEL_NAME
        )

        # Only used to tell the user that a fresh database is being set up.
        if not os.path.exists(self.DB_PATH):
            print("Collection Missing, Creating New Collection")

        # Always keep the client on the instance so both the "existing" and the
        # "new database" paths expose the same attributes.
        self.client = chromadb.PersistentClient(path=self.DB_PATH)

        # get_or_create also covers the case where the directory exists but the
        # collection was never created (plain get_collection would raise).
        self.schemer = self.client.get_or_create_collection(
            name=self.COLLECTION_NAME,
            embedding_function=self.EMBEDDING_FUNC,
        )

    def get_collection(self):
        """Return the underlying ChromaDB collection object."""
        return self.schemer

    def add_materials(self, file_path: str, batch_size: int = 1024) -> None:
        """
        Extract the text of a PDF and upsert it into the collection in chunks.

        The text of all pages is concatenated, newlines are replaced by spaces,
        and the result is cut into consecutive chunks of ``batch_size``
        characters. The last chunk is right-padded with ``"."`` so that every
        stored document has exactly ``batch_size`` characters.

        Args:
            file_path: Path of the PDF file to ingest.
            batch_size: Number of characters per stored chunk (default 1024).

        Raises:
            ValueError: If ``batch_size`` is not a positive integer.
        """
        from pypdf import PdfReader

        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        doc = PdfReader(file_path)

        # join() avoids quadratic string growth; "or ''" guards against pages
        # for which no text could be extracted.
        text_content: str = "".join(page.extract_text() or "" for page in doc.pages)

        # str.replace returns a new string, so the result must be re-bound.
        text_content = text_content.replace("\n", " ")

        if not text_content:
            # Nothing to embed; skip the upsert instead of sending empty lists.
            print("No extractable text found, nothing to upsert")
            return

        padding_element = "."
        batch_documents = []
        batch_ids = []
        batch_metadata = []

        total_length = len(text_content)
        for i in range(0, total_length, batch_size):
            # Slicing clamps at the end of the string; ljust pads the final,
            # shorter chunk up to batch_size with the padding character.
            batch = text_content[i : i + batch_size].ljust(batch_size, padding_element)

            print(f"Batch {i}/{total_length}")
            # Store the chunk itself, not the whole document text.
            batch_documents.append(batch)
            # NOTE(review): ids depend only on the offset and the chunk's first
            # character, so ingesting a second PDF overwrites chunks of the
            # first one. Confirm whether that is intended.
            batch_ids.append(f"batch{i}{batch[0]}")
            batch_metadata.append({"length": len(batch)})

        print("Upserting into collection")
        self.schemer.upsert(
            ids=batch_ids,
            metadatas=batch_metadata,
            documents=batch_documents,
        )

    @staticmethod
    def encode_image(image) -> str:
        """
        Serialize an image as JPEG and return it as a base64 text string.

        Declared as a static method so it works both as
        ``Chroma.encode_image(img)`` and on an instance.

        Args:
            image: Object exposing ``save(fileobj, format=...)``
                (presumably a PIL image -- confirm against callers).

        Returns:
            The base64 encoding of the JPEG bytes, decoded as UTF-8 text.
        """
        import base64
        import io

        byte_arr = io.BytesIO()
        image.save(byte_arr, format="JPEG")
        encoded_image = base64.b64encode(byte_arr.getvalue()).decode("utf-8")
        return encoded_image

    async def image_to_text(self, image) -> object:
        """
        Ask the OpenAI chat API to transcribe an image and return parsed JSON.

        Args:
            image: Base64-encoded JPEG data as text; it is embedded verbatim in
                a ``data:image/jpeg;base64,`` URL (e.g. the output of
                ``encode_image``).

        Returns:
            The model's reply parsed with ``json.loads``. The prompt asks for
            the keys ``'content'`` and ``'text'``, but the shape is whatever
            the model returns.
        """
        import json

        from openai import AsyncOpenAI

        # Use the async client so awaiting this coroutine does not block the
        # event loop while the HTTP request is in flight.
        client = AsyncOpenAI()

        response = await client.chat.completions.create(
            model="gpt-4-turbo",
            response_format={"type": "json_object"},
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Transcribe the contents of this image and return a JSON object that contains the text. It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. Content will be a line describing what the content of text will be, and text will be a simple transcription of the image",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # A data URL is "data:<mime>;base64,<payload>";
                                # there must be no ';' before the comma.
                                "url": f"data:image/jpeg;base64,{image}",
                                "detail": "high",
                            },
                        },
                    ],
                }
            ],
        )
        return json.loads(response.choices[0].message.content)


if __name__ == "__main__":
    # Script entry point: open the vector store, then ingest the PDF below.
    store = Chroma()
    store.add_materials("data/Essentials of Programming Languages 2001.pdf")