class Chroma:
    def __init__(self) -> None:
        """
        Create (or open) the persistent Chroma client and load the collection,
        creating it if it does not exist yet.
        """
        import chromadb
        from chromadb.utils import embedding_functions
        import os

        self.DB_PATH = "./chromadb_linux/"
        self.MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1"  # ~0.5 GB
        self.COLLECTION_NAME: str = "scheme"
        self.EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=self.MODEL_NAME
        )
        if os.path.exists(self.DB_PATH):
            self.client = chromadb.PersistentClient(path=self.DB_PATH)
            self.schemer = self.client.get_collection(
                name=self.COLLECTION_NAME,
                embedding_function=self.EMBEDDING_FUNC,
            )
        else:
            print("Collection missing, creating new collection")
            self.client = chromadb.PersistentClient(path=self.DB_PATH)
            self.schemer = self.client.create_collection(
                name=self.COLLECTION_NAME,
                embedding_function=self.EMBEDDING_FUNC,
            )
    def get_collection(self):
        """Return the underlying Chroma collection."""
        return self.schemer
    def add_materials(self, file_path: str) -> None:
        """
        Read the PDF at file_path, split its text into fixed-size batches,
        and upsert the batches into the embedded collection.
        """
        from pypdf import PdfReader as reader

        doc = reader(file_path)
        text_content: str = ""
        for page in doc.pages:
            text_content += page.extract_text()
        text_content = text_content.replace("\n", " ")

        batch_size = 1024
        padding_element = "."
        batch_documents = []
        batch_ids = []
        batch_metadata = []
        for i in range(0, len(text_content), batch_size):
            batch = text_content[i : min(i + batch_size, len(text_content))]
            # Pad the final batch so every document has the same length.
            if len(batch) < batch_size:
                padding_needed = batch_size - len(batch)
                batch = batch + padding_element * padding_needed
            print(f"Batch {i}/{len(text_content)}")
            batch_documents.append(batch)
            # ID derived from the batch offset and its first character.
            batch_ids.append(f"batch{i}{batch[0]}")
            batch_metadata.append({"length": len(batch)})
        print("Upserting into collection")
        self.schemer.upsert(
            ids=[str(id) for id in batch_ids],
            metadatas=batch_metadata,
            documents=batch_documents,
        )
    def encode_image(self, image) -> str:
        """
        Encode a PIL image as a base64 JPEG string.
        """
        import io
        import base64

        byte_arr = io.BytesIO()
        image.save(byte_arr, format="JPEG")
        encoded_image = base64.b64encode(byte_arr.getvalue()).decode("utf-8")
        return encoded_image
    async def image_to_text(self, image) -> object:
        """
        Transcribe a base64-encoded image with the OpenAI vision API and
        return the parsed JSON response.
        """
        from openai import OpenAI
        import json

        client = OpenAI()
        response = client.chat.completions.create(
            model="gpt-4-turbo",
            response_format={"type": "json_object"},
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Transcribe the contents of this image and return a JSON object that contains the text. It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. Content will be a line describing what the content of text will be, and text will be a simple transcription of the image",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{image}",
                                "detail": "high",
                            },
                        },
                    ],
                }
            ],
        )
        return json.loads(response.choices[0].message.content)
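

# Usage sketch for the image helpers (a minimal example, not part of the
# original script; "page.jpg" is a hypothetical path and OPENAI_API_KEY must
# be set in the environment):
#
#   from PIL import Image
#   import asyncio
#   c = Chroma()
#   encoded = c.encode_image(Image.open("page.jpg"))
#   transcription = asyncio.run(c.image_to_text(encoded))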
if __name__ == "__main__":
    c = Chroma()
    c.add_materials("data/Essentials of Programming Languages 2001.pdf")
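    # Minimal query sketch (assumption: the query text below is illustrative,
    # not part of the original script). chromadb collections expose
    # .query(query_texts=..., n_results=...).
    results = c.get_collection().query(
        query_texts=["What is an environment in Scheme?"],
        n_results=3,
    )
    print(results["documents"])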