Spaces:
Sleeping
Sleeping
LordFarquaad42
committed on
Commit
·
5cb3e42
1
Parent(s):
12c8868
wrapping all database related actions under Chroma class
Browse files- Chroma.py +123 -0
- add_data.py +0 -96
- database.py +0 -15
Chroma.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
class Chroma:
    """Wrap all database-related actions behind one object.

    Constructing an instance opens the persistent ChromaDB store at
    ``DB_PATH`` and binds ``self.schemer`` to the collection named
    ``COLLECTION_NAME``, embedded with the sentence-transformer model
    ``MODEL_NAME``.
    """

    def __init__(self) -> None:
        """Open the persistent client and get (or create) the collection.

        ``self.client`` is always stored, and the collection is created
        when it does not exist yet -- including the case where the database
        directory is present but the collection is not.
        """
        import chromadb
        from chromadb.utils import embedding_functions
        import os

        self.DB_PATH = "./chromadb_linux/"
        self.MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1"  # ~ 0.5 gb
        self.COLLECTION_NAME: str = "scheme"
        self.EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=self.MODEL_NAME
        )

        if not os.path.exists(self.DB_PATH):
            print("Collection Missing, Creating New Collection")

        self.client = chromadb.PersistentClient(path=self.DB_PATH)
        # get_or_create covers both a fresh store and an existing directory
        # that does not contain the collection yet.
        self.schemer = self.client.get_or_create_collection(
            name=self.COLLECTION_NAME,
            embedding_function=self.EMBEDDING_FUNC,
        )

    def get_collection(self):
        """Return the collection object bound in ``__init__``."""
        return self.schemer

    @staticmethod
    def _split_into_batches(text, batch_size=1024, padding_element="."):
        """Cut *text* into ``batch_size``-long slices.

        Returns a list of ``(start_offset, batch)`` tuples. The final slice
        is right-padded with ``padding_element`` up to ``batch_size``
        (assumes ``padding_element`` is a single character). Empty *text*
        yields an empty list.
        """
        batches = []
        for start in range(0, len(text), batch_size):
            batch = text[start : start + batch_size]
            batch += padding_element * (batch_size - len(batch))
            batches.append((start, batch))
        return batches

    def add_materials(self, file_path: str) -> None:
        """Extract the text of the PDF at *file_path* and upsert it in batches.

        Newlines are replaced by spaces, the text is split into fixed-size
        batches, and every batch becomes one document whose id is
        ``batch<offset><first character>`` and whose metadata records the
        batch length. Nothing is upserted when no text could be extracted.
        """
        from pypdf import PdfReader as reader

        doc = reader(file_path)
        # `or ""` guards against pages for which no text is returned.
        text_content: str = "".join(page.extract_text() or "" for page in doc.pages)
        # str.replace returns a new string; the result has to be kept.
        text_content = text_content.replace("\n", " ")

        batch_documents = []
        batch_ids = []
        batch_metadata = []

        for start, batch in self._split_into_batches(text_content):
            print(f"Batch {start}/{len(text_content)}")
            # Store the slice itself, not the whole document.
            batch_documents.append(batch)
            batch_ids.append(f"batch{start}{batch[0]}")
            batch_metadata.append({"length": len(batch)})

        if not batch_documents:
            print("No extractable text found, nothing to upsert")
            return

        print("Upserting into collection")
        self.schemer.upsert(
            ids=batch_ids,
            metadatas=batch_metadata,
            documents=batch_documents,
        )

    @staticmethod
    def encode_image(image) -> str:
        """Return *image* saved as JPEG and encoded as a base64 string.

        *image* must provide ``save(file_object, format=...)``
        (presumably a PIL image -- confirm against callers). Declared as a
        static method so it can be called on the class or on an instance.
        """
        import io
        import base64

        byte_arr = io.BytesIO()
        image.save(byte_arr, format="JPEG")
        encoded_image = base64.b64encode(byte_arr.getvalue()).decode("utf-8")
        return encoded_image

    async def image_to_text(self, image) -> object:
        """Ask the ``gpt-4-turbo`` model to transcribe an image.

        *image* is interpolated into a ``data:image/jpeg;base64,`` URL, so
        it is expected to be base64 text such as ``encode_image`` returns.
        The reply is requested as a JSON object and returned parsed by
        ``json.loads``.

        NOTE(review): the request uses the synchronous ``OpenAI`` client, so
        awaiting this coroutine blocks until the response arrives.
        """
        from openai import OpenAI
        import json

        client = OpenAI()

        response = client.chat.completions.create(
            model="gpt-4-turbo",
            response_format={"type": "json_object"},
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Transcribe the contents of this image and return a JSON object that contains the text. It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. Content will be a line describing what the content of text will be, and text will be a simple transcription of the image",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # RFC 2397 form is ";base64," -- the stray
                                # ";" before the comma made the URL invalid.
                                "url": f"data:image/jpeg;base64,{image}",
                                "detail": "high",
                            },
                        },
                    ],
                }
            ],
        )
        return json.loads(response.choices[0].message.content)
|
119 |
+
|
120 |
+
|
121 |
+
if __name__ == "__main__":
    # Manual run: open the store and ingest one sample PDF.
    store = Chroma()
    store.add_materials("data/Essentials of Programming Languages 2001.pdf")
|
add_data.py
DELETED
@@ -1,96 +0,0 @@
|
|
1 |
-
import chromadb
|
2 |
-
from chromadb.utils import embedding_functions
|
3 |
-
|
4 |
-
|
5 |
-
def create_client():
    """Create the "schemer2" collection in the on-disk store and return it.

    The collection is embedded with the sentence-transformer model
    "mixedbread-ai/mxbai-embed-large-v1".
    """
    store = chromadb.PersistentClient(path="./chromadb_linux/")
    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="mixedbread-ai/mxbai-embed-large-v1"  # ~ 0.5 gb
    )
    return store.create_collection(name="schemer2", embedding_function=embedder)
|
17 |
-
|
18 |
-
def get_client():
    """Fetch the existing "scheme" collection from the on-disk store.

    The collection is bound to the sentence-transformer model
    "mixedbread-ai/mxbai-embed-large-v1".
    """
    store = chromadb.PersistentClient(path="./chromadb_linux/")
    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="mixedbread-ai/mxbai-embed-large-v1"  # ~ 0.5 gb
    )
    return store.get_collection(name="scheme", embedding_function=embedder)
|
30 |
-
|
31 |
-
|
32 |
-
def update_collection(iter: int, text: object, client: chromadb.Collection):
    """Add one transcribed entry to the collection *client*.

    *text* is indexed with the keys "text" (stored as the document) and
    "content" (joined with *iter* to form the id); the metadata is always
    ``{"source": "pdf"}``.
    """
    document = text["text"]
    entry_id = text["content"] + str(iter)
    client.add(documents=[document], metadatas=[{"source": "pdf"}], ids=[entry_id])
|
34 |
-
|
35 |
-
|
36 |
-
def encode_image(image) -> str:
    """Save *image* as JPEG into memory and return the bytes as base64 text.

    *image* must provide ``save(file_object, format=...)``
    (presumably a PIL image -- confirm against callers).
    """
    from base64 import b64encode
    from io import BytesIO

    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    jpeg_bytes = buffer.getvalue()
    return b64encode(jpeg_bytes).decode("utf-8")
|
44 |
-
|
45 |
-
|
46 |
-
async def image_to_text(image) -> object:
    """Ask the ``gpt-4-turbo`` model to transcribe an image.

    *image* is interpolated into a ``data:image/jpeg;base64,`` URL, so it
    is expected to be base64 text such as ``encode_image`` returns. The
    reply is requested as a JSON object and returned parsed by
    ``json.loads``.

    NOTE(review): the request uses the synchronous ``OpenAI`` client, so
    awaiting this coroutine blocks until the response arrives.
    """
    from openai import OpenAI
    import json

    client = OpenAI()

    response = client.chat.completions.create(
        model="gpt-4-turbo",
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Transcribe the contents of this image and return a JSON object that contains the text. It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. Content will be a line describing what the content of text will be, and text will be a simple transcription of the image"},
                    {
                        "type": "image_url",
                        "image_url": {
                            # RFC 2397 form is ";base64," -- the stray ";"
                            # before the comma made the URL invalid.
                            "url": f"data:image/jpeg;base64,{image}",
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
    )
    return json.loads(response.choices[0].message.content)
|
72 |
-
|
73 |
-
|
74 |
-
async def start_troggin_off(dir: str, client):
    """Walk *dir* recursively and store a transcription of every PDF page.

    Sub-directories are processed by a recursive call. Each entry whose
    path ends in ".pdf" is rendered to page images with
    ``convert_from_path``; every page is encoded, transcribed and added
    to *client* together with its page index.
    """
    import os
    from pdf2image import convert_from_path

    for entry in os.listdir(dir):
        full_path = os.path.join(dir, entry)

        if os.path.isdir(full_path):
            await start_troggin_off(full_path, client)  # recursive call
            # A directory is never rendered as a PDF, even when its name
            # happens to end in ".pdf".
            continue

        if not full_path.endswith(".pdf"):
            continue

        for i, image in enumerate(convert_from_path(full_path)):
            encoded_image = encode_image(image)
            text = await image_to_text(encoded_image)
            update_collection(i, text, client)
|
91 |
-
|
92 |
-
if __name__ == "__main__":
    from asyncio import run

    # Script entry point: create the collection, then ingest everything
    # found under "data/".
    collection = create_client()
    run(start_troggin_off("data/", collection))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
database.py
DELETED
@@ -1,15 +0,0 @@
|
|
1 |
-
import chromadb
|
2 |
-
from chromadb.utils import embedding_functions
|
3 |
-
|
4 |
-
def get_client():
    """Return the "scheme" collection from the persistent store.

    The store lives at "./chromadb_linux/" and the collection is bound to
    the sentence-transformer model "mixedbread-ai/mxbai-embed-large-v1".
    """
    store = chromadb.PersistentClient(path="./chromadb_linux/")
    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="mixedbread-ai/mxbai-embed-large-v1"  # ~ 0.5 gb
    )
    return store.get_collection(name="scheme", embedding_function=embedder)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|