LordFarquaad42 committed on
Commit
5cb3e42
·
1 Parent(s): 12c8868

wrapping all database related actions under Chroma class

Browse files
Files changed (3) hide show
  1. Chroma.py +123 -0
  2. add_data.py +0 -96
  3. database.py +0 -15
Chroma.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class Chroma:
    """Wrap every database-related action around one persistent ChromaDB collection.

    The instance owns the persistent client (``self.client``) and the
    collection (``self.schemer``) that PDF text is embedded into.
    """

    def __init__(self) -> None:
        """Open the persistent client and get, or create, the collection.

        The client is always stored on ``self.client``.  The collection is
        obtained with ``get_or_create_collection`` so that an existing database
        directory that does not yet contain the collection no longer raises.
        """
        import os

        import chromadb
        from chromadb.utils import embedding_functions

        self.DB_PATH = "./chromadb_linux/"
        self.MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1"  # ~ 0.5 gb
        self.COLLECTION_NAME: str = "scheme"
        self.EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=self.MODEL_NAME
        )

        # Keep the informational message for a first run on a fresh path.
        if not os.path.exists(self.DB_PATH):
            print("Collection Missing, Creating New Collection")

        self.client = chromadb.PersistentClient(path=self.DB_PATH)
        self.schemer = self.client.get_or_create_collection(
            name=self.COLLECTION_NAME,
            embedding_function=self.EMBEDDING_FUNC,
        )

    def get_collection(self):
        """Return the collection object held by this instance."""
        return self.schemer

    @staticmethod
    def _chunk_text(text: str, batch_size: int = 1024, padding_element: str = "."):
        """Split *text* into ``(offset, chunk)`` pairs of exactly *batch_size* characters.

        The final chunk is right-padded with *padding_element* when it is
        shorter than *batch_size*.  An empty *text* yields an empty list.
        """
        chunks = []
        for offset in range(0, len(text), batch_size):
            chunk = text[offset : offset + batch_size]
            if len(chunk) < batch_size:
                chunk += padding_element * (batch_size - len(chunk))
            chunks.append((offset, chunk))
        return chunks

    def add_materials(self, file_path: str) -> None:
        """Extract the text of the PDF at *file_path* and upsert it in chunks.

        Newlines are replaced by spaces, the text is cut into 1024-character
        chunks (the last one padded with "."), and every chunk is upserted as
        its own document with a ``{"length": ...}`` metadata entry.

        NOTE(review): ids are built from the character offset and the chunk's
        first character only, so they are not unique across different PDFs;
        adding a second file can overwrite chunks of the first.
        """
        from pypdf import PdfReader

        doc = PdfReader(file_path)
        # Guard against a falsy extract_text() result so the join cannot fail.
        text_content = "".join(page.extract_text() or "" for page in doc.pages)
        # str.replace returns a new string; the result has to be kept.
        text_content = text_content.replace("\n", " ")

        batch_documents = []
        batch_ids = []
        batch_metadata = []
        for offset, batch in self._chunk_text(text_content):
            print(f"Batch {offset}/{len(text_content)}")
            # Store the chunk itself, not the whole document.
            batch_documents.append(batch)
            batch_ids.append(f"batch{offset}{batch[0]}")
            batch_metadata.append({"length": len(batch)})

        if not batch_documents:
            # Nothing was extracted; avoid calling upsert with empty lists.
            print("No text extracted, nothing to upsert")
            return

        print("Upserting into collection")
        self.schemer.upsert(
            ids=batch_ids,
            metadatas=batch_metadata,
            documents=batch_documents,
        )

    @staticmethod
    def encode_image(image) -> str:
        """Return *image* saved as JPEG and encoded as a base64 UTF-8 string.

        *image* must provide ``save(file_obj, format=...)``
        (presumably a PIL image; confirm against callers).
        """
        import base64
        import io

        byte_arr = io.BytesIO()
        image.save(byte_arr, format="JPEG")
        return base64.b64encode(byte_arr.getvalue()).decode("utf-8")

    async def image_to_text(self, image) -> object:
        """Ask the OpenAI chat API to transcribe an image and return the parsed JSON.

        *image* is interpolated into a ``data:image/jpeg;base64,`` URL, so it
        is presumably the string produced by ``encode_image``.  The prompt asks
        for a JSON object with the keys 'content' and 'text'.
        """
        import json

        from openai import AsyncOpenAI

        # The async client keeps this coroutine from blocking the event loop.
        client = AsyncOpenAI()

        response = await client.chat.completions.create(
            model="gpt-4-turbo",
            response_format={"type": "json_object"},
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Transcribe the contents of this image and return a JSON object that contains the text. It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. Content will be a line describing what the content of text will be, and text will be a simple transcription of the image",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # A data URL is "data:<mediatype>;base64,<data>";
                                # the original had a stray ";" before the comma.
                                "url": f"data:image/jpeg;base64,{image}",
                                "detail": "high",
                            },
                        },
                    ],
                }
            ],
        )
        return json.loads(response.choices[0].message.content)
119
+
120
+
121
if __name__ == "__main__":
    # Script entry point: build the wrapper and ingest the sample PDF.
    Chroma().add_materials("data/Essentials of Programming Languages 2001.pdf")
add_data.py DELETED
@@ -1,96 +0,0 @@
1
- import chromadb
2
- from chromadb.utils import embedding_functions
3
-
4
-
5
def create_client():
    """Return the "schemer2" collection, creating it when it does not exist yet.

    ``get_or_create_collection`` is used so that running the script a second
    time against the same database path no longer fails because the
    collection already exists.

    NOTE(review): this uses the collection name "schemer2" while get_client
    reads "scheme" - confirm which one is intended.
    """
    client = chromadb.PersistentClient(path="./chromadb_linux/")
    MODEL_NAME: str = "mixedbread-ai/mxbai-embed-large-v1"  # ~ 0.5 gb
    COLLECTION_NAME: str = "schemer2"
    EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=MODEL_NAME
    )
    return client.get_or_create_collection(
        name=COLLECTION_NAME,
        embedding_function=EMBEDDING_FUNC,
    )
17
-
18
def get_client():
    """Open the persistent database and return its existing "scheme" collection."""
    db = chromadb.PersistentClient(path="./chromadb_linux/")
    # Sentence-transformer model used to embed documents (~ 0.5 gb).
    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="mixedbread-ai/mxbai-embed-large-v1"
    )
    return db.get_collection(name="scheme", embedding_function=embedder)
30
-
31
-
32
def update_collection(iter: int, text: object, client: chromadb.Collection):
    """Add one transcribed entry to the collection.

    ``text["text"]`` becomes the document and ``text["content"]`` followed by
    *iter* becomes its id; the metadata marks the source as "pdf".
    """
    document = text["text"]
    entry_id = text["content"] + str(iter)
    client.add(
        documents=[document],
        metadatas=[{"source": "pdf"}],
        ids=[entry_id],
    )
34
-
35
-
36
def encode_image(image) -> str:
    """Save *image* as JPEG into memory and return the bytes as a base64 string."""
    from base64 import b64encode
    from io import BytesIO

    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    jpeg_bytes = buffer.getvalue()
    return b64encode(jpeg_bytes).decode("utf-8")
44
-
45
-
46
async def image_to_text(image) -> object:
    """Ask the OpenAI chat API to transcribe an image and return the parsed JSON.

    *image* is interpolated into a ``data:image/jpeg;base64,`` URL, so it is
    presumably a base64-encoded JPEG string.  The prompt asks for a JSON
    object with the keys 'content' and 'text'.
    """
    import json

    from openai import AsyncOpenAI

    # The async client keeps this coroutine from blocking the event loop.
    client = AsyncOpenAI()

    response = await client.chat.completions.create(
        model="gpt-4-turbo",
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Transcribe the contents of this image and return a JSON object that contains the text. It must be structured in the following manner: two entries with the following keys: 'content' and 'text'. Content will be a line describing what the content of text will be, and text will be a simple transcription of the image",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            # A data URL is "data:<mediatype>;base64,<data>";
                            # the original had a stray ";" before the comma.
                            "url": f"data:image/jpeg;base64,{image}",
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
    )
    return json.loads(response.choices[0].message.content)
72
-
73
-
74
async def start_troggin_off(dir: str, client):
    """Walk *dir* recursively and load every ".pdf" file into *client*.

    Each PDF is converted to page images; every page is base64-encoded,
    transcribed with image_to_text and stored with update_collection under
    its page index.
    """
    import os

    from pdf2image import convert_from_path

    for entry in os.listdir(dir):
        full_path = os.path.join(dir, entry)

        if os.path.isdir(full_path):
            await start_troggin_off(full_path, client)  # recursive call
            # A directory is never a PDF, even when its name ends in ".pdf".
            continue

        if not full_path.endswith(".pdf"):
            continue

        for page_index, page_image in enumerate(convert_from_path(full_path)):
            encoded_image = encode_image(page_image)
            text = await image_to_text(encoded_image)
            update_collection(page_index, text, client)
91
-
92
if __name__ == "__main__":
    import asyncio

    # Script entry point: create the collection, then ingest everything under data/.
    asyncio.run(start_troggin_off("data/", create_client()))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
database.py DELETED
@@ -1,15 +0,0 @@
1
- import chromadb
2
- from chromadb.utils import embedding_functions
3
-
4
def get_client():
    """Return the existing "scheme" collection from the persistent database."""
    store = chromadb.PersistentClient(path="./chromadb_linux/")
    model: str = "mixedbread-ai/mxbai-embed-large-v1"  # ~ 0.5 gb
    embed_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=model
    )
    collection = store.get_collection(
        name="scheme",
        embedding_function=embed_fn,
    )
    return collection