Spaces:
Sleeping
Sleeping
Upload vectordatabase.py
Browse files- aimakerspace/vectordatabase.py +83 -81
aimakerspace/vectordatabase.py
CHANGED
@@ -1,81 +1,83 @@
|
|
1 |
-
import numpy as np
|
2 |
-
from collections import defaultdict
|
3 |
-
from typing import List, Tuple, Callable
|
4 |
-
from aimakerspace.openai_utils.embedding import EmbeddingModel
|
5 |
-
import asyncio
|
6 |
-
|
7 |
-
|
8 |
-
def cosine_similarity(vector_a: np.array, vector_b: np.array) -> float:
    """Compute the cosine similarity between two vectors.

    Args:
        vector_a: First vector.
        vector_b: Second vector, with the same length as ``vector_a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero
        norm the similarity is undefined; ``0.0`` is returned in that case
        instead of dividing by zero, so the result can always be sorted.
    """
    norm_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    # A zero-length direction has no angle to anything: report "no similarity"
    # rather than producing nan/inf (and a NumPy RuntimeWarning).
    if norm_product == 0:
        return 0.0
    return np.dot(vector_a, vector_b) / norm_product
|
14 |
-
|
15 |
-
|
16 |
-
class VectorDatabase:
|
17 |
-
def __init__(self, embedding_model: EmbeddingModel = None):
|
18 |
-
self.vectors = defaultdict(np.array)
|
19 |
-
self.embedding_model = embedding_model or EmbeddingModel()
|
20 |
-
|
21 |
-
def insert(self, key: str, vector: np.array) -> None:
|
22 |
-
self.vectors[key] = vector
|
23 |
-
|
24 |
-
def search(
|
25 |
-
self,
|
26 |
-
query_vector: np.array,
|
27 |
-
k: int,
|
28 |
-
distance_measure: Callable = cosine_similarity,
|
29 |
-
) -> List[Tuple[str, float]]:
|
30 |
-
scores = [
|
31 |
-
(key, distance_measure(query_vector, vector))
|
32 |
-
for key, vector in self.vectors.items()
|
33 |
-
]
|
34 |
-
return sorted(scores, key=lambda x: x[1], reverse=True)[:k]
|
35 |
-
|
36 |
-
def search_by_text(
|
37 |
-
self,
|
38 |
-
query_text: str,
|
39 |
-
k: int,
|
40 |
-
distance_measure: Callable = cosine_similarity,
|
41 |
-
return_as_text: bool = False,
|
42 |
-
) -> List[Tuple[str, float]]:
|
43 |
-
query_vector = self.embedding_model.get_embedding(query_text)
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
"
|
62 |
-
"
|
63 |
-
"
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
from collections import defaultdict
|
3 |
+
from typing import List, Tuple, Callable
|
4 |
+
from aimakerspace.openai_utils.embedding import EmbeddingModel
|
5 |
+
import asyncio
|
6 |
+
|
7 |
+
|
8 |
+
def cosine_similarity(vector_a: np.array, vector_b: np.array) -> float:
    """Compute the cosine similarity between two vectors.

    Args:
        vector_a: First vector.
        vector_b: Second vector, with the same length as ``vector_a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero
        norm the similarity is undefined; ``0.0`` is returned in that case
        instead of dividing by zero, so the result can always be sorted.
    """
    norm_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    # A zero-length direction has no angle to anything: report "no similarity"
    # rather than producing nan/inf (and a NumPy RuntimeWarning).
    if norm_product == 0:
        return 0.0
    return np.dot(vector_a, vector_b) / norm_product
|
14 |
+
|
15 |
+
|
16 |
+
class VectorDatabase:
    """In-memory store mapping string keys to vectors, searched by brute force.

    Every search scores the query against every stored vector, so cost grows
    linearly with the number of entries.
    """

    def __init__(self, embedding_model: EmbeddingModel = None):
        """Create an empty database.

        Args:
            embedding_model: Object used to embed text in ``search_by_text``
                and ``abuild_from_list``. When ``None``, a default
                ``EmbeddingModel()`` is constructed.
        """
        # A plain dict replaces ``defaultdict(np.array)``: ``np.array()`` cannot
        # be called without arguments, so that factory could only ever raise
        # TypeError on a missing key. A missing key now raises KeyError.
        self.vectors = {}
        # ``is None`` rather than ``or`` so a caller-supplied model is never
        # discarded just because it happens to evaluate as falsy.
        self.embedding_model = (
            EmbeddingModel() if embedding_model is None else embedding_model
        )

    def insert(self, key: str, vector: np.array) -> None:
        """Store ``vector`` under ``key``, replacing any previous entry."""
        self.vectors[key] = vector

    def search(
        self,
        query_vector: np.array,
        k: int,
        distance_measure: Callable = cosine_similarity,
    ) -> List[Tuple[str, float]]:
        """Return the ``k`` best-scoring entries for ``query_vector``.

        Args:
            query_vector: Vector to compare against every stored vector.
            k: Maximum number of results. Zero or a negative value yields
                an empty list.
            distance_measure: Callable ``(query_vector, stored_vector) ->
                score``; higher scores rank first.

        Returns:
            Up to ``k`` ``(key, score)`` tuples, highest score first.
        """
        # Without this guard a negative k would slice ``[:k]`` and silently
        # drop the |k| worst matches instead of returning nothing.
        if k <= 0:
            return []
        scores = [
            (key, distance_measure(query_vector, vector))
            for key, vector in self.vectors.items()
        ]
        return sorted(scores, key=lambda item: item[1], reverse=True)[:k]

    def search_by_text(
        self,
        query_text: str,
        k: int,
        distance_measure: Callable = cosine_similarity,
        return_as_text: bool = False,
    ) -> List[Tuple[str, float]]:
        """Embed ``query_text`` and search the database with the result.

        Args:
            query_text: Text passed to ``embedding_model.embed_query``.
            k: Maximum number of results.
            distance_measure: Scoring callable forwarded to ``search``.
            return_as_text: When true, return only the matching keys.

        Returns:
            ``(key, score)`` tuples as produced by ``search``, or a plain
            list of keys (``List[str]``) when ``return_as_text`` is true.
        """
        query_vector = self.embedding_model.embed_query(query_text)
        results = self.search(query_vector, k, distance_measure)
        if return_as_text:
            return [key for key, _score in results]
        return results

    def retrieve_from_key(self, key: str) -> np.array:
        """Return the vector stored under ``key``, or ``None`` if absent."""
        return self.vectors.get(key)

    async def abuild_from_list(self, list_of_text: List[str]) -> "VectorDatabase":
        """Embed every text and insert it, keyed by the text itself.

        Args:
            list_of_text: Texts handed to ``embedding_model.embed_documents``
                in a single call.

        Returns:
            This database, so the call can be chained or passed to
            ``asyncio.run``.
        """
        import inspect  # local import: only this method needs it

        embeddings = self.embedding_model.embed_documents(list_of_text)
        # NOTE(review): EmbeddingModel is defined outside this file, so whether
        # ``embed_documents`` is a coroutine function is not visible here.
        # Await only when it actually returned an awaitable; awaiting a plain
        # list would raise TypeError.
        if inspect.isawaitable(embeddings):
            embeddings = await embeddings
        for text, embedding in zip(list_of_text, embeddings):
            self.insert(text, np.array(embedding))
        return self
|
57 |
+
|
58 |
+
|
59 |
+
if __name__ == "__main__":
    # Small end-to-end demo: embed a handful of sentences, then query them
    # by similarity and by exact key.
    k = 2
    query = "I think fruit is awesome!"
    sample_key = "I like to eat broccoli and bananas."
    corpus = [
        sample_key,
        "I ate a banana and spinach smoothie for breakfast.",
        "Chinchillas and kittens are cute.",
        "My sister adopted a kitten yesterday.",
        "Look at this cute hamster munching on a piece of broccoli.",
    ]

    # abuild_from_list returns the database it was called on.
    vector_db = asyncio.run(VectorDatabase().abuild_from_list(corpus))

    print(f"Closest {k} vector(s):", vector_db.search_by_text(query, k=k))
    print("Retrieved vector:", vector_db.retrieve_from_key(sample_key))
    print(
        f"Closest {k} text(s):",
        vector_db.search_by_text(query, k=k, return_as_text=True),
    )
|