acpotts committed on
Commit
2823e81
·
verified ·
1 Parent(s): 9caba04

Upload vectordatabase.py

Browse files
Files changed (1) hide show
  1. aimakerspace/vectordatabase.py +83 -81
aimakerspace/vectordatabase.py CHANGED
@@ -1,81 +1,83 @@
1
- import numpy as np
2
- from collections import defaultdict
3
- from typing import List, Tuple, Callable
4
- from aimakerspace.openai_utils.embedding import EmbeddingModel
5
- import asyncio
6
-
7
-
8
- def cosine_similarity(vector_a: np.array, vector_b: np.array) -> float:
9
- """Computes the cosine similarity between two vectors."""
10
- dot_product = np.dot(vector_a, vector_b)
11
- norm_a = np.linalg.norm(vector_a)
12
- norm_b = np.linalg.norm(vector_b)
13
- return dot_product / (norm_a * norm_b)
14
-
15
-
16
- class VectorDatabase:
17
- def __init__(self, embedding_model: EmbeddingModel = None):
18
- self.vectors = defaultdict(np.array)
19
- self.embedding_model = embedding_model or EmbeddingModel()
20
-
21
- def insert(self, key: str, vector: np.array) -> None:
22
- self.vectors[key] = vector
23
-
24
- def search(
25
- self,
26
- query_vector: np.array,
27
- k: int,
28
- distance_measure: Callable = cosine_similarity,
29
- ) -> List[Tuple[str, float]]:
30
- scores = [
31
- (key, distance_measure(query_vector, vector))
32
- for key, vector in self.vectors.items()
33
- ]
34
- return sorted(scores, key=lambda x: x[1], reverse=True)[:k]
35
-
36
- def search_by_text(
37
- self,
38
- query_text: str,
39
- k: int,
40
- distance_measure: Callable = cosine_similarity,
41
- return_as_text: bool = False,
42
- ) -> List[Tuple[str, float]]:
43
- query_vector = self.embedding_model.get_embedding(query_text)
44
- results = self.search(query_vector, k, distance_measure)
45
- return [result[0] for result in results] if return_as_text else results
46
-
47
- def retrieve_from_key(self, key: str) -> np.array:
48
- return self.vectors.get(key, None)
49
-
50
- async def abuild_from_list(self, list_of_text: List[str]) -> "VectorDatabase":
51
- embeddings = await self.embedding_model.async_get_embeddings(list_of_text)
52
- for text, embedding in zip(list_of_text, embeddings):
53
- self.insert(text, np.array(embedding))
54
- return self
55
-
56
-
57
- if __name__ == "__main__":
58
- list_of_text = [
59
- "I like to eat broccoli and bananas.",
60
- "I ate a banana and spinach smoothie for breakfast.",
61
- "Chinchillas and kittens are cute.",
62
- "My sister adopted a kitten yesterday.",
63
- "Look at this cute hamster munching on a piece of broccoli.",
64
- ]
65
-
66
- vector_db = VectorDatabase()
67
- vector_db = asyncio.run(vector_db.abuild_from_list(list_of_text))
68
- k = 2
69
-
70
- searched_vector = vector_db.search_by_text("I think fruit is awesome!", k=k)
71
- print(f"Closest {k} vector(s):", searched_vector)
72
-
73
- retrieved_vector = vector_db.retrieve_from_key(
74
- "I like to eat broccoli and bananas."
75
- )
76
- print("Retrieved vector:", retrieved_vector)
77
-
78
- relevant_texts = vector_db.search_by_text(
79
- "I think fruit is awesome!", k=k, return_as_text=True
80
- )
81
- print(f"Closest {k} text(s):", relevant_texts)
 
 
 
1
+ import numpy as np
2
+ from collections import defaultdict
3
+ from typing import List, Tuple, Callable
4
+ from aimakerspace.openai_utils.embedding import EmbeddingModel
5
+ import asyncio
6
+
7
+
8
def cosine_similarity(vector_a: np.ndarray, vector_b: np.ndarray) -> float:
    """Compute the cosine similarity between two vectors.

    Args:
        vector_a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        vector_b: Second vector, with the same length as ``vector_a``.

    Returns:
        The dot product of the two vectors divided by the product of their
        Euclidean norms. When either vector has zero norm the similarity is
        undefined; ``0.0`` is returned in that case.
    """
    norm_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    # Guard against 0/0: a NaN score would emit a RuntimeWarning and cannot be
    # ordered reliably when callers sort results by similarity.
    if norm_product == 0:
        return 0.0
    return np.dot(vector_a, vector_b) / norm_product
14
+
15
+
16
class VectorDatabase:
    """In-memory store that maps string keys to embedding vectors.

    Vectors are kept in a dictionary and searched by brute force: every
    stored vector is scored against the query and the best ``k`` are returned.
    """

    def __init__(self, embedding_model: "EmbeddingModel" = None):
        """Create an empty database.

        Args:
            embedding_model: Object used to embed text. When omitted, a default
                ``EmbeddingModel()`` is constructed.
        """
        # A plain dict replaces the former ``defaultdict(np.array)``: that
        # factory takes no arguments when invoked, but ``np.array()`` requires
        # one, so any missing-key ``[]`` lookup raised TypeError.
        self.vectors = {}
        if embedding_model is None:
            embedding_model = EmbeddingModel()
        self.embedding_model = embedding_model

    def insert(self, key: str, vector: np.ndarray) -> None:
        """Store ``vector`` under ``key``, replacing any existing entry."""
        self.vectors[key] = vector

    def search(
        self,
        query_vector: np.ndarray,
        k: int,
        distance_measure: Callable = cosine_similarity,
    ) -> List[Tuple[str, float]]:
        """Return the ``k`` stored entries that score highest against the query.

        Args:
            query_vector: Vector to compare against every stored vector.
            k: Maximum number of results. Values ``<= 0`` yield an empty list.
            distance_measure: Callable ``(query_vector, stored_vector) -> score``;
                higher scores rank first.

        Returns:
            ``(key, score)`` pairs sorted by descending score.
        """
        # A negative k would otherwise slice off the tail (``[:k]``) instead of
        # limiting the result count.
        if k <= 0:
            return []
        scores = [
            (key, distance_measure(query_vector, vector))
            for key, vector in self.vectors.items()
        ]
        scores.sort(key=lambda pair: pair[1], reverse=True)
        return scores[:k]

    def search_by_text(
        self,
        query_text: str,
        k: int,
        distance_measure: Callable = cosine_similarity,
        return_as_text: bool = False,
    ) -> List[Tuple[str, float]]:
        """Embed ``query_text`` and search the database with the result.

        Args:
            query_text: Text to embed and use as the query.
            k: Maximum number of results.
            distance_measure: Scoring callable forwarded to ``search``.
            return_as_text: When true, return only the matching keys.

        Returns:
            ``(key, score)`` pairs, or a list of keys when ``return_as_text``
            is true.
        """
        model = self.embedding_model
        # Prefer ``embed_query``; fall back to ``get_embedding``, the method
        # name this class called previously, for models that only expose that
        # one. NOTE(review): confirm which API the default EmbeddingModel has.
        embed = getattr(model, "embed_query", None)
        if embed is None:
            embed = model.get_embedding
        # Stored vectors are ndarrays, so hand the measure an ndarray as well.
        query_vector = np.asarray(embed(query_text))
        results = self.search(query_vector, k, distance_measure)
        if return_as_text:
            return [key for key, _ in results]
        return results

    def retrieve_from_key(self, key: str) -> np.ndarray:
        """Return the vector stored under ``key``, or ``None`` if absent."""
        return self.vectors.get(key)

    async def abuild_from_list(self, list_of_text: List[str]) -> "VectorDatabase":
        """Embed every text and insert it, using the text itself as the key.

        Args:
            list_of_text: Texts to embed and store.

        Returns:
            This database, to allow chaining.
        """
        model = self.embedding_model
        # Prefer ``embed_documents``; fall back to ``async_get_embeddings``,
        # the method name this class called previously.
        # NOTE(review): confirm which API the default EmbeddingModel has.
        embed = getattr(model, "embed_documents", None)
        if embed is None:
            embed = model.async_get_embeddings
        embeddings = embed(list_of_text)
        # Only await when the model actually returned an awaitable; awaiting
        # the plain list produced by a synchronous method raises TypeError.
        if hasattr(embeddings, "__await__"):
            embeddings = await embeddings
        for text, embedding in zip(list_of_text, embeddings):
            self.insert(text, np.array(embedding))
        return self
57
+
58
+
59
if __name__ == "__main__":
    # Demo: index a few sentences, then query the database by text.
    k = 2
    sample_texts = [
        "I like to eat broccoli and bananas.",
        "I ate a banana and spinach smoothie for breakfast.",
        "Chinchillas and kittens are cute.",
        "My sister adopted a kitten yesterday.",
        "Look at this cute hamster munching on a piece of broccoli.",
    ]

    # abuild_from_list returns the database itself, so build and bind in one step.
    vector_db = asyncio.run(VectorDatabase().abuild_from_list(sample_texts))

    # Top-k matches as (key, score) pairs.
    closest_pairs = vector_db.search_by_text("I think fruit is awesome!", k=k)
    print(f"Closest {k} vector(s):", closest_pairs)

    # Direct lookup of a stored vector by its key.
    stored_vector = vector_db.retrieve_from_key("I like to eat broccoli and bananas.")
    print("Retrieved vector:", stored_vector)

    # Same query again, this time returning only the matching texts.
    closest_texts = vector_db.search_by_text(
        "I think fruit is awesome!",
        k=k,
        return_as_text=True,
    )
    print(f"Closest {k} text(s):", closest_texts)