File size: 6,927 Bytes
249d2c8
234eac0
637aeec
234eac0
 
 
 
249d2c8
4c501f4
234eac0
249d2c8
234eac0
 
 
 
 
 
 
 
 
249d2c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234eac0
249d2c8
 
 
 
 
 
 
234eac0
249d2c8
 
 
 
637aeec
 
4c501f4
637aeec
4c501f4
637aeec
4c501f4
637aeec
234eac0
4c501f4
637aeec
 
 
 
 
4c501f4
637aeec
 
 
 
 
 
 
4c501f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234eac0
 
 
 
 
 
637aeec
 
 
 
 
 
 
 
4c501f4
637aeec
 
 
234eac0
 
 
 
 
 
 
 
 
637aeec
 
 
234eac0
 
 
 
 
637aeec
4c501f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234eac0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import asyncio
import uuid
from collections import defaultdict
from enum import Enum
from functools import partial
from typing import Callable, List, Tuple

import numpy as np
from qdrant_client import QdrantClient, models
from qdrant_client.models import (
    Batch,
    Distance,
    Payload,
    PointStruct,
    VectorParams,
    VectorStruct,
)

from aimakerspace.openai_utils.embedding import EmbeddingModel

collection_name = "embedding_collection"

def cosine_similarity(vector_a: np.array, vector_b: np.array) -> float:
    """Return the cosine similarity (dot product over norm product) of two vectors."""
    numerator = np.dot(vector_a, vector_b)
    denominator = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    return numerator / denominator


def euclidean_distance(vector_a: np.array, vector_b: np.array) -> float:
    """Return the Euclidean (L2) distance between two vectors."""
    diff = vector_a - vector_b
    return np.sqrt(np.sum(diff * diff))


def manhattan_distance(vector_a: np.array, vector_b: np.array) -> float:
    """Return the Manhattan (L1) distance between two vectors."""
    return np.abs(vector_a - vector_b).sum()


def minkowski_distance(vector_a: np.array, vector_b: np.array, p: float = 2.0) -> float:
    """
    Computes the Minkowski distance between two vectors.

    Parameters:
    vector_a (np.array): First vector.
    vector_b (np.array): Second vector.
    p (float): The order of the norm. p=1 gives Manhattan distance,
        p=2 (the default) gives Euclidean distance. Must be positive.

    Returns:
    float: Minkowski distance between vector_a and vector_b.

    Raises:
    ValueError: If p is not positive (the formula is undefined and the
        original silently produced nan/inf).
    """
    if p <= 0:
        raise ValueError(f"p must be positive, got {p}")

    # Ensure the input vectors are NumPy arrays
    vector_a = np.asarray(vector_a)
    vector_b = np.asarray(vector_b)

    # Compute Minkowski distance: (sum |a_i - b_i|^p)^(1/p)
    distance = np.sum(np.abs(vector_a - vector_b) ** p) ** (1 / p)
    return distance


class DistanceMeasure(Enum):
    """Enumerates the available distance/similarity callables.

    Plain functions assigned in an Enum body are treated as methods, not
    members, so the original members did not exist at all. Wrapping each
    function in ``functools.partial`` makes it a real member; invoke via
    ``DistanceMeasure.COSINE_SIMILARITY.value(a, b)``.
    """
    COSINE_SIMILARITY = partial(cosine_similarity)
    EUCLIDEAN_DISTANCE = partial(euclidean_distance)
    MANHATTAN_DISTANCE = partial(manhattan_distance)
    MINKOWSKI_DISTANCE = partial(minkowski_distance)


class VectorDatabaseOptions(Enum):
    """Supported vector-store backends: an in-process dict or Qdrant."""
    DICTIONARY = "dictionary"
    QDRANT = "qdrant"


class VectorDatabase:
    """Vector store backed by either an in-memory dict (DICTIONARY) or an
    in-memory Qdrant collection (QDRANT).

    In QDRANT mode each document's embedding is stored under the named
    vector ``"text"`` and the source text is kept in the point payload
    under the ``"text"`` key.
    """

    def __init__(
        self,
        vector_db_options: VectorDatabaseOptions,
        embedding_model: EmbeddingModel = None,
    ):
        """Initialize the chosen backend.

        Parameters:
        vector_db_options (VectorDatabaseOptions): backend to use.
        embedding_model (EmbeddingModel): model used to embed text; a
            default EmbeddingModel is constructed when None.
        """
        self.vectors = None
        self.vector_db_options = vector_db_options
        self.embedding_model = embedding_model or EmbeddingModel()
        if vector_db_options == VectorDatabaseOptions.DICTIONARY:
            self.vectors = defaultdict(np.array)
        if vector_db_options == VectorDatabaseOptions.QDRANT:
            self.qdrant_client = QdrantClient(":memory:")
            # Bug fix: read dimensions from self.embedding_model so the
            # default model works when embedding_model is passed as None
            # (the original dereferenced the possibly-None parameter).
            vector_params = VectorParams(
                size=self.embedding_model.dimensions,
                distance=Distance.COSINE,
            )
            # The collection declares a *named* vector "text"; every
            # upsert and search must address the vector by this name.
            self.qdrant_client.create_collection(
                collection_name=collection_name,
                vectors_config={"text": vector_params},
            )

    def insert(self, key: str, vectors: np.array) -> None:
        """Upsert one embedding into Qdrant, storing `key` (the source
        text) as the point payload. Only meaningful for the QDRANT backend.
        """
        point = PointStruct(
            id=str(uuid.uuid4()),
            # Bug fix: the named vector must be "text" to match the
            # collection config and search(); the original used "default",
            # which the collection would reject.
            vector={"text": vectors.tolist()},
            payload={"text": key},
        )
        self.qdrant_client.upsert(
            collection_name=collection_name,
            points=[point],
        )

    def search(
        self,
        query_vector: np.array,
        k: int,
        distance_measure: Callable = cosine_similarity,
    ) -> List[Tuple[str, float]]:
        """Return the k nearest stored texts with their scores.

        NOTE(review): `distance_measure` is accepted for interface
        compatibility, but the QDRANT backend always ranks with the
        collection's configured cosine distance.
        """
        search_results = self.qdrant_client.search(
            collection_name=collection_name,
            query_vector=("text", query_vector),
            limit=k,
        )
        return [(result.payload["text"], result.score) for result in search_results]

    def search_by_text(
        self,
        query_text: str,
        k: int,
        distance_measure: Callable = cosine_similarity,
        return_as_text: bool = False,
    ) -> List[Tuple[str, float]]:
        """Embed `query_text` and return the k closest matches as
        (text, score) pairs, or just the texts when `return_as_text`.
        """
        query_vector = self.embedding_model.get_embedding(query_text)
        results = self.search(query_vector, k, distance_measure)
        return [result[0] for result in results] if return_as_text else results

    def retrieve_from_key(self, key: str) -> np.array:
        """Return the vector stored under `key`, or None when absent.

        Robustness fix: the QDRANT backend keeps `self.vectors` as None,
        so guard instead of raising AttributeError.
        """
        if self.vectors is None:
            return None
        return self.vectors.get(key, None)

    async def abuild_from_list(self, list_of_text: List[str]) -> "VectorDatabase":
        """Embed every text asynchronously and bulk-upsert the points into
        Qdrant. Returns self for fluent use.
        """
        embeddings = await self.embedding_model.async_get_embeddings(list_of_text)
        points = [
            models.PointStruct(
                id=str(uuid.uuid4()),
                vector={"text": embedding},
                payload={"text": text},
            )
            for text, embedding in zip(list_of_text, embeddings)
        ]
        self.qdrant_client.upsert(
            collection_name=collection_name,
            points=points,
        )
        return self


if __name__ == "__main__":
    list_of_text = [
        "I like to eat broccoli and bananas.",
        "I ate a banana and spinach smoothie for breakfast.",
        "Chinchillas and kittens are cute.",
        "My sister adopted a kitten yesterday.",
        "Look at this cute hamster munching on a piece of broccoli.",
    ]

    # Bug fix: VectorDatabase requires a backend option; the original
    # called VectorDatabase() with no arguments, which raises TypeError.
    # QDRANT is chosen because insert/search are implemented for it.
    vector_db = VectorDatabase(VectorDatabaseOptions.QDRANT)
    vector_db = asyncio.run(vector_db.abuild_from_list(list_of_text))
    k = 2

    searched_vector = vector_db.search_by_text("I think fruit is awesome!", k=k)
    print(f"Closest {k} vector(s):", searched_vector)

    # Returns None in QDRANT mode (no in-memory dict is maintained).
    retrieved_vector = vector_db.retrieve_from_key(
        "I like to eat broccoli and bananas."
    )
    print("Retrieved vector:", retrieved_vector)

    relevant_texts = vector_db.search_by_text(
        "I think fruit is awesome!", k=k, return_as_text=True
    )
    print(f"Closest {k} text(s):", relevant_texts)