Spaces:
Sleeping
Sleeping
File size: 6,927 Bytes
249d2c8 234eac0 637aeec 234eac0 249d2c8 4c501f4 234eac0 249d2c8 234eac0 249d2c8 234eac0 249d2c8 234eac0 249d2c8 637aeec 4c501f4 637aeec 4c501f4 637aeec 4c501f4 637aeec 234eac0 4c501f4 637aeec 4c501f4 637aeec 4c501f4 234eac0 637aeec 4c501f4 637aeec 234eac0 637aeec 234eac0 637aeec 4c501f4 234eac0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
from enum import Enum
import numpy as np
import uuid
from collections import defaultdict
from typing import List, Tuple, Callable
from aimakerspace.openai_utils.embedding import EmbeddingModel
import asyncio
from qdrant_client import models, QdrantClient
from qdrant_client.models import PointStruct,VectorParams,Distance,Batch,VectorStruct,Payload
# Name of the Qdrant collection that VectorDatabase creates, upserts into and searches.
collection_name = "embedding_collection"
def cosine_similarity(vector_a: np.array, vector_b: np.array) -> float:
    """Return the cosine similarity between two vectors.

    Args:
        vector_a: First vector.
        vector_b: Second vector, same length as ``vector_a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero norm the
        ratio is undefined (0/0); ``0.0`` is returned in that case instead of
        ``nan`` so that ranking code can still sort the result.
    """
    denominator = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    if denominator == 0:
        # Avoid the 0/0 division, which would yield nan and a RuntimeWarning.
        return 0.0
    return np.dot(vector_a, vector_b) / denominator
def euclidean_distance(vector_a: np.array, vector_b: np.array) -> float:
    """Return the Euclidean (L2) distance between two vectors.

    Args:
        vector_a: First vector; any array-like (NumPy array, list, tuple).
        vector_b: Second vector, same shape as ``vector_a``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    # Coerce to arrays so plain Python sequences work, matching
    # minkowski_distance; list - list would otherwise raise TypeError.
    difference = np.asarray(vector_a) - np.asarray(vector_b)
    return np.sqrt(np.sum(difference ** 2))
def manhattan_distance(vector_a: np.array, vector_b: np.array) -> float:
    """Return the Manhattan (L1) distance between two vectors.

    Args:
        vector_a: First vector; any array-like (NumPy array, list, tuple).
        vector_b: Second vector, same shape as ``vector_a``.

    Returns:
        The sum of absolute element-wise differences.
    """
    # Coerce to arrays so plain Python sequences work, matching
    # minkowski_distance; list - list would otherwise raise TypeError.
    difference = np.asarray(vector_a) - np.asarray(vector_b)
    return np.sum(np.abs(difference))
def minkowski_distance(vector_a: np.array, vector_b: np.array, p: float) -> float:
    """Return the Minkowski distance of order ``p`` between two vectors.

    Args:
        vector_a: First vector; any array-like.
        vector_b: Second vector, same shape as ``vector_a``.
        p: Order of the norm, expected to be positive. ``p=1`` gives the
            Manhattan distance, ``p=2`` the Euclidean distance and
            ``p=float("inf")`` the Chebyshev (maximum) distance.

    Returns:
        ``(sum(|a_i - b_i| ** p)) ** (1 / p)``, or ``max(|a_i - b_i|)`` when
        ``p`` is infinite.
    """
    # Accept plain Python sequences as well as NumPy arrays.
    abs_difference = np.abs(np.asarray(vector_a) - np.asarray(vector_b))

    if np.isinf(p):
        # The general formula degenerates to inf ** 0 == 1.0 here, so the
        # limit (largest coordinate difference) is computed directly.
        # initial=0 keeps empty input well-defined; differences are >= 0.
        return np.max(abs_difference, initial=0)

    return np.sum(abs_difference ** p) ** (1 / p)
class DistanceMeasure(Enum):
    """Namespace grouping the distance/similarity functions of this module.

    NOTE(review): plain functions are descriptors, and ``Enum`` does not turn
    descriptors into members. These names therefore resolve to the underlying
    functions (``DistanceMeasure.COSINE_SIMILARITY(a, b)`` calls
    ``cosine_similarity``) and iterating over ``DistanceMeasure`` yields
    nothing. Confirm whether real enum members were intended before relying
    on ``.name`` / ``.value`` here.
    """

    # Similarity measure: larger means more alike.
    COSINE_SIMILARITY = cosine_similarity

    # Distance measures: smaller means more alike.
    EUCLIDEAN_DISTANCE = euclidean_distance
    MANHATTAN_DISTANCE = manhattan_distance
    MINKOWSKI_DISTANCE = minkowski_distance
class VectorDatabaseOptions(Enum):
    """Storage backends that a vector database instance can be built on."""

    # Vectors are kept in an in-process Python mapping.
    DICTIONARY = "dictionary"

    # Vectors are kept in a Qdrant collection.
    QDRANT = "qdrant"
class VectorDatabase:
    """Text-keyed vector store with a dictionary backend and a Qdrant backend.

    The backend is chosen once, at construction time, through
    ``vector_db_options``:

    * ``VectorDatabaseOptions.DICTIONARY`` keeps ``{text: vector}`` in
      ``self.vectors`` and ranks results with a caller-supplied measure.
    * ``VectorDatabaseOptions.QDRANT`` keeps the vectors in an in-memory
      Qdrant collection (``collection_name``) configured for cosine distance;
      the source text is stored in each point's ``"text"`` payload field.
    """

    # Name of the named vector configured on the Qdrant collection. Every
    # upsert and every search has to use this same name, otherwise Qdrant
    # rejects the request.
    _QDRANT_VECTOR_NAME = "text"

    def __init__(
        self,
        vector_db_options: VectorDatabaseOptions,
        embedding_model: EmbeddingModel = None,
    ):
        """Create an empty store on the requested backend.

        Args:
            vector_db_options: Which backend to use.
            embedding_model: Model used to embed text; a default
                ``EmbeddingModel()`` is created when omitted.
        """
        self.vectors = None
        self.qdrant_client = None
        self.vector_db_options = vector_db_options
        self.embedding_model = embedding_model or EmbeddingModel()

        if vector_db_options == VectorDatabaseOptions.DICTIONARY:
            # A plain dict: lookups of unknown keys go through .get(), so no
            # default factory is needed.
            self.vectors = {}
        elif vector_db_options == VectorDatabaseOptions.QDRANT:
            self.qdrant_client = QdrantClient(":memory:")
            vector_params = VectorParams(
                # Read the size from the resolved model, not from the raw
                # argument, which is None when the default model is used.
                # NOTE(review): assumes EmbeddingModel exposes `dimensions`.
                size=self.embedding_model.dimensions,
                distance=Distance.COSINE,
            )
            self.qdrant_client.create_collection(
                collection_name=collection_name,
                vectors_config={self._QDRANT_VECTOR_NAME: vector_params},
            )

    def _uses_qdrant(self) -> bool:
        """Return True when this instance stores its vectors in Qdrant."""
        return self.vector_db_options == VectorDatabaseOptions.QDRANT

    def insert(self, key: str, vectors: np.array) -> None:
        """Store one vector under the text ``key``.

        Args:
            key: The text the vector was computed from.
            vectors: The embedding of ``key`` (array-like of floats).
        """
        vector = np.asarray(vectors)

        if not self._uses_qdrant():
            self.vectors[key] = vector
            return

        point = PointStruct(
            id=str(uuid.uuid4()),
            vector={self._QDRANT_VECTOR_NAME: vector.tolist()},
            payload={"text": key},
        )
        self.qdrant_client.upsert(
            collection_name=collection_name,
            points=[point],
        )

    def search(
        self,
        query_vector: np.array,
        k: int,
        distance_measure: Callable = cosine_similarity,
    ) -> List[Tuple[str, float]]:
        """Return the ``k`` best matches for ``query_vector``.

        Args:
            query_vector: Embedding to search for.
            k: Maximum number of results.
            distance_measure: Scoring function ``f(query, stored) -> float``.
                Only used by the dictionary backend, where results are sorted
                by descending score (appropriate for similarity measures such
                as the default). The Qdrant backend always uses the cosine
                distance configured on the collection.

        Returns:
            A list of ``(text, score)`` tuples, best match first.
        """
        if not self._uses_qdrant():
            scores = [
                (key, distance_measure(query_vector, vector))
                for key, vector in self.vectors.items()
            ]
            return sorted(scores, key=lambda item: item[1], reverse=True)[:k]

        search_results = self.qdrant_client.search(
            collection_name=collection_name,
            # A (name, vector) tuple selects the named vector to search.
            query_vector=(
                self._QDRANT_VECTOR_NAME,
                np.asarray(query_vector).tolist(),
            ),
            limit=k,
        )
        return [(hit.payload["text"], hit.score) for hit in search_results]

    def search_by_text(
        self,
        query_text: str,
        k: int,
        distance_measure: Callable = cosine_similarity,
        return_as_text: bool = False,
    ) -> List[Tuple[str, float]]:
        """Embed ``query_text`` and return its ``k`` best matches.

        Args:
            query_text: Text to search for.
            k: Maximum number of results.
            distance_measure: Forwarded to :meth:`search`.
            return_as_text: When True, return only the matched texts instead
                of ``(text, score)`` tuples.

        Returns:
            ``(text, score)`` tuples, or bare texts when ``return_as_text``
            is True; best match first.
        """
        query_vector = self.embedding_model.get_embedding(query_text)
        results = self.search(query_vector, k, distance_measure)
        if return_as_text:
            return [text for text, _ in results]
        return results

    def retrieve_from_key(self, key: str) -> np.array:
        """Return the vector stored for the text ``key``, or None if absent.

        Args:
            key: The exact text the vector was stored under.
        """
        if not self._uses_qdrant():
            return self.vectors.get(key, None)

        # Look the point up by an exact match on its "text" payload field.
        records, _next_offset = self.qdrant_client.scroll(
            collection_name=collection_name,
            scroll_filter=models.Filter(
                must=[
                    models.FieldCondition(
                        key="text",
                        match=models.MatchValue(value=key),
                    )
                ]
            ),
            limit=1,
            with_vectors=True,
        )
        if not records:
            return None

        stored = records[0].vector
        if isinstance(stored, dict):
            # Named vectors come back as {name: vector}.
            stored = stored[self._QDRANT_VECTOR_NAME]
        return np.array(stored)

    async def abuild_from_list(self, list_of_text: List[str]) -> "VectorDatabase":
        """Embed every text in ``list_of_text`` and store the results.

        Args:
            list_of_text: Texts to embed and index.

        Returns:
            ``self``, so the call can be chained or assigned.
        """
        if not list_of_text:
            # Nothing to embed; skip the embedding request and the upsert.
            return self

        embeddings = await self.embedding_model.async_get_embeddings(list_of_text)

        if not self._uses_qdrant():
            for text, embedding in zip(list_of_text, embeddings):
                self.insert(text, np.array(embedding))
            return self

        # Send all points in a single upsert rather than one call per text.
        points = [
            models.PointStruct(
                id=str(uuid.uuid4()),
                vector={self._QDRANT_VECTOR_NAME: np.asarray(embedding).tolist()},
                payload={"text": text},
            )
            for text, embedding in zip(list_of_text, embeddings)
        ]
        self.qdrant_client.upsert(
            collection_name=collection_name,
            points=points,
        )
        return self
if __name__ == "__main__":
    # Small end-to-end demo: embed a few sentences, then query the store.
    list_of_text = [
        "I like to eat broccoli and bananas.",
        "I ate a banana and spinach smoothie for breakfast.",
        "Chinchillas and kittens are cute.",
        "My sister adopted a kitten yesterday.",
        "Look at this cute hamster munching on a piece of broccoli.",
    ]

    # The backend option is a required constructor argument; calling
    # VectorDatabase() without it raises TypeError.
    vector_db = VectorDatabase(VectorDatabaseOptions.QDRANT)
    vector_db = asyncio.run(vector_db.abuild_from_list(list_of_text))
    k = 2

    searched_vector = vector_db.search_by_text("I think fruit is awesome!", k=k)
    print(f"Closest {k} vector(s):", searched_vector)

    retrieved_vector = vector_db.retrieve_from_key(
        "I like to eat broccoli and bananas."
    )
    print("Retrieved vector:", retrieved_vector)

    relevant_texts = vector_db.search_by_text(
        "I think fruit is awesome!", k=k, return_as_text=True
    )
    print(f"Closest {k} text(s):", relevant_texts)
|