# PythonicRAG / aimakerspace/vectordatabase.py
# jeevan — working local version (commit 4c501f4)
import asyncio
import uuid
from collections import defaultdict
from enum import Enum
from functools import partial
from typing import Callable, List, Tuple

import numpy as np
from qdrant_client import models, QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance, Batch, VectorStruct, Payload

from aimakerspace.openai_utils.embedding import EmbeddingModel
# Name of the single Qdrant collection this module creates and queries.
collection_name = "embedding_collection"
def cosine_similarity(vector_a: np.array, vector_b: np.array) -> float:
    """Computes the cosine similarity between two vectors.

    Args:
        vector_a: First vector.
        vector_b: Second vector.

    Returns:
        Cosine similarity in [-1, 1]. Returns 0.0 when either vector has
        zero norm, instead of the NaN a naive division would produce.
    """
    dot_product = np.dot(vector_a, vector_b)
    norm_a = np.linalg.norm(vector_a)
    norm_b = np.linalg.norm(vector_b)
    # Guard against zero-length vectors: the original divided unconditionally
    # and emitted NaN plus a RuntimeWarning for an all-zero input.
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot_product / (norm_a * norm_b)
def euclidean_distance(vector_a: np.array, vector_b: np.array) -> float:
    """Computes the Euclidean (L2) distance between two vectors."""
    # The L2 norm of the difference is exactly sqrt(sum of squared deltas).
    return np.linalg.norm(vector_a - vector_b)
def manhattan_distance(vector_a: np.array, vector_b: np.array) -> float:
    """Computes the Manhattan (L1) distance between two vectors."""
    delta = np.abs(vector_a - vector_b)
    return delta.sum()
def minkowski_distance(vector_a: np.array, vector_b: np.array, p: float = 2) -> float:
    """Computes the Minkowski distance between two vectors.

    Parameters:
        vector_a (np.array): First vector.
        vector_b (np.array): Second vector.
        p (float): The order of the norm. p=1 gives Manhattan distance,
            p=2 (the default) gives Euclidean distance. Making p optional is
            backward-compatible: existing callers that pass p are unaffected.

    Returns:
        float: Minkowski distance between vector_a and vector_b.
    """
    # Coerce to ndarrays so list inputs also work.
    vector_a = np.asarray(vector_a)
    vector_b = np.asarray(vector_b)
    return np.sum(np.abs(vector_a - vector_b) ** p) ** (1 / p)
class DistanceMeasure(Enum):
    """Enumerates the distance/similarity functions available for search.

    Plain functions assigned in an Enum body are treated as methods of the
    enum, not as members — so the original `DistanceMeasure.COSINE_SIMILARITY`
    was not an enum member and had no `.value`. Wrapping each function in
    `functools.partial` (a non-descriptor callable) makes it a real member.
    """
    COSINE_SIMILARITY = partial(cosine_similarity)
    EUCLIDEAN_DISTANCE = partial(euclidean_distance)
    MANHATTAN_DISTANCE = partial(manhattan_distance)
    MINKOWSKI_DISTANCE = partial(minkowski_distance)

    def __call__(self, *args, **kwargs):
        # Keep members directly callable, e.g. DistanceMeasure.COSINE_SIMILARITY(a, b),
        # matching how the bare functions could be invoked before.
        return self.value(*args, **kwargs)
class VectorDatabaseOptions(Enum):
    """Backing-store options for VectorDatabase: an in-memory dict or Qdrant."""
    DICTIONARY = "dictionary"
    QDRANT = "qdrant"
class VectorDatabase:
    """Stores text embeddings in either an in-memory dict or an in-memory Qdrant.

    In DICTIONARY mode, vectors live in ``self.vectors`` keyed by the source
    text. In QDRANT mode, vectors are upserted into a single named-vector
    collection (vector name "text") and ``self.vectors`` stays ``None``.
    """

    def __init__(
        self,
        vector_db_options: VectorDatabaseOptions,
        embedding_model: EmbeddingModel = None,
    ):
        self.vectors = None
        self.vector_db_options = vector_db_options
        self.embedding_model = embedding_model or EmbeddingModel()
        if vector_db_options == VectorDatabaseOptions.DICTIONARY:
            # Plain dict: defaultdict(np.array) was a broken factory, since
            # np.array() cannot be called with zero arguments.
            self.vectors = {}
        elif vector_db_options == VectorDatabaseOptions.QDRANT:
            self.qdrant_client = QdrantClient(":memory:")
            vector_params = VectorParams(
                # Use the resolved model: the original read
                # `embedding_model.dimensions` and crashed when the caller
                # relied on the default EmbeddingModel().
                size=self.embedding_model.dimensions,
                distance=Distance.COSINE,
            )
            self.qdrant_client.create_collection(
                collection_name=collection_name,
                vectors_config={"text": vector_params},
            )

    def insert(self, key: str, vectors: np.array) -> None:
        """Insert a single embedding under ``key`` (the source text).

        Args:
            key: The text the embedding was computed from; stored as payload.
            vectors: The embedding vector for ``key``.
        """
        if self.vector_db_options == VectorDatabaseOptions.DICTIONARY:
            self.vectors[key] = vectors
            return
        point = PointStruct(
            id=str(uuid.uuid4()),
            # Vector name must match the "text" name the collection was
            # created with; the original used "default" and the upsert failed.
            vector={"text": vectors.tolist()},
            payload={"text": key},
        )
        self.qdrant_client.upsert(
            collection_name=collection_name,
            points=[point],
        )

    def search(
        self,
        query_vector: np.array,
        k: int,
        distance_measure: Callable = cosine_similarity,
    ) -> List[Tuple[str, float]]:
        """Return the ``k`` best (text, score) pairs for ``query_vector``.

        In DICTIONARY mode, scores come from ``distance_measure`` (higher is
        better, matching cosine similarity); in QDRANT mode the collection's
        configured cosine metric is used and ``distance_measure`` is ignored.
        """
        if self.vector_db_options == VectorDatabaseOptions.DICTIONARY:
            scored = [
                (key, distance_measure(np.asarray(query_vector), np.asarray(vec)))
                for key, vec in self.vectors.items()
            ]
            return sorted(scored, key=lambda pair: pair[1], reverse=True)[:k]
        search_results = self.qdrant_client.search(
            collection_name=collection_name,
            # Named-vector query: tuple of (vector name, vector).
            query_vector=("text", query_vector),
            limit=k,
        )
        return [(result.payload["text"], result.score) for result in search_results]

    def search_by_text(
        self,
        query_text: str,
        k: int,
        distance_measure: Callable = cosine_similarity,
        return_as_text: bool = False,
    ) -> List[Tuple[str, float]]:
        """Embed ``query_text`` and search; optionally strip scores from results."""
        query_vector = self.embedding_model.get_embedding(query_text)
        results = self.search(query_vector, k, distance_measure)
        return [result[0] for result in results] if return_as_text else results

    def retrieve_from_key(self, key: str) -> np.array:
        """Return the stored vector for ``key``, or None if unavailable.

        Only DICTIONARY mode keeps a local key->vector map; in QDRANT mode
        this returns None instead of raising on the ``None`` map.
        """
        if self.vectors is None:
            return None
        return self.vectors.get(key, None)

    async def abuild_from_list(self, list_of_text: List[str]) -> "VectorDatabase":
        """Embed all texts concurrently and bulk-load them into the store."""
        embeddings = await self.embedding_model.async_get_embeddings(list_of_text)
        if self.vector_db_options == VectorDatabaseOptions.DICTIONARY:
            for text, embedding in zip(list_of_text, embeddings):
                self.vectors[text] = np.asarray(embedding)
            return self
        points = [
            models.PointStruct(
                id=str(uuid.uuid4()),
                vector={"text": embedding},
                payload={"text": text},
            )
            for text, embedding in zip(list_of_text, embeddings)
        ]
        self.qdrant_client.upsert(
            collection_name=collection_name,
            points=points,
        )
        return self
if __name__ == "__main__":
    # Smoke test: build an in-memory Qdrant store from a few sentences,
    # then run similarity searches against it.
    list_of_text = [
        "I like to eat broccoli and bananas.",
        "I ate a banana and spinach smoothie for breakfast.",
        "Chinchillas and kittens are cute.",
        "My sister adopted a kitten yesterday.",
        "Look at this cute hamster munching on a piece of broccoli.",
    ]
    # The constructor requires a backing-store option; the original call
    # passed no arguments and raised TypeError before anything ran.
    vector_db = VectorDatabase(VectorDatabaseOptions.QDRANT)
    vector_db = asyncio.run(vector_db.abuild_from_list(list_of_text))
    k = 2
    searched_vector = vector_db.search_by_text("I think fruit is awesome!", k=k)
    print(f"Closest {k} vector(s):", searched_vector)
    retrieved_vector = vector_db.retrieve_from_key(
        "I like to eat broccoli and bananas."
    )
    print("Retrieved vector:", retrieved_vector)
    relevant_texts = vector_db.search_by_text(
        "I think fruit is awesome!", k=k, return_as_text=True
    )
    print(f"Closest {k} text(s):", relevant_texts)