PythonicRAG / aimakerspace /vectordatabase.py
jeevan
working local version
4c501f4
raw
history blame
6.93 kB
from enum import Enum
import numpy as np
import uuid
from collections import defaultdict
from typing import List, Tuple, Callable
from aimakerspace.openai_utils.embedding import EmbeddingModel
import asyncio
from qdrant_client import models, QdrantClient
from qdrant_client.models import PointStruct,VectorParams,Distance,Batch,VectorStruct,Payload
# Name of the Qdrant collection that VectorDatabase creates, upserts into and searches.
collection_name = "embedding_collection"
def cosine_similarity(vector_a: np.array, vector_b: np.array) -> float:
    """Compute the cosine similarity between two vectors.

    Args:
        vector_a: First vector.
        vector_b: Second vector, same length as ``vector_a``.

    Returns:
        The dot product of the vectors divided by the product of their
        Euclidean norms. When either vector has zero norm the similarity is
        undefined, and ``0.0`` is returned instead of dividing by zero.
    """
    norm_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    if norm_product == 0:
        # An all-zero vector would otherwise yield 0/0 -> nan (and a
        # RuntimeWarning), which poisons any later sorting by score.
        return 0.0
    return np.dot(vector_a, vector_b) / norm_product
def euclidean_distance(vector_a: np.array, vector_b: np.array) -> float:
    """Compute the Euclidean (L2) distance between two vectors.

    Args:
        vector_a: First vector (NumPy array or any array-like sequence).
        vector_b: Second vector, same length as ``vector_a``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    # Coerce to arrays, as minkowski_distance does, so plain lists/tuples
    # (e.g. embeddings returned as Python lists) are accepted too.
    vector_a = np.asarray(vector_a)
    vector_b = np.asarray(vector_b)
    return np.sqrt(np.sum((vector_a - vector_b) ** 2))
def manhattan_distance(vector_a: np.array, vector_b: np.array) -> float:
    """Compute the Manhattan (L1) distance between two vectors.

    Args:
        vector_a: First vector (NumPy array or any array-like sequence).
        vector_b: Second vector, same length as ``vector_a``.

    Returns:
        The sum of absolute element-wise differences.
    """
    # Coerce to arrays, as minkowski_distance does, so plain lists/tuples
    # are accepted as well as NumPy arrays.
    vector_a = np.asarray(vector_a)
    vector_b = np.asarray(vector_b)
    return np.sum(np.abs(vector_a - vector_b))
def minkowski_distance(vector_a: np.array, vector_b: np.array, p: float) -> float:
    """Compute the Minkowski distance of order ``p`` between two vectors.

    Parameters:
        vector_a (np.array): First vector (any array-like is accepted).
        vector_b (np.array): Second vector, same length as ``vector_a``.
        p (float): The order of the norm. ``p=1`` gives the Manhattan
            distance, ``p=2`` the Euclidean distance, and positive infinity
            the Chebyshev distance (largest absolute difference). ``p`` is
            not validated; it is expected to be positive.

    Returns:
        float: Minkowski distance between ``vector_a`` and ``vector_b``.
    """
    # Accept lists/tuples as well as NumPy arrays.
    abs_difference = np.abs(np.asarray(vector_a) - np.asarray(vector_b))

    if p == float("inf"):
        # The general formula degenerates for p = inf (x ** inf, then ** 0),
        # so use the limit of the p-norm directly. ``initial=0`` keeps the
        # result at 0 for empty vectors, matching the finite-p behaviour.
        return np.max(abs_difference, initial=0)

    return np.sum(abs_difference ** p) ** (1 / p)
class DistanceMeasure(Enum):
    """Closed set of the vector comparison functions defined in this module.

    Each member's ``value`` is the underlying function, and the member itself
    is callable, so ``DistanceMeasure.COSINE_SIMILARITY(a, b)`` is equivalent
    to ``cosine_similarity(a, b)`` and a member can be passed wherever a
    ``distance_measure`` callable is expected.
    """

    def __new__(cls, function):
        # A bare function assigned in an Enum body is a descriptor, so Enum
        # treats it as a method and creates no member. Declaring each member
        # as a 1-tuple and unpacking it here registers real members whose
        # value is the function.
        member = object.__new__(cls)
        member._value_ = function
        return member

    def __call__(self, *args, **kwargs):
        """Forward the call to the wrapped distance/similarity function."""
        return self.value(*args, **kwargs)

    COSINE_SIMILARITY = (cosine_similarity,)
    EUCLIDEAN_DISTANCE = (euclidean_distance,)
    MANHATTAN_DISTANCE = (manhattan_distance,)
    MINKOWSKI_DISTANCE = (minkowski_distance,)
class VectorDatabaseOptions(Enum):
    """Storage backends that a ``VectorDatabase`` can be created with."""

    # Backend selected by the value "dictionary".
    DICTIONARY = "dictionary"
    # Backend selected by the value "qdrant".
    QDRANT = "qdrant"
class VectorDatabase:
    """Store text embeddings and run nearest-neighbour search over them.

    The backend is chosen with ``vector_db_options``:

    * ``VectorDatabaseOptions.DICTIONARY`` keeps ``{text: vector}`` in a
      plain dict and scores candidates in Python with ``distance_measure``.
    * ``VectorDatabaseOptions.QDRANT`` keeps points in an in-memory Qdrant
      collection (``collection_name``) configured for cosine distance; the
      source text is stored in each point's ``"text"`` payload field.
    """

    # Name of the named vector configured on the Qdrant collection. Every
    # upsert, search and read must use this same name.
    _VECTOR_NAME = "text"

    def __init__(
        self,
        vector_db_options: VectorDatabaseOptions,
        embedding_model: EmbeddingModel = None,
    ):
        """Create an empty database for the requested backend.

        Args:
            vector_db_options: Backend to use, as a ``VectorDatabaseOptions``
                member or its raw string value.
            embedding_model: Model used to embed text; a default
                ``EmbeddingModel()`` is created when omitted.

        Raises:
            ValueError: If ``vector_db_options`` is not a known backend.
        """
        self.vectors = None
        # Enum lookup accepts a member or its value and rejects anything else,
        # so an unsupported backend fails here rather than on first use.
        self.vector_db_options = VectorDatabaseOptions(vector_db_options)
        self.embedding_model = embedding_model or EmbeddingModel()

        if self.vector_db_options == VectorDatabaseOptions.DICTIONARY:
            self.vectors = {}
        else:
            self.qdrant_client = QdrantClient(":memory:")
            vector_params = VectorParams(
                # Read from self.embedding_model: the ``embedding_model``
                # argument is None when the default model is used.
                size=self.embedding_model.dimensions,
                distance=Distance.COSINE,
            )
            self.qdrant_client.create_collection(
                collection_name=collection_name,
                vectors_config={self._VECTOR_NAME: vector_params},
            )

    def insert(self, key: str, vectors: np.array) -> None:
        """Store one embedding under its source text.

        Args:
            key: The text the embedding was computed from.
            vectors: The embedding vector for ``key``.
        """
        if self.vector_db_options == VectorDatabaseOptions.DICTIONARY:
            self.vectors[key] = np.asarray(vectors)
            return

        point = PointStruct(
            id=str(uuid.uuid4()),
            # Must match the vector name the collection was created with.
            vector={self._VECTOR_NAME: np.asarray(vectors).tolist()},
            payload={"text": key},
        )
        self.qdrant_client.upsert(
            collection_name=collection_name,
            points=[point],
        )

    def search(
        self,
        query_vector: np.array,
        k: int,
        distance_measure: Callable = cosine_similarity,
    ) -> List[Tuple[str, float]]:
        """Return the ``k`` best-scoring stored texts for ``query_vector``.

        Args:
            query_vector: Embedding to compare against the stored vectors.
            k: Maximum number of results.
            distance_measure: Scoring function ``f(query, stored)``. Only used
                by the dictionary backend, where results are ordered by
                descending score (so larger must mean closer, as with cosine
                similarity). The Qdrant backend ignores it and uses the cosine
                distance configured on the collection.

        Returns:
            A list of ``(text, score)`` tuples, best match first.
        """
        if self.vector_db_options == VectorDatabaseOptions.DICTIONARY:
            scored = [
                (key, distance_measure(query_vector, vector))
                for key, vector in self.vectors.items()
            ]
            scored.sort(key=lambda pair: pair[1], reverse=True)
            return scored[:k]

        # NOTE(review): newer qdrant-client releases favour ``query_points``
        # over ``search``; kept as-is for the installed version - confirm.
        search_results = self.qdrant_client.search(
            collection_name=collection_name,
            query_vector=(self._VECTOR_NAME, np.asarray(query_vector).tolist()),
            limit=k,
        )
        return [(result.payload["text"], result.score) for result in search_results]

    def search_by_text(
        self,
        query_text: str,
        k: int,
        distance_measure: Callable = cosine_similarity,
        return_as_text: bool = False,
    ) -> List[Tuple[str, float]]:
        """Embed ``query_text`` and search with the resulting vector.

        Args:
            query_text: Text to embed and search for.
            k: Maximum number of results.
            distance_measure: Passed through to ``search``.
            return_as_text: When true, return only the matched texts instead
                of ``(text, score)`` tuples.

        Returns:
            ``(text, score)`` tuples, or a list of texts when
            ``return_as_text`` is true.
        """
        query_vector = self.embedding_model.get_embedding(query_text)
        results = self.search(query_vector, k, distance_measure)
        return [result[0] for result in results] if return_as_text else results

    def retrieve_from_key(self, key: str) -> np.array:
        """Return the stored vector for ``key``, or ``None`` if it is absent.

        Args:
            key: The exact text a vector was stored under.
        """
        if self.vector_db_options == VectorDatabaseOptions.DICTIONARY:
            return self.vectors.get(key, None)

        # Qdrant points are keyed by random UUIDs, so look the text up through
        # its payload field and ask for the vector to be returned with it.
        records, _next_offset = self.qdrant_client.scroll(
            collection_name=collection_name,
            scroll_filter=models.Filter(
                must=[
                    models.FieldCondition(
                        key="text",
                        match=models.MatchValue(value=key),
                    )
                ]
            ),
            limit=1,
            with_payload=False,
            with_vectors=True,
        )
        if not records:
            return None
        return np.array(records[0].vector[self._VECTOR_NAME])

    async def abuild_from_list(self, list_of_text: List[str]) -> "VectorDatabase":
        """Embed every text in ``list_of_text`` and store the results.

        Args:
            list_of_text: Texts to embed and index.

        Returns:
            This database, so the call can be chained or re-assigned.
        """
        embeddings = await self.embedding_model.async_get_embeddings(list_of_text)

        if self.vector_db_options == VectorDatabaseOptions.DICTIONARY:
            for text, embedding in zip(list_of_text, embeddings):
                self.vectors[text] = np.asarray(embedding)
            return self

        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector={self._VECTOR_NAME: embedding},
                payload={"text": text},
            )
            for text, embedding in zip(list_of_text, embeddings)
        ]
        if points:
            # One batched upsert; nothing is sent for an empty input list.
            self.qdrant_client.upsert(
                collection_name=collection_name,
                points=points,
            )
        return self
if __name__ == "__main__":
    # Small end-to-end demo: index a few sentences, then query them.
    list_of_text = [
        "I like to eat broccoli and bananas.",
        "I ate a banana and spinach smoothie for breakfast.",
        "Chinchillas and kittens are cute.",
        "My sister adopted a kitten yesterday.",
        "Look at this cute hamster munching on a piece of broccoli.",
    ]

    # VectorDatabase requires a backend choice. The embedding model is passed
    # explicitly so the Qdrant collection can be sized from it.
    vector_db = VectorDatabase(VectorDatabaseOptions.QDRANT, EmbeddingModel())
    vector_db = asyncio.run(vector_db.abuild_from_list(list_of_text))
    k = 2

    searched_vector = vector_db.search_by_text("I think fruit is awesome!", k=k)
    print(f"Closest {k} vector(s):", searched_vector)

    retrieved_vector = vector_db.retrieve_from_key(
        "I like to eat broccoli and bananas."
    )
    print("Retrieved vector:", retrieved_vector)

    relevant_texts = vector_db.search_by_text(
        "I think fruit is awesome!", k=k, return_as_text=True
    )
    print(f"Closest {k} text(s):", relevant_texts)