from enum import Enum
import numpy as np
import uuid
from collections import defaultdict
from typing import List, Tuple, Callable
from aimakerspace.openai_utils.embedding import EmbeddingModel
import asyncio
from qdrant_client import models, QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance, Batch, VectorStruct, Payload

# Name of the single in-memory Qdrant collection this module writes to.
collection_name = "embedding_collection"

# Named-vector key used when creating the Qdrant collection. Every upsert and
# every search must address the vector by this same name, or Qdrant rejects
# the request.
VECTOR_NAME = "text"


def cosine_similarity(vector_a: np.ndarray, vector_b: np.ndarray) -> float:
    """Computes the cosine similarity between two vectors."""
    dot_product = np.dot(vector_a, vector_b)
    norm_a = np.linalg.norm(vector_a)
    norm_b = np.linalg.norm(vector_b)
    return dot_product / (norm_a * norm_b)


def euclidean_distance(vector_a: np.ndarray, vector_b: np.ndarray) -> float:
    """Computes the Euclidean distance between two vectors."""
    return np.sqrt(np.sum((vector_a - vector_b) ** 2))


def manhattan_distance(vector_a: np.ndarray, vector_b: np.ndarray) -> float:
    """Computes the Manhattan distance between two vectors."""
    return np.sum(np.abs(vector_a - vector_b))


def minkowski_distance(vector_a: np.ndarray, vector_b: np.ndarray, p: float) -> float:
    """
    Computes the Minkowski distance between two vectors.

    Parameters:
        vector_a (np.ndarray): First vector.
        vector_b (np.ndarray): Second vector.
        p (float): The order of the norm. For example, p=1 gives Manhattan
            distance, p=2 gives Euclidean distance.

    Returns:
        float: Minkowski distance between vector_a and vector_b.
    """
    # Ensure the input vectors are NumPy arrays so the arithmetic below
    # broadcasts instead of failing on plain lists.
    vector_a = np.asarray(vector_a)
    vector_b = np.asarray(vector_b)
    return np.sum(np.abs(vector_a - vector_b) ** p) ** (1 / p)


class DistanceMeasure(Enum):
    # NOTE(review): plain functions assigned in an Enum body become methods,
    # not members — list(DistanceMeasure) is empty. If these are meant to be
    # selectable enum members, each value should be wrapped in
    # functools.partial. Left unchanged here because no in-file caller pins
    # down the intended usage; confirm before changing.
    COSINE_SIMILARITY = cosine_similarity
    EUCLIDEAN_DISTANCE = euclidean_distance
    MANHATTAN_DISTANCE = manhattan_distance
    MINKOWSKI_DISTANCE = minkowski_distance


class VectorDatabaseOptions(Enum):
    """Backend selector: a plain in-process dict or an in-memory Qdrant."""
    DICTIONARY = "dictionary"
    QDRANT = "qdrant"


class VectorDatabase:
    """Tiny text/embedding store with a dict backend or a Qdrant backend.

    In DICTIONARY mode, vectors live in ``self.vectors`` keyed by their source
    text. In QDRANT mode, points are written to an in-memory Qdrant collection
    and ``self.vectors`` stays ``None``.
    """

    def __init__(
        self,
        vector_db_options: VectorDatabaseOptions,
        embedding_model: EmbeddingModel = None,
    ):
        self.vectors = None
        self.vector_db_options = vector_db_options
        self.embedding_model = embedding_model or EmbeddingModel()

        if vector_db_options == VectorDatabaseOptions.DICTIONARY:
            # Plain dict, not defaultdict(np.array): np.array() with no
            # arguments raises TypeError, so the original factory could never
            # produce a default value anyway.
            self.vectors = {}

        if vector_db_options == VectorDatabaseOptions.QDRANT:
            self.qdrant_client = QdrantClient(":memory:")
            # Read dimensions off self.embedding_model (never the raw
            # parameter, which may be None when the default is used).
            # NOTE(review): assumes EmbeddingModel exposes a `dimensions`
            # attribute — confirm against aimakerspace.
            vector_params = VectorParams(
                size=self.embedding_model.dimensions,  # vector size
                distance=Distance.COSINE,
            )
            self.qdrant_client.create_collection(
                collection_name=collection_name,
                vectors_config={VECTOR_NAME: vector_params},
            )

    def insert(self, key: str, vectors: np.ndarray) -> None:
        """Insert one (text, embedding) pair into the active backend.

        Parameters:
            key (str): The source text; used as payload/lookup key.
            vectors (np.ndarray): The embedding for ``key``.
        """
        if self.vector_db_options == VectorDatabaseOptions.DICTIONARY:
            self.vectors[key] = vectors
            return

        point = PointStruct(
            id=str(uuid.uuid4()),
            # Must address the named vector the collection was created with;
            # the previous "default" key did not match "text" and Qdrant
            # rejects points whose vector names are unknown.
            vector={VECTOR_NAME: vectors.tolist()},
            payload={"text": key},
        )
        self.qdrant_client.upsert(
            collection_name=collection_name,
            points=[point],
        )

    def search(
        self,
        query_vector: np.ndarray,
        k: int,
        distance_measure: Callable = cosine_similarity,
    ) -> List[Tuple[str, float]]:
        """Return the top-``k`` (text, score) pairs for ``query_vector``.

        ``distance_measure`` is only honoured in DICTIONARY mode; the Qdrant
        backend always scores with the collection's configured metric
        (cosine).
        """
        if self.vector_db_options == VectorDatabaseOptions.DICTIONARY:
            scores = [
                (text, distance_measure(query_vector, vector))
                for text, vector in self.vectors.items()
            ]
            # Higher score = more similar (cosine-style measures). For pure
            # distance measures callers would want ascending order instead.
            return sorted(scores, key=lambda item: item[1], reverse=True)[:k]

        search_results = self.qdrant_client.search(
            collection_name=collection_name,
            # Tuple form selects the named vector to search against.
            query_vector=(VECTOR_NAME, query_vector),
            limit=k,
        )
        return [(result.payload["text"], result.score) for result in search_results]

    def search_by_text(
        self,
        query_text: str,
        k: int,
        distance_measure: Callable = cosine_similarity,
        return_as_text: bool = False,
    ) -> List[Tuple[str, float]]:
        """Embed ``query_text`` and search; optionally strip scores."""
        query_vector = self.embedding_model.get_embedding(query_text)
        results = self.search(query_vector, k, distance_measure)
        return [result[0] for result in results] if return_as_text else results

    def retrieve_from_key(self, key: str) -> np.ndarray:
        """Return the stored vector for ``key`` (DICTIONARY mode only).

        Returns None when the key is absent or when running on the Qdrant
        backend, where no dict store exists.
        """
        if self.vectors is None:
            return None
        return self.vectors.get(key, None)

    async def abuild_from_list(self, list_of_text: List[str]) -> "VectorDatabase":
        """Embed every text and bulk-load the active backend; returns self."""
        embeddings = await self.embedding_model.async_get_embeddings(list_of_text)

        if self.vector_db_options == VectorDatabaseOptions.DICTIONARY:
            for text, embedding in zip(list_of_text, embeddings):
                self.insert(text, np.array(embedding))
            return self

        points = [
            models.PointStruct(
                id=str(uuid.uuid4()),
                vector={VECTOR_NAME: embedding},
                payload={"text": text},
            )
            for text, embedding in zip(list_of_text, embeddings)
        ]
        self.qdrant_client.upsert(
            collection_name=collection_name,
            points=points,
        )
        return self


if __name__ == "__main__":
    list_of_text = [
        "I like to eat broccoli and bananas.",
        "I ate a banana and spinach smoothie for breakfast.",
        "Chinchillas and kittens are cute.",
        "My sister adopted a kitten yesterday.",
        "Look at this cute hamster munching on a piece of broccoli.",
    ]

    # DICTIONARY mode exercises the full demo, including retrieve_from_key
    # (which has no Qdrant equivalent). The original VectorDatabase() call
    # omitted the required vector_db_options argument and raised TypeError.
    vector_db = VectorDatabase(VectorDatabaseOptions.DICTIONARY)
    vector_db = asyncio.run(vector_db.abuild_from_list(list_of_text))
    k = 2

    searched_vector = vector_db.search_by_text("I think fruit is awesome!", k=k)
    print(f"Closest {k} vector(s):", searched_vector)

    retrieved_vector = vector_db.retrieve_from_key(
        "I like to eat broccoli and bananas."
    )
    print("Retrieved vector:", retrieved_vector)

    relevant_texts = vector_db.search_by_text(
        "I think fruit is awesome!", k=k, return_as_text=True
    )
    print(f"Closest {k} text(s):", relevant_texts)