from utils.mongo_utils import generate_mongodb_query, get_prompt
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import JsonOutputParser
import random
from utils.utils import timing_decorator


class MongoSearch:
    """Two-stage MongoDB Atlas vector search over a car collection.

    Stage 1 runs a vector search on ``search_index`` / ``index_variable``
    (optionally pre-filtered by an LLM-derived metadata filter) to discover
    relevant ``makeModel`` values.  Stage 2 re-searches the feature
    embeddings restricted to those models, then returns a random sample of
    up to ``k`` of the resulting documents.
    """

    def __init__(self, collection, search_index, index_variable,
                 embedding_model="text-embedding-3-large"):
        """
        Args:
            collection: MongoDB collection the aggregation pipelines run on.
            search_index: name of the Atlas vector-search index for stage 1.
            index_variable: document path of the embedding field for stage 1.
            embedding_model: OpenAI embedding model name.
        """
        self.collection = collection
        self.embedding_model = OpenAIEmbeddings(model=embedding_model)
        self.llm = ChatOpenAI(model="gpt-4o-2024-08-06", temperature=0)
        self.parser = JsonOutputParser()
        self.search_index = search_index
        self.index_variable = index_variable

    def _build_query_filter(self, query):
        """Derive a MongoDB metadata filter from ``query`` via the LLM.

        Returns the (possibly empty) filter document produced by
        ``generate_mongodb_query`` from the LLM's JSON output.
        """
        result = self.llm.invoke(get_prompt(query))
        # Fix: reuse the parser created in __init__ instead of constructing
        # a fresh JsonOutputParser on every call.
        parsed = self.parser.parse(result.content)
        return generate_mongodb_query(parsed)

    @timing_decorator
    def __call__(self, query, k=4, use_filter=True):
        """Return up to ``k`` car documents relevant to ``query``.

        Args:
            query: free-text search string.
            k: number of documents to return; sampled from up to 3*k
               stage-2 candidates for variety.
            use_filter: when True, build an LLM-derived metadata pre-filter
               for the stage-1 search.

        Returns:
            list of car documents with bulky fields projected out.
        """
        query_filter = self._build_query_filter(query) if use_filter else {}
        query_vector = self.embedding_model.embed_query(query)

        # Stage 1: vector search to find which make/models are relevant.
        first_stage = {
            'index': self.search_index,
            'path': self.index_variable,
            'queryVector': query_vector,
            'numCandidates': k * 3,
            'limit': k,
        }
        if query_filter:
            # Fix: only attach 'filter' when it is non-empty — the original
            # always sent a filter key, even an empty {}, which Atlas
            # $vectorSearch does not need and may reject.
            first_stage['filter'] = query_filter
        first_pipeline = [
            {'$vectorSearch': first_stage},
            {'$project': {'makeModel': 1}},
        ]
        first_search_results = self.collection.aggregate(first_pipeline)
        # Robustness: skip documents missing 'makeModel' instead of raising
        # KeyError (projection should include it, but don't depend on that).
        make_model_list = [doc['makeModel']
                           for doc in first_search_results
                           if 'makeModel' in doc]

        # Stage 2: search the feature embeddings restricted to those models,
        # over a 3x-larger pool so the final random sample has variety.
        # (Original rebound k to 3*k then recovered it as k // 3; use a
        # separate name so the parameter keeps its meaning.)
        pool_size = k * 3
        second_pipeline = [
            {
                '$vectorSearch': {
                    'index': 'filter-vector-index',
                    'path': 'feature_embedding',
                    'filter': {'makeModel': {'$in': make_model_list}},
                    'queryVector': query_vector,
                    'numCandidates': pool_size * 3,
                    'limit': pool_size,
                }
            },
            {
                # Strip bulky text/embedding fields before returning docs.
                '$project': {
                    'description': 0,
                    'variants': 0,
                    'review_embedding': 0,
                    'feature_embedding': 0,
                }
            },
        ]
        candidates = list(self.collection.aggregate(second_pipeline))

        # Randomly pick k candidates (fewer if the pool is small); replaces
        # the original's redundant element-by-element copy of the sample.
        return random.sample(candidates, min(k, len(candidates)))