otoz-smart-search / src /mongo_search.py
teenaxta's picture
Update src/mongo_search.py
8fbac75 verified
from utils.mongo_utils import generate_mongodb_query, get_prompt
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import JsonOutputParser
import random
from utils.utils import timing_decorator
class MongoSearch:
    """Two-stage vector search against a single MongoDB collection.

    Calling an instance runs two ``$vectorSearch`` aggregations:

    1. A search on ``search_index`` / ``index_variable`` (optionally narrowed
       by a filter derived from the query by an LLM) that is used only to
       collect the ``makeModel`` values of the best hits.
    2. A search on ``feature_search_index`` / ``feature_index_variable``
       restricted to those ``makeModel`` values, which over-fetches and then
       returns a random sample of the hits.
    """

    # The second stage fetches this many times ``k`` documents before the
    # random down-sampling back to ``k``.
    _OVERFETCH_FACTOR = 3
    # ``numCandidates`` is this many times the ``limit`` in both stages.
    _CANDIDATE_FACTOR = 3

    def __init__(
        self,
        collection,
        search_index,
        index_variable,
        embedding_model="text-embedding-3-large",
        feature_search_index="filter-vector-index",
        feature_index_variable="feature_embedding",
    ):
        """Store the collection handle and build the OpenAI clients.

        Args:
            collection: Object exposing ``aggregate(pipeline)``; presumably a
                pymongo ``Collection`` -- confirm against the caller.
            search_index: Name of the vector index used by the first stage.
            index_variable: Document path of the embedding searched by the
                first stage.
            embedding_model: OpenAI embedding model used to embed the query.
            feature_search_index: Name of the vector index used by the second
                stage. Defaults to the previously hard-coded value.
            feature_index_variable: Document path of the embedding searched by
                the second stage. Defaults to the previously hard-coded value.
        """
        self.collection = collection
        self.embedding_model = OpenAIEmbeddings(model=embedding_model)
        self.llm = ChatOpenAI(model="gpt-4o-2024-08-06", temperature=0)
        self.parser = JsonOutputParser()
        self.search_index = search_index
        self.index_variable = index_variable
        self.feature_search_index = feature_search_index
        self.feature_index_variable = feature_index_variable

    @timing_decorator
    def __call__(self, query: str, k: int = 4, use_filter: bool = True) -> list:
        """Return up to ``k`` documents relevant to ``query``.

        Args:
            query: Free-text user query. It is embedded for the vector search
                and, when ``use_filter`` is true, also sent to the LLM.
            k: Maximum number of documents to return; must be at least 1.
            use_filter: When true, the LLM response to ``get_prompt(query)``
                is parsed as JSON and passed to ``generate_mongodb_query`` to
                build the first-stage ``filter``.

        Returns:
            A list of at most ``k`` documents drawn at random from the
            second-stage hits, so the order does not reflect search ranking.
            The ``description``, ``variants``, ``review_embedding`` and
            feature-embedding fields are excluded. Empty when the first stage
            finds no ``makeModel``.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        # Fail before spending an LLM call and an embedding call on a request
        # that cannot produce a usable ``limit``.
        if k < 1:
            raise ValueError("k must be a positive integer")

        query_filter = {}
        if use_filter:
            llm_reply = self.llm.invoke(get_prompt(query))
            # Reuse the parser built in __init__ instead of creating a new one
            # on every call.
            parsed_reply = self.parser.parse(llm_reply.content)
            query_filter = generate_mongodb_query(parsed_reply)

        query_vector = self.embedding_model.embed_query(query)

        # Stage 1: find the best-matching documents and keep only makeModel.
        first_pipeline = [
            {
                '$vectorSearch': {
                    'index': self.search_index,
                    'path': self.index_variable,
                    # Normalise a falsy filter (None, empty) to an empty dict.
                    'filter': query_filter or {},
                    'queryVector': query_vector,
                    'numCandidates': k * self._CANDIDATE_FACTOR,
                    'limit': k,
                }
            },
            {
                '$project': {
                    'makeModel': 1,
                }
            },
        ]

        # Collect distinct makeModel values in hit order. Hits lacking the
        # field are skipped rather than raising KeyError; membership is tested
        # on the list so unhashable values are tolerated.
        make_model_list = []
        for doc in self.collection.aggregate(first_pipeline):
            if 'makeModel' not in doc:
                continue
            make_model = doc['makeModel']
            if make_model not in make_model_list:
                make_model_list.append(make_model)

        # An empty ``$in`` cannot match any document, so skip the second
        # round trip entirely.
        if not make_model_list:
            return []

        # Stage 2: over-fetch within the selected makeModels.
        fetch_limit = k * self._OVERFETCH_FACTOR
        excluded_fields = {
            'description': 0,
            'variants': 0,
            'review_embedding': 0,
            'feature_embedding': 0,
        }
        # Keep the searched embedding out of the payload even when a
        # non-default path is configured.
        excluded_fields[self.feature_index_variable] = 0

        second_pipeline = [
            {
                '$vectorSearch': {
                    'index': self.feature_search_index,
                    'path': self.feature_index_variable,
                    'filter': {
                        'makeModel': {'$in': make_model_list},
                    },
                    'queryVector': query_vector,
                    'numCandidates': fetch_limit * self._CANDIDATE_FACTOR,
                    'limit': fetch_limit,
                }
            },
            {
                '$project': excluded_fields,
            },
        ]

        candidates = list(self.collection.aggregate(second_pipeline))

        # Down-sample at random to the caller's k (fewer if not enough hits).
        return random.sample(candidates, min(k, len(candidates)))