File size: 3,431 Bytes
7baafc3
 
 
8fbac75
7baafc3
 
 
 
 
 
 
 
 
d19a3a8
7baafc3
 
 
 
 
8fbac75
 
 
7baafc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8fbac75
 
7baafc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8fbac75
 
 
 
 
 
 
 
 
 
7baafc3
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from utils.mongo_utils import generate_mongodb_query, get_prompt
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import JsonOutputParser
import random

from utils.utils import timing_decorator

class MongoSearch:
    """Two-stage MongoDB Atlas vector search over a car collection.

    Stage 1 runs a vector search on ``search_index`` / ``index_variable``
    (optionally narrowed by an LLM-generated metadata filter) to collect
    candidate ``makeModel`` values. Stage 2 runs a second vector search on
    the feature-embedding index restricted to those models, then returns a
    random sample of up to ``k`` documents.
    """

    def __init__(self, collection, search_index, index_variable, embedding_model="text-embedding-3-large"):
        # collection: pymongo collection both aggregation pipelines run against.
        self.collection = collection
        self.embedding_model = OpenAIEmbeddings(model=embedding_model)
        self.llm = ChatOpenAI(model="gpt-4o-2024-08-06", temperature=0)
        self.parser = JsonOutputParser()
        self.search_index = search_index
        self.index_variable = index_variable

    @timing_decorator
    def __call__(self, query, k=4, use_filter=True):
        """Run the two-stage search for ``query``.

        Args:
            query: Natural-language search string.
            k: Number of documents ultimately returned (random sample size).
            use_filter: When True, ask the LLM to derive a metadata filter
                from the query for the first vector search.

        Returns:
            A list of up to ``k`` randomly sampled result documents.
        """
        query_filter = {}
        if use_filter:
            result = self.llm.invoke(get_prompt(query))
            # Fix: reuse the parser created in __init__ instead of
            # constructing a new JsonOutputParser on every call.
            parsed = self.parser.parse(result.content)
            query_filter = generate_mongodb_query(parsed)

        query_vector = self.embedding_model.embed_query(query)

        # Stage 1: vector search on the primary index to collect candidate
        # makeModel values. numCandidates oversamples 3x relative to limit.
        first_pipeline = [
            {
                '$vectorSearch': {
                    'index': self.search_index,
                    'path': self.index_variable,
                    'filter': query_filter,  # {} when no filter was built
                    'queryVector': query_vector,
                    'numCandidates': k * 3,
                    'limit': k,
                }
            },
            {
                '$project': {
                    'makeModel': 1,
                }
            },
        ]
        first_search_results = list(self.collection.aggregate(first_pipeline))

        # NOTE(review): assumes every stage-1 document carries 'makeModel'
        # (it is the projected field) — a missing field raises KeyError.
        make_model_list = [doc['makeModel'] for doc in first_search_results]

        # Stage 2: search the feature-embedding index restricted to the
        # models found above. Fetch 3x the documents we will return so the
        # final random sample has variety (same numbers as before: limit
        # 3k, numCandidates 9k).
        expanded_k = k * 3
        second_pipeline = [
            {
                '$vectorSearch': {
                    'index': 'filter-vector-index',   # features search index
                    'path': 'feature_embedding',      # feature embeddings
                    'filter': {
                        'makeModel': {'$in': make_model_list}
                    },
                    'queryVector': query_vector,
                    'numCandidates': expanded_k * 3,
                    'limit': expanded_k,
                }
            },
            {
                # Strip bulky fields before returning documents.
                '$project': {
                    'description': 0,
                    'variants': 0,
                    'review_embedding': 0,
                    'feature_embedding': 0,
                }
            },
        ]
        result_list = list(self.collection.aggregate(second_pipeline))

        # Sample up to k documents (fewer if the search returned fewer).
        # Fix: the original ended with a redundant element-by-element copy
        # of this list into a new list; return the sample directly.
        return random.sample(result_list, min(k, len(result_list)))