import os
from langchain_openai import OpenAIEmbeddings
from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore
from qdrant_client.http import models
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

from dotenv import load_dotenv

# Load environment variables
load_dotenv('.env')
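# The Qdrant client below reads QDRANT_URL and QDRANT_API_KEY from .env;
# OpenAIEmbeddings and ChatGroq additionally expect OPENAI_API_KEY and
# GROQ_API_KEY to be set.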

class Retriever:
    def __init__(self):
        # Initialize Qdrant client
        qdrant_client = QdrantClient(
            url=os.getenv("QDRANT_URL"),
            api_key=os.getenv("QDRANT_API_KEY")
        )
        # Vector store over the main tax-document collection
        self.vector_store = QdrantVectorStore(
            client=qdrant_client,
            collection_name="siel-ai-assignment",
            embedding=OpenAIEmbeddings(),
        )
        # Vector store over per-user documents (filtered by metadata.ID in id_filter)
        self.vector_store_user = QdrantVectorStore(
            client=qdrant_client,
            collection_name="siel-ai-user",
            embedding=OpenAIEmbeddings(),
        )
        # Metadata values used to filter retrieval by document category
        # (matched against metadata.DOCUMENT_IS_ABOUT)
        self.filters = ['Taxation-Goods-and-service-Tax',
                        'Taxation-INCOME-TAX-LAW',
                        'Direct Tax Laws and International Taxation',
                        'Indirect Tax Laws',
                        'INDIAN Income Tax ACTS',
                        'ONLINESITES']
        self.groq = ChatGroq(model='llama3-70b-8192')

    def multi_questions(self, user_prompt):
        llm = self.groq
        prompt = '''
# You are an excellent Query Decomposer for database retrieval optimization.
# You are given a user_query.
===============================
# TASK:
    -> Your task is to provide a structured and hierarchical breakdown of the user query.
    -> This breakdown should be in the form of an ordered sequence that helps in extracting the right context from the database.
    -> Build the user query from the bottom level (basic requirements) to the top level (more specific details), ensuring the retrieval context improves at each level.
===============================
# USER_QUERY: {user}
===============================
# EXAMPLE:
    1. #USER_QUERY: "For 5 lakh, what type of taxes should I pay and how much?"
       -> #EXPECTED OUTPUT: | I'm purchasing a car for 5 lakh. | What type of taxes should I pay on the purchase of automobiles? | What type of taxes should I pay on the purchase of a car for 5 lakh? |
    
    2. #USER_QUERY: "For 5 lakh, what type of taxes should I pay and how much?"
       -> #EXPECTED OUTPUT: | NEW TAX REGIME and Income tax. | My income is 5 lakh. What type of taxes should I pay and how much should I pay? |

===============================
# OUTPUT FORMAT:
    -> Provide the formatted output separated with the pipe '|' enclosed as: |...|...|
    -> Stick to the given format without any additional explanation. Your only response must be the formatted sequence of queries.
    -> Do not answer the user question directly. Your job is to provide the decomposed queries in the format shown in the examples.
'''

        rag_prompt = PromptTemplate.from_template(prompt)
        chain = rag_prompt | llm | StrOutputParser()
        result = chain.invoke({"user": user_prompt})
        return result
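
    def split_questions(self, user_prompt):
        # Hypothetical convenience helper (not in the original code): parses
        # the pipe-delimited string returned by multi_questions into a clean
        # list of sub-queries, dropping empty segments.
        raw = self.multi_questions(user_prompt)
        return [q.strip() for q in raw.split('|') if q.strip()]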
    
    def multiple_contexts(self, user_prompt):
        # Runs one filtered retrieval per document category and
        # concatenates the results.
        contexts = []
        for doc_filter in self.filters:
            contexts += self.filter_multiple(user_prompt, doc_filter, 18)
        return contexts
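
    def dedupe_contexts(self, docs):
        # Hypothetical helper (an assumption, not in the original code): the
        # per-category passes in multiple_contexts can return overlapping
        # chunks, so this drops duplicates by exact page_content match.
        seen = set()
        unique = []
        for doc in docs:
            if doc.page_content not in seen:
                seen.add(doc.page_content)
                unique.append(doc)
        return unique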
    
    def filter_multiple(self, query, mapper, k1=10):
        # Similarity search with a score cut-off, restricted to documents
        # whose metadata.DOCUMENT_IS_ABOUT matches the given category.
        retriever1 = self.vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={
                "k": k1,
                "score_threshold": 0.75,
                "filter": models.Filter(must=[
                    models.FieldCondition(
                        key="metadata.DOCUMENT_IS_ABOUT",
                        match=models.MatchValue(value=mapper),
                    )
                ]),
            },
        )
        return retriever1.invoke(query)
    
    def filter(self, query, k1=10, k2=17):
        # Two MMR passes over the main store: k1 results from ONLINESITES
        # documents, k2 from every other category.
        online_condition = models.FieldCondition(
            key="metadata.DOCUMENT_IS_ABOUT",
            match=models.MatchValue(value=self.filters[-1]),
        )
        retriever1 = self.vector_store.as_retriever(
            search_type="mmr",
            search_kwargs={"k": k1, "filter": models.Filter(must=[online_condition])},
        )
        retriever2 = self.vector_store.as_retriever(
            search_type="mmr",
            search_kwargs={"k": k2, "filter": models.Filter(must_not=[online_condition])},
        )
        return retriever1.invoke(query) + retriever2.invoke(query)

    def id_filter(self, query, user_id):
        # Searches the per-user store, restricted to chunks whose
        # metadata.ID matches the given user ID.
        retriever1 = self.vector_store_user.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={
                "k": 10,
                "score_threshold": 0.7,
                "filter": models.Filter(must=[
                    models.FieldCondition(
                        key="metadata.ID",
                        match=models.MatchValue(value=user_id),
                    )
                ]),
            },
        )
        return retriever1.invoke(query)

    def data_retrieve(self, query=''):
        # Plain top-k similarity search over the main store; scores are discarded.
        retrieved_docs = self.vector_store.similarity_search_with_score(query, k=10)
        return [doc for doc, _ in retrieved_docs]

# ret = Retriever()
# print(ret.multiple_contexts("I'm purchasing a car for 5 lakh, what type of taxes should I pay and how much?"))
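
# Minimal usage sketch (assumes the .env keys above are set and that the
# "siel-ai-assignment" and "siel-ai-user" collections already exist in Qdrant):
#
# retriever = Retriever()
# sub_queries = retriever.split_questions(
#     "I'm purchasing a car for 5 lakh, what type of taxes should I pay?")
# docs = retriever.filter("GST rate on the purchase of a passenger car")
# print(sub_queries)
# print(len(docs))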