sunil448832 commited on
Commit
eccde2c
Β·
1 Parent(s): d14c166

Initial Commit

Browse files
app.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from models import EmbeddingModel, LLM
3
+ from utils import MistralPrompts
4
+ from vector_store import FaissVectorStore
5
+ from chat import ChatBot
6
+
7
VECTOR_DATABASE_PATH = 'vector_db'

# Initialize models and vector store
embedding_model = EmbeddingModel(model_name='sentence-transformers/all-MiniLM-L6-v2')
llm = LLM("mistralai/Mistral-7B-Instruct-v0.1")
vector_store = FaissVectorStore.as_retriever(database_path=VECTOR_DATABASE_PATH)

# Create a ChatBot instance.
# It is deliberately NOT named ``chat_bot``: the handler function below uses
# that name, and a ``def`` with the same name would rebind it, leaving the
# handler calling ``.chat`` on itself (AttributeError).
bot = ChatBot(llm, embedding_model, vector_store)


def chat_bot(input_text):
    """Gradio handler: forward the user's text to the ChatBot and return its reply."""
    return bot.chat(input_text)


# Create a Gradio interface.
# ``gr.Textbox`` is used for both sides and the caption is passed as ``label``.
chatbot_interface = gr.Interface(
    fn=chat_bot,
    inputs=gr.Textbox(label="User:"),
    outputs=gr.Textbox(label="Bot:"),
    title="Chatbot Assistant for PAN card related query",
    theme="compact",
)

# Launch the Gradio interface
chatbot_interface.launch()
chat.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from models import EmbeddingModel, LLM
2
+ from utils import MistralPrompts
3
+ from vector_store import FaissVectorStore
4
+ import argparse
5
+
6
+ import warnings
7
+ warnings.filterwarnings("ignore")
8
+
9
+ # Create a ChatBot class to manage interactions
10
class ChatBot:
    """Conversational question answering backed by a retriever and an LLM.

    The instance keeps the running conversation in ``chat_history`` as a list
    of ``(question, response)`` tuples, appended to after every ``chat`` call.
    """

    def __init__(self, llm, embedding_model, vector_store):
        """Store the collaborators and start with an empty history.

        Args:
            llm: object exposing ``generate_response(prompt)``.
            embedding_model: object exposing ``encode(text)``.
            vector_store: object exposing ``query(embedding, k)``.
        """
        self.llm = llm
        self.embedding_model = embedding_model
        self.chat_history = []
        self.vector_store = vector_store

    def format_context(self, retrieved_documents):
        """Flatten retrieved documents into two strings.

        Args:
            retrieved_documents: iterable of objects with ``text`` and
                ``metadata`` attributes.

        Returns:
            ``(context, sources)``: every document's text followed by a blank
            line, and every document's ``str(metadata)`` followed by a newline.
            Both are empty strings when nothing was retrieved.
        """
        # This is the simplest way to combine documents; other techniques exist.
        texts, metadata_lines = [], []
        for doc in retrieved_documents:
            texts.append(doc.text + '\n\n')
            metadata_lines.append(str(doc.metadata) + '\n')
        return ''.join(texts), ''.join(metadata_lines)

    def chat(self, question):
        """Answer ``question`` using retrieved context and the chat history.

        Args:
            question: the user's message.

        Returns:
            The response extracted by ``MistralPrompts.extract_response``.
        """
        if self.chat_history:
            # With prior turns, ask the LLM to rewrite the question into a
            # standalone one so retrieval is not thrown off by references to
            # earlier messages.
            chat_history_prompt = MistralPrompts.create_history_prompt(self.chat_history)
            standalone_question_prompt = MistralPrompts.create_standalone_question_prompt(
                question, chat_history_prompt
            )
            standalone_question = self.llm.generate_response(standalone_question_prompt)
        else:
            chat_history_prompt = ''
            standalone_question = question

        # Encode the (standalone) question and fetch the 3 closest documents.
        query_embedding = self.embedding_model.encode(standalone_question)
        retrieved_documents = self.vector_store.query(query_embedding, 3)
        context, sources = self.format_context(retrieved_documents)

        # Print information about retrieved documents
        print("Retrieved documents info: \n", sources)

        # The answer prompt uses the user's original wording, not the rewrite.
        prompt = MistralPrompts.create_question_prompt(question, context, chat_history_prompt)
        response = self.llm.generate_response(prompt)

        # Extract the response and update chat history
        response = MistralPrompts.extract_response(response)
        self.chat_history.append((question, response))
        return response
56
+
57
if __name__ == '__main__':
    # Command-line chat loop: load the models and the vector store, then
    # answer questions typed on stdin until the input stream ends.
    parser = argparse.ArgumentParser()
    parser.add_argument("--vector_database_path", default='vector_db',help="Vector database which store embeddings vector")
    args = parser.parse_args()

    # Parsed option values live on the namespace returned by parse_args(),
    # not on the parser object itself.
    VECTOR_DATABASE_PATH = args.vector_database_path
    # Initialize models and vector store
    embedding_model = EmbeddingModel(model_name='sentence-transformers/all-MiniLM-L6-v2')
    llm = LLM("mistralai/Mistral-7B-Instruct-v0.1")
    vector_store = FaissVectorStore.as_retriever(database_path=VECTOR_DATABASE_PATH)

    # Create a ChatBot instance
    chat_bot = ChatBot(llm, embedding_model, vector_store)

    # Start the conversation
    print("Assistant Bot: Hello, I'm the Assistant Bot! How may I assist you today?")
    while True:
        try:
            question = input("User:")
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C: leave the loop instead of dumping a traceback.
            print()
            break
        response = chat_bot.chat(question)
        print("Assistant Bot:", response, '\n')
data/KnowledgeDocument(pan_card_services).txt ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # About Pan Card
2
+
3
+ ### What is Pan card?
4
+
5
+ The PAN card is a unique ten-character alphanumeric identification number that is issued by the Income Tax Department of India to track the tax-related transactions of individuals and entities. The PAN card is mandatory for any financial transaction in India, including opening a bank account, buying or selling property, and filing income tax returns.
6
+
7
+ ### Who needs a Pan card?
8
+
9
+ All individuals/non-individuals (including foreign citizens/entities) earning taxable income in India must have a PAN card.
10
+
11
+ ### Types of PAN cards
12
+
13
+ In India, two types of PAN cards are available: e-PAN card and physical PAN card.
14
+
15
+ 1. e-PAN card: An e-PAN card is a digitally-signed PAN card issued in electronic format. It contains the same PAN details as a physical PAN card but is available in a digital format. It can be downloaded online and used as a valid identification document for various purposes. The e-PAN card is usually issued in a PDF format.
16
+ 2. Physical PAN card: A physical PAN card is a laminated card with your PAN details printed on it. It is a physical document that can be carried and used as a valid identification proof. The physical PAN card is sent to the applicant's registered address by post.
17
+
18
+ Both e-PAN and physical PAN cards have the same validity and can be used for identification purposes. The choice between the two depends on the applicant's preference and requirements.
19
+
20
+ ### **Why do NRIs need PAN card?**
21
+
22
+ NRIs don’t need to have a PAN Card. However, a PAN Card is necessary for NRIs if they wish to do any of the following in India:
23
+
24
+ 1. A PAN card is required to carry out financial transactions such as opening a bank account, investing in stocks, purchasing or selling property, and investing in India.
25
+
26
+ 2. If an NRI earns an income in India, they must file income tax returns. A PAN card is necessary to file these returns.
27
+
28
+ 3. If an NRI wants to invest in mutual funds in India, they must have a PAN card.
29
+
30
+ ## Importance of PAN card for different NRI account options
31
+
32
+ NRI Accounts comprise of NRE,NRO and FCNR Accounts. A basic overview to understand the importance of PAN Card with respect to these three accounts can be understood as:
33
+
34
+ **NRE:** For funds earned outside India where both Principal and Interest earned are tax-free. Therefore, NRIs can opt for Form 60, which is a substitute for PAN for opening an NRE Account.
35
+
36
+ **NRO:** For funds earned in India which are mostly liable to taxes. Income such as rent or pension where taxes are not deducted at source are deposited in an NRO Account. Therefore, a PAN Card is mandatory for NRO Accounts.
37
+
38
+ **FCNR:** For foreign currency term deposits from outside India. Again, the principal and interest are tax-free, so usage of a PAN Card is not necessary and can be substituted with Form 60.
39
+
40
+ ---
41
+
42
+ # PAN Card Application Process
43
+
44
+ ## New Pan Card
45
+
46
+ ### How can NRI apply for a new PAN card
47
+
48
+ Here are the steps for *PAN CARD* processing.
49
+
50
+ - Visit ABC app
51
+ - Navigate to Services > NRI Pan Card > Apply New PAN
52
+ - Select the required form of PAN card and proceed with the payment
53
+ - Our team will get in touch with you to ask for the following documents:
54
+ - Passport(Any Country) / OCI Card
55
+ - Passport Size Photograph
56
+ - Overseas address proof with zip code (Supporting documents - Indian NRO/NRE Account statement or Overseas bank statement or Utility bill)
57
+
58
+ ### Documents required for a new PAN Card
59
+
60
+ **If you have Aadhaar card**
61
+
62
+ No other document is required. You can get your pan card through your Aadhaar card in 10 minutes.
63
+
64
+ **If you don’t have an Aadhaar card**
65
+
66
+ - Passport(Any Country) / OCI Card
67
+ - Passport Size Photograph
68
+ - Overseas address proof with zip code (Supporting documents - Indian NRO/NRE Account statement or Overseas bank statement or Utility bill)
69
+
70
+ ### Cost of new PAN card
71
+
72
+ The PAN CARD Application through ABC costs Rs 2500 for E-PAN, and if you want it to be couriered, it will cost Rs 1200 extra for physical delivery to your address.
73
+
74
+ - e-PAN Card cost: INR 2500
75
+ - Physical PAN Card cost: INR 3700
76
+
77
+ ### Time required to issue PAN card
78
+
79
+ **If you have Aadhaar card**
80
+
81
+ You can get a Pan Card instantly **(in under 10 minutes)**, if you have an Aadhaar card. You can apply through ABC.
82
+
83
+ **If you don’t have an Aadhaar card**
84
+
85
+ Once the payment is made to ABC, we will contact you and initiate the process. Pan card will be issued in 3 weeks.
86
+
87
+ ## Updation/Correction in the PAN Card
88
+
89
+ ### Information that can be updated in the PAN Card
90
+
91
+ - Your name
92
+ - Father’s name
93
+ - Date of Birth
94
+ - Citizenship
95
+ - Photograph
96
+ - Signature
97
+ - Gender
98
+ - Address
99
+ - Contact details
100
+
101
+ ### General process to update details on PAN Card
102
+
103
+ To update the details on your PAN card, you have to generate the reissue request for the Updation/ Correction of the PAN CARD. Follow the steps:
104
+
105
+ - Go to ABC app
106
+ - Navigate to Services > NRI PAN Card > PAN Card Correction
107
+ - Request reissue the required PAN card and make the payment
108
+ - Our team will reach out to you for the required documents
109
+
110
+ Do you want to start the process here instead? Click the button below.
111
+
112
+ **Time required to complete the correction process for the PAN card:** The duration to complete the correction process for your PAN card can vary, but it generally takes around 2-3 weeks.
113
+
114
+ ### Documents required to update the details on PAN Card
115
+
116
+ To update the information on the PAN card, kindly keep these documents ready.
117
+
118
+ - Copy of Existing Pan card
119
+ - Passport(Any Country) / OCI Card
120
+ - Passport Size Photograph
121
+ - Overseas address proof with zip code (Supporting documents - Indian NRO/NRE Account statement or Overseas bank statement or Utility bill)
122
+
123
+ ### Documents required to update the address on PAN Card
124
+
125
+ - Passport
126
+ - OCI Card
127
+ - Bank Account Statement in the country of residence
128
+ - NRE Bank Account Statement in India
129
+ - Residential Permit
130
+
131
+ ### Process to change the citizenship on PAN Card
132
+
133
+ No direct provisions exist to change citizenship in a Pan Card as the **PAN Card** doesn’t display your citizenship. It is a document required to file taxes, carry out investments and do transactions in India, whether you are a citizen, NRI, or OCI.
134
+
135
+ To change the citizenship in a PAN card, you must meet and notify your jurisdictional Assessing Officer. For NRIs, it is not easy to meet the assessing officer. However, ABC can meet or notify the jurisdictional assessing officer on your behalf. Contact a tax expert at ABC to change your citizenship on PAN card.
136
+
137
+ **Time required to update the citizenship status:** Generally, it takes around a month to complete the process, but the duration can vary based on factors such as workload and the authorities' responsiveness.
138
+
139
+ ## Reprinting lost PAN Card
140
+
141
+ To reprint your PAN card, you need to follow a specific procedure that involves providing certain documents and information to authenticate your identity. The process can take around 2-3 weeks to complete. You can apply for a reprint through ABC. We will guide you through the process and help you obtain a new copy of your PAN card.
142
+
143
+ ### Documents required for reprinting the lost PAN card
144
+
145
+ 1. **If you remember your PAN number:**
146
+ - Pan number
147
+ - Passport(Any Country) / OCI Card
148
+ - Passport Size Photograph
149
+ - Overseas address proof with zip code (Supporting documents - Indian NRO/NRE Account statement or Overseas bank statement or Utility bill)
150
+ 2. **If you don’t remember your PAN number:**
151
+ This gets tricky in most cases so our representative will reach out to you to inform you about the process.
152
+
153
+ Do you remember your PAN card number?
154
+
155
+ ### **Charges for reprinting the PAN Card**
156
+
157
+ The charges for reprinting the PAN Card are INR 2500 for e-pan, and INR 3700 for physical pan card.
158
+
159
+ ---
160
+
161
+ ## Linking PAN with Aadhaar card
162
+
163
+ ABC can link your PAN card and Aadhaar card on your behalf.
164
+
165
+ ### Process to link PAN with Aadhaar
166
+
167
+ - Go to ABC app
168
+ - Navigate to Services > NRI PAN Card > Link PAN with Aadhaar
169
+ - Request reissue the required PAN card and make the payment
170
+ - Our team will reach out to you for the required documents
171
+
172
+ Alternatively, you can also initiate the process on WhatsApp as well.
173
+
174
+ ### ABC fees to link PAN with Aadhaar
175
+
176
+ The charges for linking Pan & Aadhaar are INR 2000/-, including the penalty charges to be paid for the delay in linking Pan & Aadhaar.
177
+
178
+ ### **Documents required** to link PAN with Aadhaar
179
+
180
+ Kindly share a copy of your pan card and Aadhaar card. ABC will review the documents and share a payment link for the linking.
181
+
182
+ New Deadline for linking Aadhaar with pan card 30 June 2023. PAN will become inoperative after June 2023 if not linked to Aadhaar.
183
+
184
+ ### Time required **for PAN Aadhaar link for NRI?**
185
+
186
+ It takes upto 6 to 7 days for PAN Aadhaar linking for NRIs.
187
+
188
+ ---
189
+
190
+ # Form 49AA
191
+
192
+ ### **What is Form 49aa?**
193
+
194
+ Form 49AA is the application form for the allotment of Permanent Account Number for Foreign residents and entities incorporated outside India.
195
+
196
+ ### **Documents Required for Form 49AA**
197
+
198
+ Here are the necessary documents that are supposed to be submitted along with PAN Card Form 49AA
199
+
200
+ 1. Passport
201
+ 2. PIO card issued by Government of India
202
+ 3. OCI card issued by Government of India
203
+ 4. Other national/citizenship Identification Number/Taxpayer Identification Number duly attested by “Apostille” or by the Indian Embassy/High Commission/Consulate in the country where the applicant is located or authorized officials of overseas branches of Scheduled Banks registered in India.
204
+ 5. Bank account statement in the country of residence
205
+ 6. NRE bank account statement in India
206
+ 7. Certificate of Residence in India or Residential permit issued by the State Police Authorities
207
+ 8. Registration certificate issued by the Foreigner’s Registration Office showing Indian address
208
+ 9. Visa granted and Copy of appointment letter/ contract from Indian Company & Certificate (in original) of Indian address issued by the employer
209
+
210
+ ---
211
+
212
+ # FAQs about PAN Card
213
+
214
+ **Is it mandatory to link Aadhaar with PAN for NRI?**
215
+
216
+ No, Aadhaar and PAN linking is optional for NRIs. However, to avoid any legal complications in India, NRIs should either link their PAN to Aadhaar, or update their status as non-resident.
217
+
218
+ **Is PAN card different for NRI?**
219
+
220
+ No, PAN works completely the same way for both NRI and Resident Indian except for one factor – the type of Application Form alone changes with respect to your current Residential Status. A PAN is mandatory when filing an Income-Tax return, TDS or any transaction that attracts tax.
221
+
222
+ **Can I apply for pan card from USA?**
223
+
224
+ Yes. You can apply for a PAN Card from the USA. The easiest and most convenient way to apply for a PAN card from the USA is through ABC.
225
+
226
+ **Is a PAN card mandatory for an NRI bank account?**
227
+
228
+ No, in the absence of the Pan Card, **NRIs can sign Form 60** [Form 60 is a declaration to be filed by an individual or a person (not being a company or firm) who does not have a Permanent Account Number (PAN) and who is involved in any transaction] to open an NRI Account.
229
+
230
+ **Can OCI holders get PAN card?**
231
+
232
+ Yes, NRIs who hold foreign citizenship, such as OCI holders or people of Indian origin who possess foreign citizenship or foreigners who are not of Indian origin, can get PAN card through Form 49AA.
233
+
234
+ **Can an NRI buy property in India without PAN card?**
235
+
236
+ No, a [PAN card](https://ABC.com/blog/nri-income-tax/uses-of-a-pan-card-for-nris) is mandatory to buy property in India. NRIs need a PAN card because they will be required to file income tax returns if they have rented out the property. Besides, if the property is sold later, the capital gains resulting from the sales would be subject to capital gains tax.
237
+
238
+ **What is the difference between an NRI PAN card and normal PAN card?**
239
+
240
+ There is no difference between an NRI PAN card and normal PAN card. An [NRI PAN card is the same as a PAN card issued to individuals living in India](https://ABC.com/blog/nri-bank-accounts/what-is-the-difference-between-an-nri-pan-card-and-a-normal-pan-card#:~:text=There%20is%20only%20one%20Pan,to%20Indian%20Residents%20and%20NRIs.). However, to apply for a PAN card for OCI or people of Indian origin who hold foreign citizenship, an applicant is required to fill Form 49AA.
241
+
242
+ **Is a PAN card mandatory for NRI?**
243
+
244
+ PAN is not compulsory for all NRIs. A PAN card is mandatory for NRIs with a source of income in India to file their taxes or if they want to invest in stocks or mutual funds in India.
245
+
246
+ **What is the difference between PAN card and Form 60?**
247
+
248
+ The **basic difference** between a PAN Card and [Form 60](https://www.incometaxindia.gov.in/forms/income-tax%20rules/103120000000007944.pdf) is that you can only sign and use the Form 60 to open a bank account but in order to file taxes and carry out investments from that account, a Pan Card is mandatory.
249
+
250
+ **Can I get a new PAN card as a Canadian citizen?**
251
+
252
+ No, it is illegal to possess multiple PAN cards. As a Canadian citizen, you cannot obtain a new PAN card. Instead, you should update your citizenship status in the existing PAN card.
253
+
254
+ **Can I perform KYC for my father's mutual funds with an NRI PAN card?**
255
+
256
+ You can use your NRI PAN card to perform KYC (Know Your Customer) for your father's mutual funds. However, ensuring that your PAN card reflects your updated citizenship status is crucial. You need to complete updating your citizenship in the PAN database before using it for any financial transactions or KYC requirements.
257
+
258
+ **Can the new PAN card be dispatched to my Canadian address?**
259
+
260
+ Yes, the PAN card can be delivered to your overseas address, including your Canadian address. While applying for a new PAN card or requesting corrections, you can provide your Canadian address as the delivery address. Make sure to provide accurate and complete address details to ensure successful delivery.
261
+
262
+ **What documents are required for updating citizenship on the PAN card?**
263
+
264
+ To update your citizenship status, you must provide documents such as your foreign passport, revoked Indian passport (if applicable), and a citizenship renunciation letter (if you have renounced Indian citizenship).
265
+
266
+ **Can an overseas driving license be considered as a valid address proof for the PAN card?**
267
+
268
+ No, an overseas driving license is generally not considered a valid address proof for the PAN card. Instead, you can provide alternative documents such as bank statements or credit card statements that contain your overseas address as proof of address while applying for corrections or updating your PAN card.
269
+
270
+ **How can I make the payment for the PAN card correction process?**
271
+
272
+ During the PAN card correction process, you will be either provided with a payment link or taken to Razorpay page. Razorpay is a secure online platform where you can make the payment conveniently using various payment options such as credit/debit cards, net banking, or digital wallets. Make sure to follow the instructions provided and ensure the payment is made within the specified timeframe.
273
+
274
+ **Can I link aadhaar card and PAN card even if there is minor difference in my name in both?**
275
+
276
+ It is important for the date of birth (DOB) to be the same on both the PAN and Aadhaar documents. However, minor differences in the name should not pose an issue.
277
+
278
+ **Can NRIs make the payment for the process using an Indian bank account?**
279
+
280
+ Yes, NRIs can make the payment through their Indian bank accounts. However, it is worth noting that paying in Indian Rupees (INR) may be costlier due to the application of 18% GST.
281
+
282
+ **Is it possible to make the payment for the process using a cheque?**
283
+
284
+ No, ABC only accepts online payments for the linking process and does not accept cheques.
285
+
286
+ **What payment options are available for NRIs?**
287
+
288
+ NRIs can make the payment using their debit or credit cards or through their international cards.
289
+
290
+ **Can NRIs residing in the USA link their PAN and Aadhaar cards without visiting India?**
291
+
292
+ Yes, NRIs residing in the USA can link their PAN and Aadhaar cards without the need to visit India. ABC can assist them digitally.
293
+
294
+ **How can NRIs share their PAN and Aadhaar card details with ABC for the linking process?**
295
+
296
+ NRIs can share their PAN and Aadhaar card details by providing clear images of the documents to ABC. Blurry or unclear images may require re-submission for verification.
297
+
298
+ **Can NRIs use an international card for making the payment?**
299
+
300
+ Yes, NRIs can use their international debit or credit cards to make the payment for the linking process.
301
+
302
+ **Is it necessary to download the ABC app or visit their website for the payment process**
303
+
304
+ No, it is not necessary to download the ABC app or visit their website for the payment process. The payment link provided by ABC can be accessed directly to make the payment.
305
+
306
+ **Can I apply for pan card without Aadhaar?**
307
+
308
+ Yes, NRIs can apply for a PAN card without an Aadhaar Card. They can simply do so by filling out either of the forms – 49A (for citizens of India) or Form 49AA (for foreign citizens).
309
+
310
+ **Can I apply for a PAN card if I am a non-resident Indian (NRI)?**
311
+
312
+ Yes, as an NRI, you can apply for a PAN card. The process for applying for a PAN card is the same for both residents and NRIs. However, if you are an OCI holder or a person of Indian origin who holds foreign citizenship, you will need to fill Form 49AA to apply for a PAN card.
313
+
314
+ **Can I take the delivery of Pan card at Indian address?**
315
+
316
+ Yes, you can take the delivery of your PAN card only at an Indian address mentioned in your Aadhaar card. While applying for a new PAN card or requesting corrections, you can provide your Indian address as the delivery address. Make sure to provide accurate and complete address details to ensure successful delivery.
data_processor/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .document_reader import DocumentReader
2
+ from .text_splitter import SentenceSplitter
data_processor/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (324 Bytes). View file
 
data_processor/__pycache__/document_reader.cpython-311.pyc ADDED
Binary file (3.37 kB). View file
 
data_processor/__pycache__/text_splitter.cpython-311.pyc ADDED
Binary file (9.12 kB). View file
 
data_processor/document_reader.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import pypdf
3
+ import docx2txt
4
+
5
class DocumentReader:
    """Load PDF, DOCX and TXT files into a uniform list-of-dicts form.

    Every reader returns a list of ``{"text": ..., "metadata": {...}}``
    dictionaries. ``metadata`` always contains ``"file_name"`` (the final
    path component); PDF entries also carry ``"page_label"``.
    """

    @staticmethod
    def read_pdf(data_path):
        """Read a PDF and return one entry per page.

        Args:
            data_path: path to the PDF, as ``str`` or ``pathlib.Path``.

        Returns:
            A list with one dict per page holding the extracted page text and
            ``{"page_label", "file_name"}`` metadata.
        """
        data_path = Path(data_path)
        with open(data_path, "rb") as fp:
            pdf = pypdf.PdfReader(fp)  # Open the PDF file
            # Fetch the labels once instead of re-reading the property per page.
            page_labels = pdf.page_labels
            # Extraction must happen while the file is still open.
            return [
                {
                    "text": page.extract_text(),
                    "metadata": {
                        "page_label": page_labels[page_index],
                        "file_name": data_path.name,
                    },
                }
                for page_index, page in enumerate(pdf.pages)
            ]

    @staticmethod
    def read_docx(data_path):
        """Read a DOCX file and return a single entry with its text.

        Args:
            data_path: path to the DOCX file, as ``str`` or ``pathlib.Path``.
        """
        data_path = Path(data_path)
        text = docx2txt.process(data_path)  # Extract text from the DOCX file
        return [{'text': text, 'metadata': {"file_name": data_path.name}}]

    @staticmethod
    def read_txt(data_path):
        """Read a text file as UTF-8 and return a single entry with its content.

        The encoding is explicit so the result does not depend on the
        platform's default locale.

        Args:
            data_path: path to the text file, as ``str`` or ``pathlib.Path``.
        """
        data_path = Path(data_path)
        with open(data_path, "r", encoding="utf-8") as fp:
            text = fp.read()  # Read text from the TXT file
        return [{'text': text, 'metadata': {"file_name": data_path.name}}]

    @staticmethod
    def read_document(file_path):
        """Dispatch to the reader matching the file extension.

        The extension is compared case-insensitively, so ``.PDF`` and ``.pdf``
        are treated alike.

        Args:
            file_path: path to a ``.pdf``, ``.docx`` or ``.txt`` file.

        Returns:
            The list produced by the matching reader.

        Raises:
            ValueError: if the extension is not one of the supported formats.
        """
        data_path = Path(file_path)
        readers = {
            ".pdf": DocumentReader.read_pdf,
            ".docx": DocumentReader.read_docx,
            ".txt": DocumentReader.read_txt,
        }
        reader = readers.get(data_path.suffix.lower())
        if reader is None:
            raise ValueError("Unsupported file format")
        return reader(data_path)
46
+
47
if __name__ == '__main__':
    # Manual smoke test: load one document from the working directory and
    # dump the extracted text and metadata to stdout.
    DATA_PATH = '71763-gale-encyclopedia-of-medicine.-vol.-1.-2nd-ed.pdf'
    documents = DocumentReader.read_document(DATA_PATH)
    print(documents)
data_processor/text_splitter.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ import re
3
+
4
+ # Data class for representing a text split
5
@dataclass
class Split:
    """One piece of text produced while breaking a document down."""

    # The text of this piece.
    text: str
    # Flag recorded by the splitter: whether this piece is a full sentence.
    is_sentence: bool
9
+
10
+ # Data class for representing a document
11
@dataclass
class Document:
    """A chunk of text together with its identifier and metadata."""

    # Identifier of the chunk.
    doc_id: str
    # The chunk's text.
    text: str
    # Free-form information about where the chunk came from.
    metadata: dict
16
+
17
+ # Class for splitting text into sentences
18
class SentenceSplitter:
    """Split document text into size-bounded chunks with overlap.

    Sizes are measured by ``_token_size``, i.e. the number of pieces obtained
    by splitting on a single space -- not model tokens.

    Text is first broken into splits no larger than ``chunk_size`` (paragraph
    breaks first, then punctuation, then single spaces). The splits are then
    merged back into chunks, and each new chunk is seeded with trailing splits
    of the previous one, up to ``chunk_overlap`` tokens.
    """

    def __init__(self, chunk_size=100, chunk_overlap=50):
        """Configure the splitter.

        Args:
            chunk_size: target maximum chunk size, in ``_token_size`` units.
            chunk_overlap: maximum size of the overlap carried over from the
                previous chunk, in the same units.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Sentence-level splitters, tried in order: blank-line paragraph
        # breaks, then runs ending in ASCII or full-width punctuation.
        # The full-width characters are written as escapes so the pattern
        # survives re-encoding of this file.
        self._split_fn_sentence = [
            self._split_by_sep('\n\n'),
            self._split_by_regex("[^,.;\u3002\uff1f\uff01]+[,.;\u3002\uff1f\uff01]?"),
        ]
        # Fallback splitters used when no sentence-level splitter applies.
        self._split_fn_subsentence = [self._split_by_sep(' ')]

    def _split_by_sep(self, sep):
        """Return a function splitting text on ``sep`` and keeping the separator.

        The separator is prepended to every piece but the first, so joining
        the pieces reproduces the input. Empty pieces are dropped.
        """
        def split(text):
            parts = text.split(sep)
            pieces = [sep + part if i > 0 else part for i, part in enumerate(parts)]
            return [piece for piece in pieces if piece]
        return split

    def _split_by_regex(self, regex):
        """Return a function yielding all non-overlapping matches of ``regex``."""
        return re.compile(regex).findall

    def _splits_by_fns(self, text):
        """Split ``text`` with the first splitter that yields more than one piece.

        Returns:
            ``(pieces, is_sentence)``. ``is_sentence`` is True when a
            sentence-level splitter succeeded, False when the result comes
            from the sub-sentence fallback (which may be a single piece).
        """
        for split_fn in self._split_fn_sentence:
            splits = split_fn(text)
            if len(splits) > 1:
                return splits, True

        for split_fn in self._split_fn_subsentence:
            splits = split_fn(text)
            if len(splits) > 1:
                break

        return splits, False

    def _token_size(self, text):
        """Return the size of ``text``: the number of single-space-separated pieces."""
        return len(text.split(' '))

    def _split(self, text, chunk_size):
        """Recursively break ``text`` into ``Split`` objects of at most ``chunk_size``.

        A piece that none of the splitters can divide further is emitted as
        is, even when larger than ``chunk_size``; without this the recursion
        would never terminate on such a piece.
        """
        if self._token_size(text) <= chunk_size:
            return [Split(text, is_sentence=True)]

        pieces, is_sentence = self._splits_by_fns(text)
        if len(pieces) == 1:
            # Cannot be divided any further: stop instead of recursing on the
            # very same text again.
            return [Split(text, is_sentence=is_sentence)]

        text_splits = []
        for piece in pieces:
            if self._token_size(piece) <= chunk_size:
                text_splits.append(Split(piece, is_sentence=is_sentence))
            else:
                text_splits.extend(self._split(piece, chunk_size=chunk_size))
        return text_splits

    def _overlap_tail(self, chunk):
        """Return the trailing ``(text, length)`` entries of ``chunk`` to carry over.

        Entries are taken from the end while their total length stays within
        ``chunk_overlap``. Returns ``(entries, total_length)`` in original order.
        """
        tail, tail_len = [], 0
        for text, length in reversed(chunk):
            if tail_len + length > self.chunk_overlap:
                break
            tail.append((text, length))
            tail_len += length
        tail.reverse()
        return tail, tail_len

    def _merge(self, splits, chunk_size):
        """Merge consecutive splits into chunks of roughly ``chunk_size`` tokens.

        Args:
            splits: sequence of objects with a ``text`` attribute; it is not
                modified.
            chunk_size: size at which the current chunk is closed.

        Returns:
            The list of chunk strings, stripped, with blank chunks removed.
        """
        chunks = []
        cur_chunk = []  # (text, token_length) pairs of the chunk being built
        cur_chunk_len = 0
        new_chunk = True  # True until a split is added after opening a chunk

        index = 0
        while index < len(splits):
            cur_split = splits[index]
            cur_split_len = self._token_size(cur_split.text)

            if cur_chunk_len + cur_split_len > chunk_size and not new_chunk:
                # The split does not fit: close the chunk, seed the next one
                # with the overlap, and look at the same split again.
                chunks.append("".join(text for text, _ in cur_chunk))
                cur_chunk, cur_chunk_len = self._overlap_tail(cur_chunk)
                new_chunk = True
                continue

            # Either the split fits, or the chunk was just opened; a fresh
            # chunk always takes at least one split so progress is guaranteed.
            cur_chunk.append((cur_split.text, cur_split_len))
            cur_chunk_len += cur_split_len
            new_chunk = False
            index += 1

        # Handle the last chunk
        if not new_chunk:
            chunks.append("".join(text for text, _ in cur_chunk))

        # Post-process: strip whitespace and drop blank chunks.
        return [chunk.strip() for chunk in chunks if chunk.strip() != ""]

    def split_texts(self, documents):
        """Chunk every document and wrap each chunk in a ``Document``.

        Args:
            documents: iterable of dicts with ``'text'`` and ``'metadata'``
                keys; ``metadata`` must contain ``'file_name'``.

        Returns:
            A list of ``Document`` objects. Each has the id
            ``"<file_name>__<page_no>__<chunk_no>"``, where ``page_no`` is the
            position of the source entry in ``documents``. Entries with empty
            text are skipped.
        """
        chunked_documents = []
        for page_no, document in enumerate(documents):
            text, metadata = document['text'], document['metadata']
            if text == "":
                continue
            splits = self._split(text, self.chunk_size)
            chunks = self._merge(splits, self.chunk_size)
            for chunk_no, chunk in enumerate(chunks):
                chunk_id = f"{metadata['file_name']}__{page_no}__{chunk_no}"
                chunk_metadata = {'file_name': metadata['file_name'], 'page_no': page_no, 'chunk_no': chunk_no}
                chunked_documents.append(Document(chunk_id, chunk, chunk_metadata))
        return chunked_documents
141
+
142
if __name__ == '__main__':
    # Manual smoke test: chunk ten references to the same tiny document and
    # show the first resulting chunk.
    document = dict(
        text="This is example texts",
        metadata=dict(file_name="example.pdf", page_no=1),
    )
    documents = [document for _ in range(10)]
    splitter = SentenceSplitter(chunk_size=100, chunk_overlap=30)
    splitted_documents = splitter.split_texts(documents)

    print(splitted_documents[0])
ingest.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from data_processor import DocumentReader, SentenceSplitter
2
+ from models import EmbeddingModel
3
+ from vector_store import FaissVectorStore
4
+ from tqdm import tqdm
5
+ import argparse
6
+
7
if __name__ == '__main__':
    # Command-line options: where to read the source document and where to store the index.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_path",
        default='data/KnowledgeDocument(pan_card_services).txt',
        help="Input file name",
    )
    parser.add_argument(
        "--vector_database_path",
        default='vector_db',
        help="Vector database which store embeddings vector",
    )
    args = parser.parse_args()

    DATA_PATH = args.data_path
    VECTOR_DATABASE_PATH = args.vector_database_path

    # Load the raw document and cut it into chunks.
    documents = DocumentReader.read_document(DATA_PATH)
    splitter = SentenceSplitter(chunk_size=60, chunk_overlap=20)
    splitted_documents = splitter.split_texts(documents)

    embedding_model = EmbeddingModel(model_name='sentence-transformers/all-MiniLM-L6-v2')

    # Maps each chunk's doc_id to the chunk itself and its embedding vector.
    database_documents = {}
    batch_size = 16
    print("Generating embedding vectors....")
    for start in tqdm(range(0, len(splitted_documents), batch_size)):
        batch = splitted_documents[start:start + batch_size]
        # Embed the whole batch in one model call.
        embeddings = embedding_model.encode([chunk.text for chunk in batch])
        for position, chunk in enumerate(batch):
            database_documents[chunk.doc_id] = {'document': chunk, 'vector': embeddings[position]}
    print("Total embeddings: ", len(database_documents))

    # Build the Faiss store from the embedded chunks and persist it to disk.
    vector_store = FaissVectorStore.from_documents(
        database_documents,
        dimension=embedding_model.embedding_dim,
        nlists=100,
        nprobe=10,
    )
    vector_store.write(VECTOR_DATABASE_PATH)
    print(f"Successfully written embedding vectors to {VECTOR_DATABASE_PATH} .")
models/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .embedding_models import EmbeddingModel
2
+ from .llms import LLM
models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (295 Bytes). View file
 
models/__pycache__/embedding_models.cpython-311.pyc ADDED
Binary file (3.02 kB). View file
 
models/__pycache__/llms.cpython-311.pyc ADDED
Binary file (2.52 kB). View file
 
models/embedding_models.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModel
3
+ import torch.nn.functional as F
4
+
5
+ # Create a class for embedding sentences using Hugging Face Transformers
6
class EmbeddingModel:
    """Sentence embedder built on a Hugging Face tokenizer/model pair."""

    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2'):
        """Load the tokenizer and model, then record the embedding width.

        Args:
            model_name: Hugging Face model identifier to load.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # Probe the model once with a short text to learn the output dimension.
        probe = self.encode('Hi')
        self.embedding_dim = probe.shape[1]

    def _mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings, counting only positions the mask keeps.

        Args:
            model_output: Model output whose first element holds the token embeddings.
            attention_mask: Mask tensor; positions with 0 are excluded from the mean.

        Returns:
            One pooled embedding per input sequence.
        """
        hidden = model_output[0]
        mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
        summed = torch.sum(hidden * mask, 1)
        # The clamp avoids a division by zero for fully masked rows.
        counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / counts

    def encode(self, text):
        """Embed ``text`` (a string or list of strings) as L2-normalised vectors.

        Returns:
            A float32 NumPy array with one row per input text.
        """
        batch = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = self.model(**batch)
            pooled = self._mean_pooling(model_output, batch['attention_mask'])
        unit_vectors = F.normalize(pooled, p=2, dim=1)
        return unit_vectors.numpy().astype('float32')
29
+
30
if __name__ == '__main__':
    # Example inputs for manual experiments (not used by the check below).
    sentences = [
        'This is an example sentence',
        'Each sentence is converted',
    ]
    # Report the embedding width of the default model.
    default_model = EmbeddingModel()
    print(default_model.embedding_dim)
35
+
models/llms.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModelForCausalLM, AutoTokenizer
2
+ import torch
3
+
4
+ # Define a Language Model class
5
class LLM:
    """Wrapper around a Hugging Face causal language model and its tokenizer."""

    def __init__(self, model_name):
        """Load the model and tokenizer for ``model_name``.

        Args:
            model_name: Hugging Face model identifier to load.
        """
        # Device placement is delegated to `device_map='auto'`; inputs are moved
        # to wherever the model ended up (see `generate_response`).
        # NOTE(review): float16 dtype, 8-bit loading and a final bfloat16 cast are
        # requested together — confirm this combination is intended.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            load_in_8bit=True,
            device_map='auto'
        ).bfloat16()

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Padding is applied on the left so generated tokens directly follow the prompt.
        # NOTE(review): "[PAD]" is assigned as a plain string; verify that it maps
        # to a real token id in this tokenizer's vocabulary.
        self.tokenizer.pad_token = "[PAD]"
        self.tokenizer.padding_side = "left"

    def generate_response(self, messages, max_tokens=100, do_sample=True):
        """Generate a continuation for ``messages``.

        Args:
            messages: Prompt text (or list of prompt texts) to feed the model.
            max_tokens: Maximum number of new tokens to generate.
            do_sample: Whether to sample (temperature 0.3) instead of decoding greedily.

        Returns:
            The decoded text of the first generated sequence, including the
            prompt and special tokens.
        """
        encoded = self.tokenizer(
            messages,
            max_length=512,
            padding=True,
            truncation=True,
            return_tensors='pt'
        )
        # Follow the model's own device instead of hard-coding CUDA, so the call
        # also works when no GPU is present.
        target_device = self.model.device
        input_ids = encoded.input_ids.to(target_device)
        attention_mask = encoded.attention_mask.to(target_device)

        generation_kwargs = {
            # Pass the mask explicitly so padded positions are ignored.
            'attention_mask': attention_mask,
            'pad_token_id': self.tokenizer.pad_token_id,
            'max_new_tokens': max_tokens,
            'do_sample': do_sample,
        }
        if do_sample:
            # Temperature only has an effect when sampling.
            generation_kwargs['temperature'] = 0.3

        with torch.no_grad():
            generated_ids = self.model.generate(input_ids, **generation_kwargs)

        # Only the first sequence of the batch is returned.
        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=False)[0]
        return response
48
+
49
+ # Main program
50
if __name__ == '__main__':
    # Smoke test: instantiating the wrapper loads the default checkpoint.
    model_name = "mistralai/Mistral-7B-Instruct-v0.1"
    llm = LLM(model_name=model_name)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ pypdf
2
+ docx2txt
3
+ faiss-cpu
4
+ git+https://github.com/huggingface/optimum.git
5
+ git+https://github.com/huggingface/transformers.git
6
+ accelerate
7
+ bitsandbytes
8
+ gradio
utils.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class MistralPrompts:
    """Prompt builders and response parsing using the ``[INST] ... [/INST]`` format."""

    @staticmethod
    def create_standalone_question_prompt(question, chat_history_prompt):
        """Build a prompt asking the model to rewrite a follow-up as a standalone question.

        Args:
            question: The user's follow-up question.
            chat_history_prompt: Already formatted chat history, placed before
                the instruction.

        Returns:
            The chat history followed by the rephrasing instruction.
        """
        # The stray, unbalanced double quote that preceded "Follow up question"
        # has been dropped from the prompt text.
        message = (
            "\n[INST]\n"
            "Taking chat history as context, rephrase follow up question into a standalone question.\n"
            f"Follow up question: {question}\n"
            "[/INST]\n"
        )
        return chat_history_prompt + message

    @staticmethod
    def create_history_prompt(chat_history):
        """Format ``(user_message, bot_message)`` pairs as one history prompt.

        Args:
            chat_history: Sequence of ``(user_message, bot_message)`` pairs.

        Returns:
            The formatted history, opened by a single ``<s>`` with every turn
            closed by ``</s>``; an empty string when there is no history yet.
        """
        if not chat_history:
            # No turns yet: return the empty prompt that
            # `create_question_prompt` treats as "no history".
            return ""
        turns = "".join(
            f"[INST] {user_message} [/INST] {bot_message}</s>"
            for user_message, bot_message in chat_history
        )
        return "<s>" + turns

    @staticmethod
    def create_question_prompt(question, context, chat_history_prompt):
        """Build the answering prompt from retrieved context and the question.

        Args:
            question: The question to answer.
            context: Retrieved text the answer should be based on.
            chat_history_prompt: Formatted chat history, or an empty string.

        Returns:
            With no history, a prompt that includes the answering instructions;
            otherwise the history followed by the context and question.
        """
        template = (
            "\n[INST]\n"
            "{instructions}\n"
            "Context: {context}\n"
            "Question: {question}\n"
            "[/INST]\n"
        )
        if not chat_history_prompt:
            # First turn: spell out how the model should use the context.
            instructions = (
                "\n"
                "Use the following pieces of information to answer the user's question.\n"
                "If you don't know the answer, just say that you don't know,\n"
                "don't try to make up an answer.\n"
            )
            return template.format(instructions=instructions, context=context, question=question)
        # Later turns: the instructions are omitted and the history is prepended.
        message = template.format(instructions='', context=context, question=question)
        return chat_history_prompt + message

    @staticmethod
    def extract_response(response):
        """Return the text after the last ``[/INST]`` and before the next ``</s>``."""
        answer = response.split('[/INST]')[-1]
        return answer.split('</s>')[0].strip()
vector_db/documents.pkl ADDED
Binary file (140 kB). View file
 
vector_db/index.faiss ADDED
Binary file (109 kB). View file
 
vector_store/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .faiss_vector_store import FaissVectorStore
vector_store/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (257 Bytes). View file
 
vector_store/__pycache__/faiss_vector_store.cpython-311.pyc ADDED
Binary file (9.93 kB). View file
 
vector_store/faiss_vector_store.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
3
+ import os
4
+ import pickle
5
+ from tqdm import tqdm
6
+
7
+ # Create a class for a flat index
8
class IndexFlat:
    """Wrapper around a Faiss flat index that uses L2 distance."""

    def __init__(self, dimension):
        """Create an empty flat L2 index for vectors of size ``dimension``."""
        self.index = faiss.IndexFlatL2(dimension)

    def add(self, vectors):
        """Add ``vectors`` (array-like of shape ``(n, dimension)``) to the index."""
        # Faiss expects contiguous float32 data; coerce it explicitly.
        self.index.add(np.ascontiguousarray(vectors, dtype='float32'))

    def delete(self, ids):
        """Remove the vectors with the given index ids.

        NOTE(review): per the Faiss documentation, removing from a flat index
        shifts the ids of the remaining vectors — callers that store ids
        should re-derive them after a delete.
        """
        # An explicit int64 dtype keeps an empty id list from becoming float64.
        self.index.remove_ids(np.asarray(ids, dtype='int64'))

    def search(self, vectors, k):
        """Return Faiss's ``(distances, ids)`` for the ``k`` nearest neighbours of each query."""
        return self.index.search(np.ascontiguousarray(vectors, dtype='float32'), k)
24
+
25
+ # Create a class for an IVF (Inverted File) index
26
class IndexIVF:
    """Wrapper around a Faiss IVF (inverted file) flat index."""

    def __init__(self, dimension, nlists=100, nprobe=10):
        """Create an untrained IVF index.

        Args:
            dimension: Size of the indexed vectors.
            nlists: Number of inverted lists (clusters).
            nprobe: Number of lists visited per query.
        """
        # NOTE(review): the coarse quantizer uses L2 while the IVF index uses
        # the inner-product metric — confirm that this mix is intended.
        self.index_flat = faiss.IndexFlatL2(dimension)
        self.index = faiss.IndexIVFFlat(self.index_flat, dimension, nlists, faiss.METRIC_INNER_PRODUCT)
        self.index.nprobe = nprobe

    def add(self, vectors):
        """Train the index on ``vectors`` if it is still untrained, then add them."""
        # Faiss expects contiguous float32 data; convert once and reuse it.
        data = np.ascontiguousarray(vectors, dtype='float32')
        if not self.index.is_trained:
            self.index.train(data)
        self.index.add(data)

    def delete(self, ids):
        """Remove the vectors with the given index ids."""
        # An explicit int64 dtype keeps an empty id list from becoming float64.
        self.index.remove_ids(np.asarray(ids, dtype='int64'))

    def search(self, vectors, k):
        """Return Faiss's ``(distances, ids)`` for the ``k`` nearest neighbours of each query."""
        return self.index.search(np.ascontiguousarray(vectors, dtype='float32'), k)
45
+
46
+ # Create a class for managing Faiss vector storage
47
class FaissVectorStore:
    """Document store backed by a Faiss index.

    ``documents_db`` maps each document id to a dict with its ``'vector'``,
    its ``'document'`` and its ``'index_id'`` (the row position in the index).
    ``index`` is always an ``IndexFlat``/``IndexIVF`` wrapper, also after ``read``.
    """

    # Below this many vectors a flat index is used, otherwise an IVF index.
    _IVF_THRESHOLD = 10000

    def __init__(self, dimension=324, nlists=100, nprobe=10):
        """Create an empty store.

        Args:
            dimension: Size of the stored vectors.
            nlists: Number of inverted lists used when an IVF index is built.
            nprobe: Number of lists probed per query for an IVF index.
        """
        self.dimension = dimension
        self.nlists = nlists
        self.nprobe = nprobe
        self.index = None
        self.documents_db = {}

    def _rebuild_index(self, entries):
        """Rebuild the index and ``documents_db`` from ``doc_id -> entry`` pairs."""
        if len(entries) < self._IVF_THRESHOLD:
            self.index = IndexFlat(self.dimension)
        else:
            self.index = IndexIVF(self.dimension, self.nlists, self.nprobe)

        if entries:
            vectors = np.asarray([entry['vector'] for entry in entries.values()], dtype='float32')
            self.index.add(vectors)

        # Row positions in the fresh index follow the insertion order of `entries`.
        self.documents_db = {
            doc_id: {'vector': entry['vector'], 'document': entry['document'], 'index_id': position}
            for position, (doc_id, entry) in enumerate(entries.items())
        }

    def add(self, documents):
        """Add documents and rebuild the index over old and new entries.

        Args:
            documents: Mapping of doc id to ``{'document': ..., 'vector': ...}``.
                An id that already exists replaces the stored entry, so no
                orphan vector is left in the index.
        """
        merged = dict(self.documents_db)
        merged.update(documents)
        self._rebuild_index(merged)

    def delete(self, documents_ids):
        """Remove the given document ids; unknown ids are ignored."""
        doomed = set(documents_ids)
        remaining = {doc_id: entry for doc_id, entry in self.documents_db.items() if doc_id not in doomed}
        if len(remaining) == len(self.documents_db):
            # Nothing matched, so the index is left untouched.
            return
        # Rebuild so every stored `index_id` matches the index again.
        self._rebuild_index(remaining)

    def query(self, query_vector, k):
        """Return the documents of the ``k`` nearest neighbours, nearest first.

        Args:
            query_vector: Query array passed to the index search; only the
                first query row is used.
            k: Number of neighbours to retrieve.
        """
        _, neighbour_ids = self.index.search(query_vector, k)
        by_index_id = {entry['index_id']: entry['document'] for entry in self.documents_db.values()}
        documents = []
        for index_id in neighbour_ids[0]:
            # Ids with no stored document (e.g. -1 for "no result") are skipped.
            document = by_index_id.get(int(index_id))
            if document is not None:
                documents.append(document)
        return documents

    def write(self, database_path):
        """Save the index (``index.faiss``) and documents (``documents.pkl``)."""
        os.makedirs(database_path, exist_ok=True)
        faiss_path = os.path.join(database_path, 'index.faiss')
        document_path = os.path.join(database_path, 'documents.pkl')
        faiss.write_index(self.index.index, faiss_path)
        with open(document_path, 'wb') as f:
            pickle.dump(self.documents_db, f)

    def read(self, database_path):
        """Load the index and documents previously saved with ``write``."""
        faiss_path = os.path.join(database_path, 'index.faiss')
        document_path = os.path.join(database_path, 'documents.pkl')

        raw_index = faiss.read_index(faiss_path)
        # Wrap the loaded index so the other methods can keep using the
        # wrapper interface; `__new__` avoids building a throwaway index.
        wrapper_cls = IndexIVF if isinstance(raw_index, faiss.IndexIVF) else IndexFlat
        wrapper = wrapper_cls.__new__(wrapper_cls)
        wrapper.index = raw_index
        self.index = wrapper
        self.dimension = raw_index.d

        # SECURITY: pickle can execute arbitrary code while loading; only read
        # databases that come from a trusted source.
        with open(document_path, 'rb') as f:
            self.documents_db = pickle.load(f)

    @classmethod
    def from_documents(cls, documents, dimension, nlists, nprobe):
        """Create a store and fill it with ``documents``."""
        vector_store = cls(dimension, nlists, nprobe)
        vector_store.add(documents)
        return vector_store

    @classmethod
    def as_retriever(cls, database_path):
        """Create a store by loading a database saved at ``database_path``."""
        vector_store = cls()
        vector_store.read(database_path)
        return vector_store
129
+
130
if __name__ == '__main__':
    # Smoke test: index random vectors and look up the neighbours of a random query.
    nb = 20000
    d = 50
    database_path = 'db_path'

    if not os.path.exists(database_path):
        os.makedirs(database_path)

    documents = {}
    for n in range(nb):
        doc_id = f'id_{n}'
        documents[doc_id] = {
            'document': f'text_{n}',
            'vector': np.random.random((d)).astype('float32'),
        }

    vector_store = FaissVectorStore.from_documents(documents, dimension=50, nlists=100, nprobe=10)
    query_vector = np.random.random((1, d)).astype('float32')
    nearest_neighbors = vector_store.query(query_vector, k=5)
    print(nearest_neighbors)
+ print(nearest_neighbors)