Spaces:

sunil448832
/

retrieval-augment-generation

Runtime error

App Files Files Community

sunil448832 committed on Oct 25, 2023

Commit

eccde2c

1 Parent(s): d14c166

Initial Commit

Browse files

Files changed (24) hide show

app.py +32 -0
chat.py +76 -0
data/KnowledgeDocument(pan_card_services).txt +316 -0
data_processor/__init__.py +2 -0
data_processor/__pycache__/__init__.cpython-311.pyc +0 -0
data_processor/__pycache__/document_reader.cpython-311.pyc +0 -0
data_processor/__pycache__/text_splitter.cpython-311.pyc +0 -0
data_processor/document_reader.py +51 -0
data_processor/text_splitter.py +151 -0
ingest.py +53 -0
models/__init__.py +2 -0
models/__pycache__/__init__.cpython-311.pyc +0 -0
models/__pycache__/embedding_models.cpython-311.pyc +0 -0
models/__pycache__/llms.cpython-311.pyc +0 -0
models/embedding_models.py +35 -0
models/llms.py +55 -0
requirements.txt +8 -0
utils.py +52 -0
vector_db/documents.pkl +0 -0
vector_db/index.faiss +0 -0
vector_store/__init__.py +1 -0
vector_store/__pycache__/__init__.cpython-311.pyc +0 -0
vector_store/__pycache__/faiss_vector_store.cpython-311.pyc +0 -0
vector_store/faiss_vector_store.py +148 -0

app.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import gradio as gr
+from models import EmbeddingModel, LLM
+from utils import MistralPrompts
+from vector_store import FaissVectorStore
+from chat import ChatBot
+VECTOR_DATABASE_PATH = 'vector_db'
+# Initialize models and vector store
+embedding_model = EmbeddingModel(model_name='sentence-transformers/all-MiniLM-L6-v2')
+llm = LLM("mistralai/Mistral-7B-Instruct-v0.1")
+vector_store = FaissVectorStore.as_retriever(database_path=VECTOR_DATABASE_PATH)
+# Create a ChatBot instance
+chat_bot = ChatBot(llm, embedding_model, vector_store)
+# Function to handle the user's input and generate a response
+def chat_bot(input_text):
+    response = chat_bot.chat(input_text)
+    return response
+# Create a Gradio interface
+chatbot_interface = gr.Interface(
+    fn=chat_bot,
+    inputs=gr.inputs.Textbox(prompt="User:"),
+    outputs=gr.inputs.Textbox(prompt="Bot:"),
+    title="Chatbot Assitant for PAN card related query",
+    theme="compact"
+)
+# Launch the Gradio interface
+chatbot_interface.launch()

chat.py ADDED Viewed

	@@ -0,0 +1,76 @@

+from models import EmbeddingModel, LLM
+from utils import MistralPrompts
+from vector_store import FaissVectorStore
+import argparse
+import warnings
+warnings.filterwarnings("ignore")
+# Create a ChatBot class to manage interactions
+class ChatBot:
+    def __init__(self, llm, embedding_model, vector_store):
+        self.llm = llm
+        self.embedding_model = embedding_model
+        self.chat_history = []
+        self.vector_store = vector_store
+    def format_context(self, retrieved_documents):
+        context, sources = '', ''
+        # Format retrieved documents into context and sources
+        # This is simplest way to combine. there are other techniques as well to try out.
+        for doc in retrieved_documents:
+            context += doc.text + '\n\n'
+            sources += str(doc.metadata) + '\n'
+        return context, sources
+    def chat(self, question):
+        if len(self.chat_history):
+            # Create a prompt based on chat history
+            chat_history_prompt = MistralPrompts.create_history_prompt(self.chat_history)
+            standalone_question_prompt = MistralPrompts.create_standalone_question_prompt(question, chat_history_prompt)
+            standalone_question = self.llm.generate_response(standalone_question_prompt)
+        else:
+            chat_history_prompt = ''
+            standalone_question = question
+        # Encode the question using the embedding model
+        query_embedding = self.embedding_model.encode(standalone_question)
+        # Retrieve documents related to the question
+        retrieved_documents = self.vector_store.query(query_embedding, 3)
+        context, sources = self.format_context(retrieved_documents)
+        # Print information about retrieved documents
+        print("Retrieved documents info: \n", sources)
+        # Create a prompt and generate a response
+        prompt = MistralPrompts.create_question_prompt(question, context, chat_history_prompt)
+        response = self.llm.generate_response(prompt)
+        # Extract the response and update chat history
+        response = MistralPrompts.extract_response(response)
+        self.chat_history.append((question, response))
+        return response
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--vector_database_path", default='vector_db',help="Vector database which store embeddings vector")
+    args = parser.parse_args()
+    VECTOR_DATABASE_PATH = parser.vector_database_path
+    # Initialize models and vector store
+    embedding_model = EmbeddingModel(model_name='sentence-transformers/all-MiniLM-L6-v2')
+    llm = LLM("mistralai/Mistral-7B-Instruct-v0.1")
+    vector_store = FaissVectorStore.as_retriever(database_path=VECTOR_DATABASE_PATH)
+    # Create a ChatBot instance
+    chat_bot = ChatBot(llm, embedding_model, vector_store)
+    # Start the conversation
+    print("Assistant Bot: Hello, I'm the Assistant Bot! How may I assist you today?")
+    while True:
+        question = input("User:")
+        response = chat_bot.chat(question)
+        print("Assistant Bot:", response, '\n')

data/KnowledgeDocument(pan_card_services).txt ADDED Viewed

	@@ -0,0 +1,316 @@

+# About Pan Card
+### What is Pan card?
+The PAN card is a unique ten-character alphanumeric identification number that is issued by the Income Tax Department of India to track the tax-related transactions of individuals and entities. The PAN card is mandatory for any financial transaction in India, including opening a bank account, buying or selling property, and filing income tax returns.
+### Who needs a Pan card?
+All individuals/non-individuals (including foreign citizens/entities) earning taxable income in India must have a PAN card.
+### Types of PAN cards
+In India, two types of PAN cards are available: e-PAN card and physical PAN card.
+1. e-PAN card: An e-PAN card is a digitally-signed PAN card issued in electronic format. It contains the same PAN details as a physical PAN card but is available in a digital format. It can be downloaded online and used as a valid identification document for various purposes. The e-PAN card is usually issued in a PDF format.
+2. Physical PAN card: A physical PAN card is a laminated card with your PAN details printed on it. It is a physical document that can be carried and used as a valid identification proof. The physical PAN card is sent to the applicant's registered address by post.
+Both e-PAN and physical PAN cards have the same validity and can be used for identification purposes. The choice between the two depends on the applicant's preference and requirements.
+### **Why do NRIs need PAN card?**
+NRIs don’t need to have a PAN Card. However, a PAN Card is necessary for NRIs if they wish to do any of the following in India:
+1. A PAN card is required to carry out financial transactions such as opening a bank account, investing in stocks, purchasing or selling property, and investing in India.
+2. If an NRI earns an income in India, they must file income tax returns. A PAN card is necessary to file these returns.
+3. If an NRI wants to invest in mutual funds in India, they must have a PAN card.
+## Importance of PAN card for different NRI account options
+NRI Accounts comprise NRE, NRO and FCNR Accounts. The importance of a PAN Card for each of these three accounts is as follows:
+**NRE:** For funds earned outside India where both Principal and Interest earned are tax-free. Therefore, NRIs can opt for Form 60, which is a substitute for PAN for opening an NRE Account.
+**NRO:** For funds earned in India which are mostly liable to taxes. Income such as rent or pension where taxes are not deducted at source are deposited in an NRO Account. Therefore, a PAN Card is mandatory for NRO Accounts.
+**FCNR:** For foreign currency term deposits from outside India. Again, the principal and interest are tax-free, so usage of a PAN Card is not necessary and can be substituted with Form 60.
+---
+# PAN Card Application Process
+## New Pan Card
+### How can NRI apply for a new PAN card
+Here are the steps for *PAN CARD* processing.
+- Visit ABC app
+- Navigate to Services > NRI Pan Card > Apply New PAN
+- Select the required form of PAN card and proceed with the payment
+- Our team will get in touch with you to ask for the following documents:
+    - Passport(Any Country) / OCI Card
+    - Passport Size Photograph
+    - Overseas address proof with zip code (Supporting documents - Indian NRO/NRE Account statement or Overseas bank statement or Utility bill)
+### Documents required for a new PAN Card
+**If you have Aadhaar card**
+No other document is required. You can get your pan card through your Aadhaar card in 10 minutes.
+**If you don’t have an Aadhaar card**
+- Passport(Any Country) / OCI Card
+- Passport Size Photograph
+- Overseas address proof with zip code (Supporting documents - Indian NRO/NRE Account statement or Overseas bank statement or Utility bill)
+### Cost of new PAN card
+The PAN CARD Application through ABC costs Rs 2500 for E-PAN, and if you want it to be couriered, it will cost Rs 1200 extra for physical delivery to your address.
+- e-PAN Card cost: INR 2500
+- Physical PAN Card cost: INR 3700
+### Time required to issue PAN card
+**If you have Aadhaar card**
+You can get a Pan Card instantly **(in under 10 minutes)**, if you have an Aadhaar card. You can apply through ABC.
+**If you don’t have an Aadhaar card**
+Once the payment is made to ABC, we will contact you and initiate the process. Pan card will be issued in 3 weeks.
+## Updation/Correction in the PAN Card
+### Information that can be updated in the PAN Card
+- Your name
+- Father’s name
+- Date of Birth
+- Citizenship
+- Photograph
+- Signature
+- Gender
+- Address
+- Contact details
+### General process to update details on PAN Card
+To update the details on your PAN card, you have to generate the reissue request for the Updation/ Correction of the PAN CARD. Follow the steps:
+- Go to ABC app
+- Navigate to Services > NRI PAN Card > PAN Card Correction
+- Request a reissue of the required PAN card and make the payment
+- Our team will reach out to you for the required documents
+Do you want to start the process here instead? Click the button below.
+**Time required to complete the correction process for the PAN card:** The duration to complete the correction process for your PAN card can vary, but it generally takes around 2-3 weeks.
+### Documents required to update the details on PAN Card
+To update the information on the PAN card, kindly keep these documents ready.
+- Copy of Existing Pan card
+- Passport(Any Country) / OCI Card
+- Passport Size Photograph
+- Overseas address proof with zip code (Supporting documents - Indian NRO/NRE Account statement or Overseas bank statement or Utility bill)
+### Documents required to update the address on PAN Card
+- Passport
+- OCI Card
+- Bank Account Statement in the country of residence
+- NRE Bank Account Statement in India
+- Residential Permit
+### **Process to change the citizenship on PAN Card**
+No direct provisions exist to change citizenship in a Pan Card as the **PAN Card** doesn’t display your citizenship. It is a document required to file taxes, carry out investments and do transactions in India, whether you are a citizen, NRI, or OCI.
+To change the citizenship in a PAN card, you must meet and notify your jurisdictional Assessing Officer. For NRIs, it is not easy to meet the assessing officer. However, ABC can meet or notify the jurisdictional assessing officer on your behalf. Contact a tax expert at ABC to change your citizenship on PAN card.
+**Time required to update the citizenship status:** Generally, it takes around a month to complete the process, but the duration can vary based on factors such as workload and the authorities' responsiveness.
+## Reprinting lost PAN Card
+To reprint your PAN card, you need to follow a specific procedure that involves providing certain documents and information to authenticate your identity. The process can take around 2-3 weeks to complete. You can apply for a reprint through ABC. We will guide you through the process and help you obtain a new copy of your PAN card.
+### Documents required for reprinting the lost PAN card
+1. **If you remember your PAN number:**
+- Pan number
+- Passport(Any Country) / OCI Card
+- Passport Size Photograph
+- Overseas address proof with zip code (Supporting documents - Indian NRO/NRE Account statement or Overseas bank statement or Utility bill)
+2. **If you don’t remember your PAN number:**
+This gets tricky in most cases so our representative will reach out to you to inform you about the process.
+Do you remember your PAN card number?
+### **Charges for reprinting the PAN Card**
+The charges for reprinting the PAN Card are INR 2500 for e-pan, and INR 3700 for physical pan card.
+---
+## Linking PAN with Aadhaar card
+ABC can link your PAN card and Aadhaar card on your behalf.
+### Process to link PAN with Aadhaar
+- Go to ABC app
+- Navigate to Services > NRI PAN Card > Link PAN with Aadhaar
+- Request reissue the required PAN card and make the payment
+- Our team will reach out to you for the required documents
+Alternatively, you can also initiate the process on WhatsApp as well.
+### ABC fees to link PAN with Aadhaar
+The charges for linking PAN & Aadhaar are INR 2000/-, including the penalty charges to be paid for the delay in linking PAN & Aadhaar.
+### **Documents required** to link PAN with Aadhaar
+Kindly share a copy of your pan card and Aadhaar card. ABC will review the documents and share a payment link for the linking.
+New Deadline for linking Aadhaar with pan card 30 June 2023. PAN will become inoperative after June 2023 if not linked to Aadhaar.
+### Time required **for PAN Aadhaar link for NRI?**
+It takes up to 6 to 7 days for PAN Aadhaar linking for NRIs.
+---
+# Form 49AA
+### **What is Form 49aa?**
+Form 49AA is the application form for the allotment of Permanent Account Number for Foreign residents and entities incorporated outside India.
+### **Documents Required for Form 49AA**
+Here are the necessary documents that are supposed to be submitted along with PAN Card Form 49AA
+1. Passport
+2. PIO card issued by Government of India
+3. OCI card issued by Government of India
+4. Other national/citizenship Identification Number/Taxpayer Identification Number duly attested by “Apostille” or by the Indian Embassy/High Commission/Consulate in the country where the applicant is located or authorized officials of overseas branches of Scheduled Banks registered in India.
+5. Bank account statement in the country of residence
+6. NRE bank account statement in India
+7. Certificate of Residence in India or Residential permit issued by the State Police Authorities
+8. Registration certificate issued by the Foreigner’s Registration Office showing Indian address
+9. Visa granted and Copy of appointment letter/ contract from Indian Company & Certificate (in original) of Indian address issued by the employer
+---
+# FAQs about PAN Card
+**Is it mandatory to link Aadhaar with PAN for NRI?**
+No, Aadhaar and PAN linking is optional for NRIs. However, to avoid any legal complications in India, NRIs should either link their PAN to Aadhaar, or update their status as non-resident.
+**Is PAN card different for NRI?**
+No, PAN works completely the same way for both NRI and Resident Indian except for one factor – the type of Application Form alone changes with respect to your current Residential Status. A PAN is mandatory when filing an Income-Tax return, TDS or any transaction that attracts tax.
+**Can I apply for pan card from USA?**
+Yes. You can apply for a PAN Card from the USA. The easiest and most convenient way to apply for a PAN card from the USA is through ABC.
+**Is a PAN card mandatory for an NRI bank account?**
+No, in the absence of the Pan Card, **NRIs can sign Form 60** [Form 60 is a declaration to be filed by an individual or a person (not being a company or firm) who does not have a Permanent Account Number (PAN) and who is involved in any transaction] to open an NRI Account.
+**Can OCI holders get PAN card?**
+Yes, NRIs who hold foreign citizenship, such as OCI holders or people of Indian origin who possess foreign citizenship or foreigners who are not of Indian origin, can get PAN card through Form 49AA.
+**Can an NRI buy property in India without PAN card?**
+No, A [PAN card](https://ABC.com/blog/nri-income-tax/uses-of-a-pan-card-for-nris) is mandatory to buy property in India. NRIs need a PAN card because they will be required to file income tax returns if they have rented out the property. Besides, if the property is sold later, the capital gains resulting from the sales would be subject to capital gains tax.
+**What is the difference between an NRI PAN card and normal PAN card?**
+There is no difference between an NRI PAN card and normal PAN card. An [NRI PAN card is the same as a PAN card issued to individuals living in India](https://ABC.com/blog/nri-bank-accounts/what-is-the-difference-between-an-nri-pan-card-and-a-normal-pan-card#:~:text=There%20is%20only%20one%20Pan,to%20Indian%20Residents%20and%20NRIs.). However, to apply for a PAN card for OCI or people of Indian origin who hold foreign citizenship, an applicant is required to fill Form 49AA.
+**Is a PAN card mandatory for NRI?**
+PAN is not compulsory for all NRIs. A PAN card is mandatory for NRIs with a source of income in India to file their taxes or if they want to invest in stocks or mutual funds in India.
+**What is the difference between PAN card and Form 60?**
+The **basic difference** between a PAN Card and [Form 60](https://www.incometaxindia.gov.in/forms/income-tax%20rules/103120000000007944.pdf) is that you can only sign and use the Form 60 to open a bank account but in order to file taxes and carry out investments from that account, a Pan Card is mandatory.
+**Can I get a new PAN card as a Canadian citizen?**
+No, it is illegal to possess multiple PAN cards. As a Canadian citizen, you cannot obtain a new PAN card. Instead, you should update your citizenship status in the existing PAN card.
+**Can I perform KYC for my father's mutual funds with an NRI PAN card?**
+You can use your NRI PAN card to perform KYC (Know Your Customer) for your father's mutual funds. However, ensuring that your PAN card reflects your updated citizenship status is crucial. You need to complete updating your citizenship in the PAN database before using it for any financial transactions or KYC requirements.
+**Can the new PAN card be dispatched to my Canadian address?**
+Yes, the PAN card can be delivered to your overseas address, including your Canadian address. While applying for a new PAN card or requesting corrections, you can provide your Canadian address as the delivery address. Make sure to provide accurate and complete address details to ensure successful delivery.
+**What documents are required for updating citizenship on the PAN card?**
+To update your citizenship status, you must provide documents such as your foreign passport, revoked Indian passport (if applicable), and a citizenship renunciation letter (if you have renounced Indian citizenship).
+**Can an overseas driving license be considered as a valid address proof for the PAN card?**
+No, an overseas driving license is generally not considered a valid address proof for the PAN card. Instead, you can provide alternative documents such as bank statements or credit card statements that contain your overseas address as proof of address while applying for corrections or updating your PAN card.
+**How can I make the payment for the PAN card correction process?**
+During the PAN card correction process, you will be either provided with a payment link or taken to Razorpay page. Razorpay is a secure online platform where you can make the payment conveniently using various payment options such as credit/debit cards, net banking, or digital wallets. Make sure to follow the instructions provided and ensure the payment is made within the specified timeframe.
+**Can I link aadhaar card and PAN card even if there is minor difference in my name in both?**
+It is important for the date of birth (DOB) to be the same on both the PAN and Aadhaar documents. However, minor differences in the name should not pose an issue.
+**Can NRIs make the payment for the process using an Indian bank account?**
+Yes, NRIs can make the payment through their Indian bank accounts. However, it is worth noting that paying in Indian Rupees (INR) may be costlier due to the application of 18% GST.
+**Is it possible to make the payment for the process using a cheque?**
+No, ABC only accepts online payments for the linking process and does not accept cheques.
+**What payment options are available for NRIs?**
+NRIs can make the payment using their debit or credit cards or through their international cards.
+**Can NRIs residing in the USA link their PAN and Aadhaar cards without visiting India?**
+Yes, NRIs residing in the USA can link their PAN and Aadhaar cards without the need to visit India. ABC can assist them digitally.
+**How can NRIs share their PAN and Aadhaar card details with ABC for the linking process?**
+NRIs can share their PAN and Aadhaar card details by providing clear images of the documents to ABC. Blurry or unclear images may require re-submission for verification.
+**Can NRIs use an international card for making the payment?**
+Yes, NRIs can use their international debit or credit cards to make the payment for the linking process.
+**Is it necessary to download the ABC app or visit their website for the payment process?**
+No, it is not necessary to download the ABC app or visit their website for the payment process. The payment link provided by ABC can be accessed directly to make the payment.
+**Can I apply for pan card without Aadhaar?**
+Yes, NRIs can apply for a PAN card without an Aadhaar Card. They can simply do so by filling out either of the forms – Form 49A (for citizens of India) or Form 49AA (for foreign citizens).
+**Can I apply for a PAN card if I am a non-resident Indian (NRI)?**
+Yes, as an NRI, you can apply for a PAN card. The process for applying for a PAN card is the same for both residents and NRIs. However, if you are an OCI holder or a person of Indian origin who holds foreign citizenship, you will need to fill Form 49AA to apply for a PAN card.
+**Can I take the delivery of Pan card at Indian address?**
+Yes, you can take the delivery of your PAN card only at an Indian address mentioned in your Aadhaar card. While applying for a new PAN card or requesting corrections, you can provide your Indian address as the delivery address. Make sure to provide accurate and complete address details to ensure successful delivery.

data_processor/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .document_reader import DocumentReader
2	+ from .text_splitter import SentenceSplitter

data_processor/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (324 Bytes). View file

data_processor/__pycache__/document_reader.cpython-311.pyc ADDED Viewed

Binary file (3.37 kB). View file

data_processor/__pycache__/text_splitter.cpython-311.pyc ADDED Viewed

Binary file (9.12 kB). View file

data_processor/document_reader.py ADDED Viewed

	@@ -0,0 +1,51 @@

+from pathlib import Path
+import pypdf
+import docx2txt
+class DocumentReader:
+    @staticmethod
+    def read_pdf(data_path):
+        with open(data_path, "rb") as fp:
+            pdf = pypdf.PdfReader(fp)  # Open the PDF file
+            num_pages = len(pdf.pages)  # Get the number of pages in the PDF
+            docs = []
+            for page in range(num_pages):
+                page_text = pdf.pages[page].extract_text()  # Extract text from the page
+                page_label = pdf.page_labels[page]  # Get page label (e.g., page number)
+                metadata = {"page_label": page_label, "file_name": data_path.name}
+                docs.append({"text": page_text, "metadata": metadata})
+            return docs
+    @staticmethod
+    def read_docx(data_path):
+        metadata = {"file_name": data_path.name}
+        doc = docx2txt.process(data_path)  # Extract text from the DOCX file
+        docs = [{'text': doc, 'metadata': metadata}]
+        return docs
+    @staticmethod
+    def read_txt(data_path):
+        print(data_path.name)
+        with open(data_path, "r") as fp:
+            text = fp.read()  # Read text from the TXT file
+            metadata = {"file_name": data_path.name}
+            docs = [{'text': text, 'metadata': metadata}]
+        return docs
+    @staticmethod
+    def read_document(file_path):
+        data_path = Path(file_path)
+        if data_path.suffix == ".pdf":
+            return DocumentReader.read_pdf(data_path)  # Read PDF document
+        elif data_path.suffix == ".docx":
+            return DocumentReader.read_docx(data_path)  # Read DOCX document
+        elif data_path.suffix == ".txt":
+            return DocumentReader.read_txt(data_path)  # Read TXT document
+        else:
+            raise ValueError("Unsupported file format")
+if __name__=='__main__':
+    # Example usage:
+    DATA_PATH = '71763-gale-encyclopedia-of-medicine.-vol.-1.-2nd-ed.pdf'
+    documents = DocumentReader.read_document(DATA_PATH)  # Read the specified document
+    print(documents)  # Print the extracted text and metadata

data_processor/text_splitter.py ADDED Viewed

	@@ -0,0 +1,151 @@

+from dataclasses import dataclass
+import re
+# One piece of text produced while breaking a document down to chunk size
+@dataclass
+class Split:
+    text: str  # the text of this piece
+    is_sentence: bool  # True if the piece fit whole or came from a sentence-level splitter
+# A chunk of text together with its identifier and metadata
+@dataclass
+class Document:
+    doc_id: str
+    text: str
+    metadata: dict
+# Class for splitting text into sentences
+class SentenceSplitter:
+    def __init__(self, chunk_size=100, chunk_overlap=50):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        # List of functions for splitting text
+        self._split_fn_sentence = [self._split_by_sep('\n\n'), self._split_by_regex("[^,.;。？！]+[,.;。？！]?")]
+        self._split_fn_subsentence = [self._split_by_sep(' ')]
+    def _split_by_sep(self, sep):
+        # Split text by separator and maintain the separator
+        def fun(text):
+            parts = text.split(sep)
+            result = [sep + s if i > 0 else s for i, s in enumerate(parts)]
+            return [s for s in result if s]
+        return lambda text: fun(text)
+    def _split_by_regex(self, regex):
+        # Split text using a regular expression
+        return lambda text: re.findall(regex, text)
+    def _splits_by_fns(self, text):
+        for split_fn in self._split_fn_sentence:
+            splits = split_fn(text)
+            if len(splits) > 1:
+                return splits, True
+        for split_fn in self._split_fn_subsentence:
+            splits = split_fn(text)
+            if len(splits) > 1:
+                break
+        return splits, False
+    def _token_size(self, text):
+        # Calculate the token size of text
+        return len(text.split(' '))
+    def _split(self, text, chunk_size):
+        # Break text into splits that are smaller than chunk size
+        if self._token_size(text) <= chunk_size:
+            return [Split(text, is_sentence=True)]
+        text_splits = []
+        text_splits_by_fns, is_sentence = self._splits_by_fns(text)
+        for text_split_by_fns in text_splits_by_fns:
+            if self._token_size(text_split_by_fns) <= chunk_size:
+                text_splits.append(Split(text_split_by_fns, is_sentence=is_sentence))
+            else:
+                recursive_text_splits = self._split(text_split_by_fns, chunk_size=chunk_size)
+                text_splits.extend(recursive_text_splits)
+        return text_splits
+    def _merge(self, splits, chunk_size):
+        # Merge splits into chunks
+        chunks, cur_chunk, last_chunk = [], [], []
+        cur_chunk_len = 0
+        new_chunk = True
+        def close_chunk():
+            nonlocal chunks, cur_chunk, last_chunk, cur_chunk_len, new_chunk
+            chunks.append("".join([text for text, length in cur_chunk]))
+            last_chunk = cur_chunk
+            cur_chunk = []
+            cur_chunk_len = 0
+            new_chunk = True
+            # Add overlap to the new chunk from previous chunks
+            if len(last_chunk) > 0:
+                last_index = len(last_chunk) - 1
+                while (
+                    last_index >= 0
+                    and cur_chunk_len + last_chunk[last_index][1] <= self.chunk_overlap
+                ):
+                    text, length = last_chunk[last_index]
+                    cur_chunk_len += length
+                    cur_chunk.insert(0, (text, length))
+                    last_index -= 1
+        while len(splits) > 0:
+            cur_split = splits[0]
+            cur_split_len = self._token_size(cur_split.text)
+            # Close the chunk if it exceeds chunk_size
+            if cur_chunk_len + cur_split_len > chunk_size and not new_chunk:
+                close_chunk()
+            else:
+                if (
+                    cur_split.is_sentence
+                    or cur_chunk_len + cur_split_len <= chunk_size
+                    or new_chunk  # new chunk, always add at least one split
+                ):
+                    # Add split to chunk
+                    cur_chunk_len += cur_split_len
+                    cur_chunk.append((cur_split.text, cur_split_len))
+                    splits.pop(0)
+                    new_chunk = False
+                else:
+                    # Close out the chunk
+                    close_chunk()
+        # Handle the last chunk
+        if not new_chunk:
+            chunk = "".join([text for text, length in cur_chunk])
+            chunks.append(chunk)
+        # Run post-processing to remove blank spaces
+        new_chunks = [chunk.strip() for chunk in chunks if chunk.strip() != ""]
+        return new_chunks
+    def split_texts(self, documents):
+        chunked_documents = []
+        for page_no, document in enumerate(documents):
+            text, metadata = document['text'], document['metadata']
+            if text == "":
+                continue
+            splits = self._split(text, self.chunk_size)
+            chunks = self._merge(splits, self.chunk_size)
+            for chunk_no, chunk in enumerate(chunks):
+                chunk_id = f"{metadata['file_name']}__{page_no}__{chunk_no}"
+                chunk_metadata = {'file_name': metadata['file_name'], 'page_no': page_no, 'chunk_no': chunk_no}
+                data = Document(chunk_id, chunk, chunk_metadata)
+                chunked_documents.append(data)
+        return chunked_documents
+if __name__ == '__main__':
+    document = {
+        "text": "This is example texts",
+        "metadata": {"file_name": "example.pdf", "page_no": 1}
+    }
+    documents = [document] * 10
+    splitter = SentenceSplitter(chunk_size=100, chunk_overlap=30)
+    splitted_documents = splitter.split_texts(documents)
+    print(splitted_documents[0])

ingest.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from data_processor import DocumentReader, SentenceSplitter
+from models import EmbeddingModel
+from vector_store import FaissVectorStore
+from tqdm import tqdm
+import argparse
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_path", default='data/KnowledgeDocument(pan_card_services).txt',help="Input file name")
+    parser.add_argument("--vector_database_path", default='vector_db',help="Vector database which store embeddings vector")
+    args = parser.parse_args()
+    # Define the paths to the data and vector database
+    DATA_PATH = args.data_path
+    VECTOR_DATABASE_PATH = args.vector_database_path
+    # Read the document from the specified path
+    documents = DocumentReader.read_document(DATA_PATH)
+    # Split the document into sentences with specified chunk parameters
+    splitter = SentenceSplitter(chunk_size=60, chunk_overlap=20)
+    splitted_documents = splitter.split_texts(documents)
+    # Initialize the embedding model
+    embedding_model = EmbeddingModel(model_name='sentence-transformers/all-MiniLM-L6-v2')
+    # Create a dictionary to store documents and their corresponding vectors
+    database_documents = {}
+    batch_size = 16
+    print("Generating embedding vectors....")
+    # Process the documents in batches
+    for i in tqdm(range(0, len(splitted_documents), batch_size)):
+        batch = splitted_documents[i:i + batch_size]
+        texts = []
+        # Extract the text from each document in the batch
+        for b in batch:
+            texts.append(b.text)
+        # Generate embeddings for the batch of texts using the embedding model
+        embeddings = embedding_model.encode(texts)
+        # Associate each document with its corresponding vector and store in the dictionary
+        for i, b in enumerate(batch):
+            data = {'document': b, 'vector': embeddings[i]}
+            database_documents[b.doc_id] = data
+    print("Total embeddings: ",len(database_documents))
+    # Create a Faiss vector store from the processed documents and vectors
+    vector_store = FaissVectorStore.from_documents(database_documents, dimension=embedding_model.embedding_dim, nlists=100, nprobe=10)
+    # Write the vector store to the specified path
+    vector_store.write(VECTOR_DATABASE_PATH)
+    print(f"Successfully written embedding vectors to {VECTOR_DATABASE_PATH} .")

models/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .embedding_models import EmbeddingModel
2	+ from .llms import LLM

models/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (295 Bytes). View file

models/__pycache__/embedding_models.cpython-311.pyc ADDED Viewed

Binary file (3.02 kB). View file

models/__pycache__/llms.cpython-311.pyc ADDED Viewed

Binary file (2.52 kB). View file

models/embedding_models.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import torch
+from transformers import AutoTokenizer, AutoModel
+import torch.nn.functional as F
+# Create a class for embedding sentences using Hugging Face Transformers
+class EmbeddingModel:
+    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2'):
+        # Initialize the model with the given model_name
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModel.from_pretrained(model_name)
+        # Get the embedding dimension from the model's output
+        self.embedding_dim = self.encode('Hi').shape[1]
+    def _mean_pooling(self, model_output, attention_mask):
+        # Calculate mean pooling of token embeddings
+        token_embeddings = model_output[0]
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        embedding = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+        return embedding
+    def encode(self, text):
+        # Encode a text into sentence embeddings
+        inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+        sentence_embeddings = self._mean_pooling(outputs, inputs['attention_mask'])
+        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1).numpy().astype('float32')
+        return sentence_embeddings
+if __name__ == '__main__':
+    # Sentences we want sentence embeddings for
+    sentences = ['This is an example sentence', 'Each sentence is converted']
+    # Print the embedding dimension of the model
+    print(EmbeddingModel().embedding_dim)

models/llms.py ADDED Viewed

	@@ -0,0 +1,55 @@

+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+# Define a Language Model class
+class LLM:
+    def __init__(self, model_name):
+        # Determine the device to use (GPU if available, otherwise CPU)
+        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+        # Load the pre-trained language model with specific settings
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16,  # Set the data type to float16
+            load_in_8bit=True,         # Load in 8-bit format if available
+            device_map='auto'          # Automatically select the device
+        ).bfloat16()  # Convert the model to bfloat16 for lower precision
+        # Initialize the tokenizer for the same model
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        # Set custom padding token and padding side
+        self.tokenizer.pad_token = "[PAD]"
+        self.tokenizer.padding_side = "left"
+    def generate_response(self, messages, max_tokens=100, do_sample=True):
+        # Tokenize the input messages and move them to the selected device (GPU or CPU)
+        input_ids = self.tokenizer(
+            messages,
+            max_length=512,
+            padding=True,
+            truncation=True,
+            return_tensors='pt'
+        ).input_ids.cuda()
+        with torch.no_grad():
+            # Generate a response using the loaded model
+            generated_ids = self.model.generate(
+                input_ids,
+                pad_token_id=self.tokenizer.pad_token_id,
+                max_new_tokens=max_tokens,
+                do_sample=do_sample,
+                temperature=0.3  # Adjust the sampling temperature
+            )
+            # Decode the generated tokens into a human-readable response
+            response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=False)[0]
+        return response
+# Main program
+if __name__ == '__main__':
+    # Specify the model name to use
+    model_name = "mistralai/Mistral-7B-Instruct-v0.1"
+    # Create an instance of the Language Model class with the specified model
+    llm = LLM(model_name)

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+pypdf
+docx2txt
+faiss-cpu
+git+https://github.com/huggingface/optimum.git
+git+https://github.com/huggingface/transformers.git
+accelerate
+bitsandbytes
+gradio

utils.py ADDED Viewed

	@@ -0,0 +1,52 @@

+class MistralPrompts:
+    # Create a standalone question prompt by using chat history and followup question.
+    @staticmethod
+    def create_standalone_question_prompt(question, chat_history_prompt):
+        message = f'''
+                [INST]
+                Taking chat history as context, rephrase follow up question into a standalone question.
+                "Follow up question: {question}
+                [/INST]
+              '''
+        prompt = chat_history_prompt + message
+        return prompt
+    # Create a chat history prompt by combining user and bot messages.
+    @staticmethod
+    def create_history_prompt(chat_history):
+        user_message, bot_message = chat_history[0]
+        chat_history_text = f"<s>[INST] {user_message} [/INST] {bot_message}</s>"
+        chat_history_text += "".join(f"[INST] {user_message} [/INST] {bot_message}</s>" for user_message, bot_message in chat_history[1:])
+        return chat_history_text
+    # Create a question prompt by adding context and question to a chat history prompt.
+    @staticmethod
+    def create_question_prompt(question, context, chat_history_prompt):
+        message = '''
+              [INST]
+              {instructions}
+              Context: {context}
+              Question: {question}
+              [/INST]
+              '''
+        if chat_history_prompt == '':
+            # If no chat history, provide instructions.
+            instructions = '''
+                          Use the following pieces of information to answer the user's question.
+                          If you don't know the answer, just say that you don't know,
+                          don't try to make up an answer.
+                          '''
+            message = message.format(instructions=instructions, context=context, question=question)
+            prompt = message
+        else:
+            # If there's a chat history, add context and question to it.
+            message = message.format(instructions='', context=context, question=question)
+            prompt = chat_history_prompt + message
+        return prompt
+    # Extract the response from a prompt.
+    @staticmethod
+    def extract_response(response):
+        response = response.split('[/INST]')[-1].split('</s>')[0].strip()
+        return response

vector_db/documents.pkl ADDED Viewed

Binary file (140 kB). View file

vector_db/index.faiss ADDED Viewed

Binary file (109 kB). View file

vector_store/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .faiss_vector_store import FaissVectorStore

vector_store/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (257 Bytes). View file

vector_store/__pycache__/faiss_vector_store.cpython-311.pyc ADDED Viewed

Binary file (9.93 kB). View file

vector_store/faiss_vector_store.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import faiss
+import numpy as np
+import os
+import pickle
+from tqdm import tqdm
+# Create a class for a flat index
+class IndexFlat:
+    def __init__(self, dimension):
+        # Initialize a Faiss flat index with L2 distance
+        self.index = faiss.IndexFlatL2(dimension)
+    def add(self, vectors):
+        # Add vectors to the index
+        self.index.add(np.array(vectors))
+    def delete(self, ids):
+        # Remove vectors from the index by their IDs
+        self.index.remove_ids(np.array(ids))
+    def search(self, vectors, k):
+        # Search for the k-nearest neighbors of the given vectors
+        return self.index.search(np.array(vectors), k)
+# Create a class for an IVF (Inverted File) index
+class IndexIVF:
+    def __init__(self, dimension, nlists=100, nprobe=10):
+        # Initialize a Faiss flat index and an IVF index with inner product metric
+        self.index_flat = faiss.IndexFlatL2(dimension)
+        self.index = faiss.IndexIVFFlat(self.index_flat, dimension, nlists, faiss.METRIC_INNER_PRODUCT)
+        self.index.nprobe = nprobe
+    def add(self, vectors):
+        # Train and add vectors to the index
+        self.index.train(np.array(vectors))
+        self.index.add(np.array(vectors))
+    def delete(self, ids):
+        # Remove vectors from the index by their IDs
+        self.index.remove_ids(np.array(ids))
+    def search(self, vectors, k):
+        # Search for the k-nearest neighbors of the given vectors
+        return self.index.search(np.array(vectors), k)
+# Create a class for managing Faiss vector storage
+class FaissVectorStore:
+    def __init__(self, dimension=324, nlists=100, nprobe=10):
+        self.dimension = dimension
+        self.nlists = nlists
+        self.nprobe = nprobe
+        self.index = None
+        self.documents_db = {}
+    def add(self, documents):
+        ids = range(0, len(self.documents_db) + len(documents))
+        db_vectors, db_documents, db_docs_ids = [], [], []
+        # Collect existing document vectors and documents
+        for doc_id in self.documents_db:
+            db_vectors.append(self.documents_db[doc_id]['vector'])
+            db_documents.append(self.documents_db[doc_id]['document'])
+            db_docs_ids.append(doc_id)
+        # Add new document vectors and documents
+        for doc_id in documents:
+            db_vectors.append(documents[doc_id]['vector'])
+            db_documents.append(documents[doc_id]['document'])
+            db_docs_ids.append(doc_id)
+        if len(db_vectors) < 10000:
+            self.index = IndexFlat(self.dimension)
+        else:
+            self.index = IndexIVF(self.dimension, self.nlists, self.nprobe)
+        self.index.add(np.array(db_vectors))
+        self.documents_db = {}
+        for i, doc_id in enumerate(db_docs_ids):
+            self.documents_db[doc_id] = {'vector': db_vectors[i], 'document': db_documents[i], 'index_id': i}
+    def delete(self, documents_ids):
+        # Delete vectors from the index by document IDs
+        index_ids_to_delete = []
+        for doc_id in documents_ids:
+            if doc_id in self.documents_db:
+                index_ids_to_delete.append(self.documents_db[doc_id]['index_id'])
+        self.index.delete(index_ids_to_delete)
+        self.documents_db = {k: v for k, v in self.documents_db.items() if k not in documents_ids}
+    def query(self, query_vector, k):
+        # Query for the top k nearest neighbors to the query_vector
+        _, I = self.index.search(query_vector, k)
+        documents = []
+        for doc_id in self.documents_db:
+            if self.documents_db[doc_id]['index_id'] in I[0]:
+                documents.append(self.documents_db[doc_id]['document'])
+        return documents
+    def write(self,database_path):
+        # Save the index and documents to files
+        if not os.path.exists(database_path):
+            os.makedirs(database_path)
+        faiss_path = os.path.join(database_path, 'index.faiss')
+        document_path = os.path.join(database_path, 'documents.pkl')
+        faiss.write_index(self.index.index, faiss_path)
+        with open(document_path, 'wb') as f:
+            pickle.dump(self.documents_db, f)
+    def read(self,database_path):
+        # Read the index and documents from files
+        faiss_path = os.path.join(database_path, 'index.faiss')
+        document_path = os.path.join(database_path, 'documents.pkl')
+        self.index = faiss.read_index(faiss_path)
+        with open(document_path, 'rb') as f:
+            self.documents_db = pickle.load(f)
+    @classmethod
+    def from_documents(cls, documents, dimension, nlists, nprobe):
+        vector_store = cls(dimension, nlists, nprobe)
+        vector_store.add(documents)
+        return vector_store
+    @classmethod
+    def as_retriever(cls, database_path):
+        vector_store = cls()
+        vector_store.read(database_path)
+        return vector_store
+if __name__ == '__main__':
+    nb = 20000
+    d = 50
+    database_path = 'db_path'
+    if not os.path.exists(database_path):
+        os.makedirs(database_path)
+    documents = {}
+    for i in range(nb):
+        id = f'id_{i}'
+        texts = f'text_{i}'
+        vectors = np.random.random((d)).astype('float32')
+        documents[id] = {'document': texts, 'vector': vectors}
+    vector_store = FaissVectorStore.from_documents(documents, dimension=50, nlists=100, nprobe=10)
+    query_vector = np.random.random((1, d)).astype('float32')
+    nearest_neighbors = vector_store.query(query_vector, k=5)
+    print(nearest_neighbors)