sunil448832's picture
Initial Commit
eccde2c
from data_processor import DocumentReader, SentenceSplitter
from models import EmbeddingModel
from vector_store import FaissVectorStore
from tqdm import tqdm
import argparse
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--data_path", default='data/KnowledgeDocument(pan_card_services).txt',help="Input file name")
parser.add_argument("--vector_database_path", default='vector_db',help="Vector database which store embeddings vector")
args = parser.parse_args()
# Define the paths to the data and vector database
DATA_PATH = args.data_path
VECTOR_DATABASE_PATH = args.vector_database_path
# Read the document from the specified path
documents = DocumentReader.read_document(DATA_PATH)
# Split the document into sentences with specified chunk parameters
splitter = SentenceSplitter(chunk_size=60, chunk_overlap=20)
splitted_documents = splitter.split_texts(documents)
# Initialize the embedding model
embedding_model = EmbeddingModel(model_name='sentence-transformers/all-MiniLM-L6-v2')
# Create a dictionary to store documents and their corresponding vectors
database_documents = {}
batch_size = 16
print("Generating embedding vectors....")
# Process the documents in batches
for i in tqdm(range(0, len(splitted_documents), batch_size)):
batch = splitted_documents[i:i + batch_size]
texts = []
# Extract the text from each document in the batch
for b in batch:
texts.append(b.text)
# Generate embeddings for the batch of texts using the embedding model
embeddings = embedding_model.encode(texts)
# Associate each document with its corresponding vector and store in the dictionary
for i, b in enumerate(batch):
data = {'document': b, 'vector': embeddings[i]}
database_documents[b.doc_id] = data
print("Total embeddings: ",len(database_documents))
# Create a Faiss vector store from the processed documents and vectors
vector_store = FaissVectorStore.from_documents(database_documents, dimension=embedding_model.embedding_dim, nlists=100, nprobe=10)
# Write the vector store to the specified path
vector_store.write(VECTOR_DATABASE_PATH)
print(f"Successfully written embedding vectors to {VECTOR_DATABASE_PATH} .")