# Spaces:
# Runtime error
# Runtime error
import argparse
from typing import Any, Dict, Iterator, Sequence

from tqdm import tqdm

from data_processor import DocumentReader, SentenceSplitter
from models import EmbeddingModel
from vector_store import FaissVectorStore
# Settings that were previously inlined as literals in the script body.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
CHUNK_SIZE = 60
CHUNK_OVERLAP = 20
BATCH_SIZE = 16
# Forwarded unchanged to FaissVectorStore.from_documents.
NLISTS = 100
NPROBE = 10


def iter_batches(items: Sequence[Any], batch_size: int) -> Iterator[Sequence[Any]]:
    """Yield consecutive slices of *items* holding at most *batch_size* elements.

    Args:
        items: Sequence supporting ``len()`` and slicing.
        batch_size: Maximum number of elements per slice; must be positive.

    Yields:
        ``items[start:start + batch_size]`` for ``start`` in steps of
        *batch_size*; the last slice may be shorter.

    Raises:
        ValueError: If *batch_size* is not positive (raised when iteration
            starts, because this is a generator).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]


def embed_documents(chunks: Sequence[Any], embedding_model: Any,
                    batch_size: int = BATCH_SIZE) -> Dict[Any, Dict[str, Any]]:
    """Encode *chunks* batch by batch and index the results by ``doc_id``.

    Args:
        chunks: Objects exposing ``text`` and ``doc_id`` attributes.
        embedding_model: Object whose ``encode(list_of_texts)`` returns a
            result indexable by position within the batch.
        batch_size: Number of chunks passed to each ``encode`` call.

    Returns:
        A dict mapping each chunk's ``doc_id`` to
        ``{'document': chunk, 'vector': embedding}``. A later chunk with the
        same ``doc_id`` overwrites an earlier one.
    """
    database_documents: Dict[Any, Dict[str, Any]] = {}
    # Materialise the batches so tqdm can read the total from len().
    batches = list(iter_batches(chunks, batch_size))
    for batch in tqdm(batches):
        texts = [chunk.text for chunk in batch]
        embeddings = embedding_model.encode(texts)
        # Use a dedicated index name; the original reused the outer loop's
        # variable here. Indexing (rather than zip) keeps a too-short
        # `embeddings` result failing loudly.
        for row, chunk in enumerate(batch):
            database_documents[chunk.doc_id] = {'document': chunk, 'vector': embeddings[row]}
    return database_documents


def main() -> None:
    """Build the vector database from a text document.

    Reads the document given by ``--data_path``, splits it with
    ``SentenceSplitter``, embeds every chunk, builds a ``FaissVectorStore``
    from the result and writes it to ``--vector_database_path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", default='data/KnowledgeDocument(pan_card_services).txt', help="Input file name")
    parser.add_argument("--vector_database_path", default='vector_db', help="Vector database which store embeddings vector")
    args = parser.parse_args()

    data_path = args.data_path
    vector_database_path = args.vector_database_path

    # Read the document and split it into chunks.
    documents = DocumentReader.read_document(data_path)
    splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    splitted_documents = splitter.split_texts(documents)

    embedding_model = EmbeddingModel(model_name=EMBEDDING_MODEL_NAME)

    print("Generating embedding vectors....")
    database_documents = embed_documents(splitted_documents, embedding_model, BATCH_SIZE)
    print("Total embeddings: ",len(database_documents))

    # Build the store from the doc_id -> {document, vector} mapping and persist it.
    vector_store = FaissVectorStore.from_documents(
        database_documents,
        dimension=embedding_model.embedding_dim,
        nlists=NLISTS,
        nprobe=NPROBE,
    )
    vector_store.write(vector_database_path)
    print(f"Successfully written embedding vectors to {vector_database_path} .")


if __name__ == '__main__':
    main()