File size: 2,371 Bytes
eccde2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from data_processor import DocumentReader, SentenceSplitter
from models import EmbeddingModel
from vector_store import FaissVectorStore
from tqdm import tqdm
import argparse

if __name__ == '__main__':
    # Script entry point: read a document, split it into chunks, embed the
    # chunks in batches, and persist the resulting vector store to disk.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_path",
        default='data/KnowledgeDocument(pan_card_services).txt',
        help="Input file name",
    )
    parser.add_argument(
        "--vector_database_path",
        default='vector_db',
        help="Vector database which store embeddings vector",
    )
    args = parser.parse_args()

    # Paths to the input data and to the output vector database
    DATA_PATH = args.data_path
    VECTOR_DATABASE_PATH = args.vector_database_path

    # Read the document from the specified path
    documents = DocumentReader.read_document(DATA_PATH)

    # Split the document into chunks using the configured size and overlap
    splitter = SentenceSplitter(chunk_size=60, chunk_overlap=20)
    splitted_documents = splitter.split_texts(documents)

    # Initialize the embedding model
    embedding_model = EmbeddingModel(model_name='sentence-transformers/all-MiniLM-L6-v2')

    # Maps doc_id -> {'document': <chunk>, 'vector': <its embedding>}.
    # Chunks sharing a doc_id overwrite earlier entries, so the total printed
    # below counts unique doc_ids rather than chunks processed.
    database_documents = {}
    batch_size = 16
    print("Generating embedding vectors....")
    # Walk the chunks in fixed-size batches; `start` is the batch offset.
    for start in tqdm(range(0, len(splitted_documents), batch_size)):
        batch = splitted_documents[start:start + batch_size]

        # Encode the text of every chunk in the batch with a single model call
        texts = [doc.text for doc in batch]
        embeddings = embedding_model.encode(texts)

        # Pair each chunk with the embedding at the same position in the batch.
        # A separate index name is used so the outer batch offset is not
        # clobbered by the inner loop.
        for position, doc in enumerate(batch):
            database_documents[doc.doc_id] = {
                'document': doc,
                'vector': embeddings[position],
            }
    print("Total embeddings: ",len(database_documents))

    # Create a Faiss vector store from the processed documents and vectors
    vector_store = FaissVectorStore.from_documents(
        database_documents,
        dimension=embedding_model.embedding_dim,
        nlists=100,
        nprobe=10,
    )

    # Write the vector store to the specified path
    vector_store.write(VECTOR_DATABASE_PATH)
    print(f"Successfully written embedding vectors to {VECTOR_DATABASE_PATH} .")