"""Build a FAISS vector index from the ``.txt`` files in ``data``.

Pipeline: load the documents, split them into overlapping chunks, embed
each chunk with the InLegalBERT sentence-embedding model, and save the
resulting FAISS store to the local ``ipc_embed_db`` directory.
"""

import functools
import logging

import ray
from faiss import IndexFlatL2  # noqa: F401  (kept: the index is no longer built by hand)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

EMBEDDING_MODEL_NAME = "law-ai/InLegalBERT"


@functools.lru_cache(maxsize=1)
def _get_embeddings_model():
    """Return the shared embeddings model, constructing it on first use.

    Constructing ``HuggingFaceEmbeddings`` is expensive, so it must happen
    once per process rather than once per embedded text.
    """
    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


def embedding_function(text):
    """Embed a single piece of text.

    Args:
        text: The string to embed.

    Returns:
        The embedding vector produced by the model's ``embed_query``.
    """
    return _get_embeddings_model().embed_query(text)


# Initialize Ray.
# NOTE(review): no step in this script submits work to Ray; the init/shutdown
# pair is kept only to preserve the script's existing side effects.
ray.init()
try:
    # Set up basic configuration for logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
    )

    logging.info("Loading documents...")
    loader = DirectoryLoader('data', glob="./*.txt")
    documents = loader.load()

    logging.info("Extracting and splitting texts from documents...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
    # split_documents reads each Document's page_content and carries its
    # metadata over to every chunk. The previous code looked for a
    # non-existent get_text() method and therefore indexed nothing.
    texts = text_splitter.split_documents(documents)

    if not texts:
        # An empty chunk list cannot be turned into an index, so skip the
        # export instead of failing inside the vector store.
        logging.warning(
            "No text chunks were produced from %d loaded document(s); "
            "nothing to index.",
            len(documents),
        )
    else:
        logging.info("Storing embeddings in FAISS...")
        # from_documents embeds all chunks, sizes the index from the actual
        # embedding dimension, and keeps the docstore and the
        # index-to-docstore-id mapping consistent with the stored vectors.
        faiss_db = FAISS.from_documents(texts, _get_embeddings_model())

        logging.info("Exporting the vector embeddings database...")
        faiss_db.save_local("ipc_embed_db")

        logging.info("Process completed successfully.")
finally:
    # Shut Ray down even when loading, embedding, or saving fails.
    ray.shutdown()