"""Build a FAISS vector index from every PDF in a folder and save it to disk.

Each PDF in ``folder_path`` is loaded, split into text chunks, and the chunks
from *all* files are embedded together into one FAISS index, which is then
persisted under ``faiss_index``.
"""

import os
import re

from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
# Updated import per deprecation notice
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter

# Path to folder containing PDFs
folder_path = "normativa"

# os.listdir() returns entries in arbitrary order; sorting keeps the order in
# which files are loaded (and therefore indexed) reproducible between runs.
pdf_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".pdf"))

# Load docs in folder and split text
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
documents = []
for pdf in pdf_files:
    print("Loading file:", pdf)
    loader = PyPDFLoader(os.path.join(folder_path, pdf))
    docs = loader.load()
    # Accumulate the chunks of every file. Rebinding `documents` here would
    # discard the chunks of all previously loaded PDFs and index only the
    # last one.
    documents.extend(text_splitter.split_documents(docs))

# An index built from zero chunks is not meaningful, so stop with a clear
# message instead of handing an empty list to the index builder.
if not documents:
    raise SystemExit(f"No PDF content found in '{folder_path}'; nothing to index.")

# Load the embedding model
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

# Create a FAISS index with chunk-level embeddings
faiss_index = FAISS.from_documents(documents, embedding_model)

# Save (persist) the index to disk
faiss_index.save_local("faiss_index")
print("FAISS index built and saved successfully!")