import datasets
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

from rag.settings import get_embeddings_model


def get_vector_store():
    embeddings = get_embeddings_model()
    # Probe the embedding dimension with a throwaway query, then build a
    # flat (exact) L2 index of that size
    index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))
    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
    return vector_store


def get_docs(dataset):
    # Template for each model's searchable text, filled in per record below
    page_content = """
    Model card: {models}
    List of labels: {labels}
    """
    source_docs = [
        Document(
            page_content=page_content.format(
                models=model["model_id"],
                labels=model["model_labels"],
            ),
            metadata={
                "model_id": model["model_id"],
                "model_labels": model["model_labels"],
            },
        )
        for model in dataset
    ]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,  # Characters per chunk
        chunk_overlap=50,  # Overlap between chunks to maintain context
        add_start_index=True,  # Record each chunk's offset in its parent document
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],  # Priority order for splitting
    )
    docs_processed = text_splitter.split_documents(source_docs)
    print(f"Knowledge base prepared with {len(docs_processed)} document chunks")
    return docs_processed


if __name__ == "__main__":
    dataset = datasets.load_dataset(
        "stevenbucaille/image-classification-models-dataset", split="train"
    )
    docs_processed = get_docs(dataset)
    vector_store = get_vector_store()
    vector_store.add_documents(docs_processed)
    vector_store.save_local(
        folder_path="vector_store/image-classification",
        index_name="faiss_index",
    )
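

# Usage sketch (an assumption, not part of the original pipeline): reload the
# index saved by the __main__ block above and run a similarity search against
# it. The helper name `search_models` and the example query are hypothetical.
# The embeddings model must be the same one used at build time, and
# `allow_dangerous_deserialization=True` is required because the docstore is
# pickled on disk -- only enable it for indexes you built yourself.
def search_models(query: str, k: int = 4):
    vector_store = FAISS.load_local(
        folder_path="vector_store/image-classification",
        embeddings=get_embeddings_model(),
        index_name="faiss_index",
        allow_dangerous_deserialization=True,
    )
    # Returns the k chunks closest to the query in embedding space
    return vector_store.similarity_search(query, k=k)


# Example:
#   for doc in search_models("a model that can classify dog breeds"):
#       print(doc.metadata["model_id"])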