ScouterAI / rag /prepare_knowledge_base.py
stevenbucaille's picture
Enhance app.py with improved user interface and instructions, update model ID in llm.py, and add image classification capabilities across various components. Introduce segment anything functionality and refine README for clarity on model capabilities.
518d841
raw
history blame
1.89 kB
import datasets
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from rag.settings import get_embeddings_model
def get_vector_store():
    """Create an empty FAISS vector store backed by an in-memory docstore.

    The index dimensionality is discovered by embedding a single probe
    string with the configured embeddings model.

    Returns:
        FAISS: an empty L2 (Euclidean) flat-index vector store ready to
        receive documents via ``add_documents``.
    """
    embedding_model = get_embeddings_model()
    # Embed a throwaway query once, purely to learn the vector dimension.
    dimension = len(embedding_model.embed_query("hello world"))
    flat_index = faiss.IndexFlatL2(dimension)
    return FAISS(
        embedding_function=embedding_model,
        index=flat_index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
def get_docs(dataset):
    """Build chunked LangChain documents from a dataset of model cards.

    Args:
        dataset: Iterable of records, each a mapping with ``"model_id"``
            and ``"model_labels"`` keys (e.g. a Hugging Face dataset split).

    Returns:
        list[Document]: text chunks (<= 500 chars, 50-char overlap), each
        carrying the originating model's id and labels in its metadata.
    """
    page_content = """
    Model card:
    {models}
    List of labels:
    {labels}
    """
    source_docs = [
        Document(
            # BUG FIX: the template was previously stored verbatim, so every
            # document held the literal "{models}/{labels}" placeholders and
            # all chunks were identical. Format it per model so the indexed
            # text actually contains the model id and its labels.
            page_content=page_content.format(
                models=model["model_id"],
                labels=model["model_labels"],
            ),
            metadata={
                "model_id": model["model_id"],
                "model_labels": model["model_labels"],
            },
        )
        for model in dataset
    ]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,  # Characters per chunk
        chunk_overlap=50,  # Overlap between chunks to maintain context
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],  # Priority order for splitting
    )
    docs_processed = text_splitter.split_documents(source_docs)
    print(f"Knowledge base prepared with {len(docs_processed)} document chunks")
    return docs_processed
if __name__ == "__main__":
    # End-to-end knowledge-base build: pull the model-card dataset,
    # chunk it, embed it into a fresh FAISS store, and persist to disk.
    train_split = datasets.load_dataset(
        "stevenbucaille/image-classification-models-dataset", split="train"
    )
    store = get_vector_store()
    store.add_documents(get_docs(train_split))
    store.save_local(
        folder_path="vector_store/image-classification",
        index_name="faiss_index",
    )