import streamlit as st

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


@st.cache_resource()
def load_embedding_model(model):
    """
    Loads and caches a HuggingFace embedding model.

    Suitable models include:
        sentence-transformers/all-mpnet-base-v2
        sentence-transformers/all-MiniLM-L6-v2
    """
    return HuggingFaceEmbeddings(model_name=model)


def load_vector_store():
    """
    Loads a simple in-memory vector store.

    Not wrapped in @st.cache_resource because I want the vector store
    to be created fresh on every page load.
    """
    model = load_embedding_model("sentence-transformers/all-MiniLM-L6-v2")
    vector_store = Chroma(
        collection_name="main_store",
        embedding_function=model,
    )
    return vector_store


def process_pdf(pdf, vector_store):
    """
    Loads a PDF, splits it into overlapping chunks, and adds the
    chunks to the vector store.
    """
    loader = PyPDFLoader(pdf)
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    vector_store.add_documents(splits)
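

# Usage sketch (illustrative, not part of the original module): one way these
# helpers could be wired together in a Streamlit page. The widget labels, the
# temp-file handling, and the k=4 search depth are assumptions made for this
# example; PyPDFLoader expects a file path, so the uploaded bytes are written
# to a temporary file before indexing. Note that this re-indexes the PDF on
# every rerun; deduplication is out of scope here.
import tempfile

vector_store = load_vector_store()

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file is not None:
    # Persist the upload to disk so PyPDFLoader can read it by path.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.getvalue())
        tmp_path = tmp.name
    process_pdf(tmp_path, vector_store)
    st.success("PDF indexed.")

query = st.text_input("Ask a question about the document")
if query:
    # Show the text of the most similar chunks for the query.
    for doc in vector_store.similarity_search(query, k=4):
        st.write(doc.page_content)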