|
import box |
|
import yaml |
|
from langchain.vectorstores import FAISS |
|
from langchain.document_loaders import PyPDFDirectoryLoader |
|
from langchain.text_splitter import CharacterTextSplitter |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.document_loaders import PyPDFLoader, DirectoryLoader |
|
from langchain.embeddings import HuggingFaceEmbeddings |
|
from langchain_community.embeddings.sentence_transformer import ( |
|
SentenceTransformerEmbeddings, |
|
) |
|
from langchain.vectorstores import Chroma |
|
|
|
|
|
# Read the ingestion settings once at import time. The parsed YAML is wrapped
# in a Box so the rest of the module can use attribute access (cfg.DATA_PATH).
with open('config.yml', mode='r', encoding='utf8') as ymlfile:
    cfg = box.Box(
        yaml.safe_load(ymlfile),
    )
|
|
|
|
|
def run_ingest():
    """Index the PDFs under ``cfg.DATA_PATH`` into a persisted Chroma store.

    The pipeline is: load every file matching ``*.pdf`` in ``cfg.DATA_PATH``
    with ``PyPDFLoader``, split the loaded documents with
    ``CharacterTextSplitter`` (chunk_size=1000, chunk_overlap=20), embed the
    chunks with the ``all-MiniLM-L6-v2`` sentence-transformer on CPU, and
    write them to a Chroma collection persisted at ``./vectorestore/chroma``.

    Returns:
        The Chroma vector store that was built, or ``None`` when there was
        nothing to ingest (no documents loaded, or no chunks produced).
    """
    loader = DirectoryLoader(
        cfg.DATA_PATH,
        glob='*.pdf',
        loader_cls=PyPDFLoader,
    )
    documents = loader.load()

    # Bail out before the (slow) embedding model is loaded when there is no
    # input, instead of handing an empty list to the vector store.
    if not documents:
        print(f"No PDF documents found in {cfg.DATA_PATH}; nothing to ingest.")
        return None

    # Report a count rather than dumping the full text of every document.
    print(f"Loaded {len(documents)} document(s) from {cfg.DATA_PATH}")

    text_splitter = CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=20,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)

    # Documents without extractable text (presumably e.g. scanned PDFs) can
    # leave nothing to embed.
    if not chunks:
        print("Loaded documents produced no text chunks; nothing to ingest.")
        return None

    embedding_function = SentenceTransformerEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
    )

    # NOTE: the directory name ("vectorestore") is kept exactly as-is because
    # other code may read the store from this path.
    return Chroma.from_documents(
        chunks,
        embedding_function,
        persist_directory="./vectorestore/chroma",
    )
|
|
|
# Script entry point: run the ingestion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_ingest()
|
|