import glob
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from transformers import AutoTokenizer
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Qdrant

#from dotenv import load_dotenv
#load_dotenv()
#HF_token = os.environ["HF_TOKEN"]

path_to_data = "./data/"


def process_pdf():
    """Load the report PDFs, chunk them, and index them in in-memory Qdrant.

    Each PDF under ``path_to_data`` is loaded with ``PyMuPDFLoader`` and split
    with a splitter built from the "BAAI/bge-small-en-v1.5" tokenizer. Every
    chunk gets two metadata fields: ``source`` (the file key, e.g.
    ``"MWTS2021"``) and ``year`` (the last four characters of that key).
    Chunks are then grouped by report family and each family is embedded into
    its own Qdrant collection created with ``location=":memory:"``.

    A PDF that fails to load is reported on stdout and skipped, so the
    remaining files are still processed.

    Returns:
        dict: maps each report family name (``"Consolidated"``, ``"MWTS"``)
        to the ``Qdrant`` vector store built from that family's chunks.
    """
    # One name for both the tokenizer and the embedder, so chunk sizing is
    # always measured with the tokenizer of the model that embeds the chunks.
    model_name = "BAAI/bge-small-en-v1.5"

    # File keys double as the ``source`` metadata; the trailing four
    # characters are used as the ``year``.
    file_keys = ("MWTS2021", "MWTS2022", "Consolidated2021")
    files = {
        file_key: os.path.join(path_to_data, f"{file_key}.pdf")
        for file_key in file_keys
    }

    docs = {}
    for file, value in files.items():
        try:
            docs[file] = PyMuPDFLoader(value).load()
        except Exception as e:
            # Best effort: report the failure and carry on with the other PDFs.
            print("Exception: ", e)

    # text splitter based on the tokenizer of a model of your choosing
    # to make texts fit exactly a transformer's context window size
    # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
    chunk_size = 512
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(model_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 20),
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n"],
    )

    all_documents = {'Consolidated': [], 'MWTS': []}
    for file, value in docs.items():
        doc_processed = text_splitter.split_documents(value)
        for doc in doc_processed:
            doc.metadata["source"] = file
            doc.metadata["year"] = file[-4:]
        # A file belongs to every family whose name occurs in its key.
        for key in all_documents:
            if key in file:
                print(key)
                # Extend directly so each family holds a flat list of chunks.
                all_documents[key].extend(doc_processed)

    embeddings = HuggingFaceEmbeddings(
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
        model_name=model_name,
    )

    qdrant_collections = {}
    for file, value in all_documents.items():
        print("emebddings for:", file)
        qdrant_collections[file] = Qdrant.from_documents(
            value,
            embeddings,
            location=":memory:",
            collection_name=file,
        )
    print("done")
    return qdrant_collections