import glob
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from transformers import AutoTokenizer
from torch import cuda
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Qdrant

from auditqa.reports import files, report_list

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

#from dotenv import load_dotenv
#load_dotenv()
#HF_token = os.environ["HF_TOKEN"]

# Directory the report PDFs are read from; file names are "<report name>.pdf".
path_to_data = "./data/pdf/"


def process_pdf():
    """Load the report PDFs and split them into tokenizer-sized chunks.

    Every name in ``report_list`` is loaded from ``path_to_data`` with
    ``PyMuPDFLoader``. A load failure is printed and does not abort the run.
    The loaded documents are then split with a
    ``RecursiveCharacterTextSplitter`` built from the
    "BAAI/bge-small-en-v1.5" tokenizer (chunk size 256, overlap 10), walking
    the ``files`` mapping as category -> subtype -> file names.

    Each chunk's metadata is set to:
      * ``source``  - the category key from ``files``
      * ``subtype`` - the subtype key from ``files``
      * ``year``    - the last four characters of the file name

    Files listed in ``files`` that were not loaded successfully are skipped
    with a printed message instead of raising ``KeyError``.

    NOTE(review): no ``return`` is visible in this part of the file; the
    function presumably continues below, so local names and the nested shape
    of ``all_documents`` (one list of chunks per file) are left unchanged.
    """
    docs = {}
    for file in report_list:
        try:
            docs[file] = PyMuPDFLoader(os.path.join(path_to_data, file + '.pdf')).load()
        except Exception as e:
            # Best-effort loading: report the problem and carry on with the
            # remaining reports.
            print("Exception: ", e)

    # text splitter based on the tokenizer of a model of your choosing
    # to make texts fit exactly a transformer's context window size
    # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
    chunk_size = 256
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
        chunk_size=chunk_size,
        chunk_overlap=10,
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n"],
    )

    all_documents = {}
    categories = list(files.keys())
    for category in categories:
        print(category)
        all_documents[category] = []
        subtypes = list(files[category].keys())
        for subtype in subtypes:
            print(subtype)
            for file in files[category][subtype]:
                if file not in docs:
                    # The PDF failed to load above (or is not part of
                    # report_list); skip it rather than fail on docs[file].
                    print("Skipping file without loaded pages: ", file)
                    continue

                doc_processed = text_splitter.split_documents(docs[file])
                for doc in doc_processed:
                    doc.metadata["source"] = category
                    doc.metadata["subtype"] = subtype
                    doc.metadata["year"] = file[-4:]

                # Appended as one list of chunks per file, so each category
                # holds a list of lists at this point.
                all_documents[category].append(doc_processed)