"""Build and open the local Qdrant vector store for the report chunks."""
import configparser
import glob
import json
import logging
import os

import pandas as pd
from langchain.docstore.document import Document
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
)
from langchain_community.embeddings import (
    HuggingFaceEmbeddings,
    HuggingFaceInferenceAPIEmbeddings,
)
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from torch import cuda
from transformers import AutoTokenizer

from auditqa.reports import files, report_list

# read all the necessary variables
device = 'cuda' if cuda.is_available() else 'cpu'
path_to_data = "./reports/"
# on-disk location of the local Qdrant store, shared by builders and reader
path_to_qdrant = "/data/local_qdrant"


##---------------------functions -------------------------------------------##
def getconfig(configfile_path: str):
    """
    Read the config file.

    Params
    ----------------
    configfile_path: file path of .cfg file

    Returns
    ----------------
    The populated ``configparser.ConfigParser``, or ``None`` (after logging
    a warning) when the file cannot be opened or parsed.
    """
    config = configparser.ConfigParser()
    try:
        # context manager so the handle is closed even if parsing fails
        with open(configfile_path) as config_file:
            config.read_file(config_file)
    except OSError:
        logging.warning("config file not found")
        return None
    except configparser.Error as err:
        logging.warning("config file could not be parsed: %s", err)
        return None
    return config


def open_file(filepath):
    """
    Load a JSON file and return the decoded object.

    Params
    ----------------
    filepath: path of the JSON file to read
    """
    with open(filepath) as file:
        return json.load(file)


def _build_embeddings(config, normalize=None):
    """
    Create the HuggingFace embedding model named in the 'retriever' section.

    Params
    ----------------
    config: parsed config, must provide 'retriever' -> 'MODEL'
    normalize: explicit value for 'normalize_embeddings'; when None it is
        read from 'retriever' -> 'NORMALIZE' (an integer flag, e.g. 0/1)
    """
    if normalize is None:
        normalize = bool(int(config.get('retriever', 'NORMALIZE')))
    return HuggingFaceEmbeddings(
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': normalize},
        model_name=config.get('retriever', 'MODEL'),
    )


def _index_allreports(documents, embeddings):
    """
    Embed ``documents`` into the local 'allreports' collection.

    Returns a dict mapping 'allreports' to the created Qdrant vector store.
    """
    qdrant_collections = {
        'allreports': Qdrant.from_documents(
            documents,
            embeddings,
            path=path_to_qdrant,
            collection_name='allreports',
        )
    }
    print(qdrant_collections)
    print("vector embeddings done")
    return qdrant_collections


def load_chunks():
    """
    Read the chunk files listed in ``files`` and create the vector database.

    ``files`` maps 'source' (category) -> 'subtype' -> list of file names.
    For each file name the chunks are read from
    ``<path_to_data><file>/<file>.chunks.json``. Category and subtype are
    stored as metadata on every chunk (they are used in the UI for document
    selection and later for filtering the database).

    A file that cannot be read is reported and skipped.

    Returns
    ----------------
    dict mapping 'allreports' to the Qdrant vector store holding all chunks.
    """
    config = getconfig("./model_params.cfg")
    all_documents = {}

    # iterate through 'source'
    for category, subtypes in files.items():
        print("documents splitting in source:", category)
        category_docs = []
        # iterate through 'subtype' within the source
        # example source/category == 'District', has subtypes which is district names
        for subtype, filenames in subtypes.items():
            print("document splitting for subtype:", subtype)
            for file in filenames:
                # load the chunks
                try:
                    doc_processed = open_file(
                        path_to_data + file + "/" + file + ".chunks.json")
                except Exception as e:
                    print("Exception: ", e)
                    # skip this file: otherwise the previous file's chunks
                    # would be added again under this file's metadata
                    continue
                print("chunks in subtype:", subtype, "are:", len(doc_processed))

                # add metadata information
                category_docs.extend(
                    Document(
                        page_content=doc['content'],
                        metadata={
                            "source": category,
                            "subtype": subtype,
                            # last four characters of the file name
                            "year": file[-4:],
                            "filename": file,
                            "page": doc['metadata']['page'],
                            "headings": doc['metadata']['headings'],
                        },
                    )
                    for doc in doc_processed
                )
        all_documents[category] = category_docs

    for key, docs_processed in all_documents.items():
        print("length of chunks in source:", key, "are:", len(docs_processed))

    # single flat list with the chunks of every source
    all_documents['allreports'] = [
        doc for docs in all_documents.values() for doc in docs
    ]

    # define embedding model
    embeddings = _build_embeddings(config)

    # only the combined 'allreports' collection is indexed
    print("emebddings for:", "allreports")
    return _index_allreports(all_documents['allreports'], embeddings)


def load_new_chunks():
    """
    Create the vector database from ./axa_processed_chunks_update.json.

    Each row of that file describes one report through the columns
    'chunks_filepath', 'filename', 'category' and 'year'. The chunks are
    read from the 'paragraphs' entry of the JSON file located in the
    "chunks" folder under ``path_to_data`` with the same base name as
    'chunks_filepath'.

    A report whose chunk file cannot be read is reported and skipped.

    Returns
    ----------------
    dict mapping 'allreports' to the Qdrant vector store holding all chunks.
    """
    config = getconfig("./model_params.cfg")
    # named 'reports' so the imported auditqa.reports.files is not shadowed
    reports = pd.read_json("./axa_processed_chunks_update.json")
    all_documents = []

    # iterate over the index labels so .loc works for any index
    for i in reports.index:
        filename = reports.loc[i, 'filename']
        # load the chunks
        try:
            chunks_path = os.path.join(
                path_to_data, "chunks",
                os.path.basename(reports.loc[i, 'chunks_filepath']))
            doc_processed = open_file(chunks_path)['paragraphs']
        except Exception as e:
            print("Exception: ", e)
            # skip this report: otherwise the previous report's chunks
            # would be added again under this report's metadata
            continue
        print("chunks in subtype:", filename, "are:", len(doc_processed))

        # add metadata information
        all_documents.extend(
            Document(
                page_content=str(doc['content']),
                metadata={
                    "source": reports.loc[i, 'category'],
                    "subtype": os.path.splitext(filename)[0],
                    "year": str(reports.loc[i, 'year']),
                    # filename of the current row (was always row 0)
                    "filename": filename,
                    "page": doc['metadata']['page'],
                    "headings": doc['metadata']['headings'],
                },
            )
            for doc in doc_processed
        )

    print("length of chunks:", len(all_documents))

    # define embedding model
    embeddings = _build_embeddings(config)
    return _index_allreports(all_documents, embeddings)


def get_local_qdrant():
    """
    Connect to the existing local Qdrant store once it has been created.

    Returns
    ----------------
    dict mapping 'allreports' to a Qdrant vector store bound to the
    'allreports' collection under ``path_to_qdrant``.
    """
    config = getconfig("./model_params.cfg")
    # use the same normalisation setting as when the collection was built;
    # fall back to True when the option is not configured
    normalize = bool(int(config.get('retriever', 'NORMALIZE', fallback='1')))
    embeddings = _build_embeddings(config, normalize=normalize)

    client = QdrantClient(path=path_to_qdrant)
    print("Collections in local Qdrant:", client.get_collections())
    return {
        'allreports': Qdrant(
            client=client,
            collection_name='allreports',
            embeddings=embeddings,
        )
    }