from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
import PyPDF2


# Function to process a PDF file
def process_pdf(file_stream):
    """Return the concatenated text of every page in a PDF.

    Args:
        file_stream: Either a dict whose ``'name'`` entry is handed to
            ``PyPDF2.PdfReader`` (the original comments indicate this is the
            shape Gradio delivers for drag-and-drop / Drive uploads), or any
            object ``PyPDF2.PdfReader`` accepts directly.

    Returns:
        str: The text of all pages joined in page order, with no separator
        inserted between pages.
    """
    if isinstance(file_stream, dict):  # Check if PDF was obtained using Drag and Drop or Drive link
        file_path = file_stream['name']  # Use 'path' for local testing and 'name' for Gradio
        pdf_reader = PyPDF2.PdfReader(file_path)
    else:
        pdf_reader = PyPDF2.PdfReader(file_stream)

    # Join once instead of repeated `+=`; `or ""` keeps the join safe in case
    # a page yields no text (None) rather than an empty string.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def create_dnd_database(file_list):
    """Build a FAISS vector store from the text of the given PDFs.

    The text of all PDFs is concatenated, split on newlines into chunks of at
    most 1000 characters with a 200-character overlap, embedded with
    ``OpenAIEmbeddings`` and indexed with ``FAISS.from_texts``.

    Args:
        file_list: Iterable of items accepted by ``process_pdf``, or ``None``.

    Returns:
        The FAISS store, or ``None`` when ``file_list`` is ``None`` or empty,
        or when the PDFs produced no text chunks to index.
    """
    # Nothing uploaded: covers both None and an empty list.
    if not file_list:
        return None

    raw_text = "".join(process_pdf(pdf) for pdf in file_list)

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(raw_text)
    print('Length of text: ' + str(len(raw_text)))

    # With zero chunks (e.g. image-only PDFs) there is nothing to embed, and
    # an index cannot be built from an empty list, so bail out the same way
    # as for a missing upload.
    if not texts:
        return None

    # Create the embedding client only once there is something to embed.
    embedding = OpenAIEmbeddings()
    db = FAISS.from_texts(texts, embedding)
    return db