"""Answer questions about PDF documents using FAISS retrieval and a chat model."""

import os

from langchain.document_loaders import PyPDFium2Loader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from sllim import chat

# Prompt sent to the chat model; ``{docs}`` receives the retrieved paragraphs
# and ``{query}`` the user's question.
# NOTE(review): the line breaks inside this prompt were reconstructed from a
# whitespace-collapsed copy of the file -- confirm the intended layout.
template = """I will give you a couple of paragraphs from a PDF document along with a question about the document. You will provide an answer as accurately as possible and provide citations for why that answer is correct.

DOCUMENTS:
{docs}
---
QUERY:
{query}
"""

# Shared embedding client used both to build and to load FAISS indexes.
embeddings = OpenAIEmbeddings()

# Chat model used for every request issued by this module.
_CHAT_MODEL = "gpt-3.5-turbo"


def _build_index(file_path, index_path):
    """Create a FAISS index for a PDF, save it to *index_path*, and return it.

    The PDF is loaded with ``PyPDFium2Loader``, split into chunks of up to
    1000 characters with a 50-character overlap, embedded with the
    module-level ``embeddings`` client, and persisted with ``save_local``.
    """
    pages = PyPDFium2Loader(file_path).load()

    # Parse text into paragraphs
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=50,
        length_function=len,
    )
    chunks = splitter.split_documents(pages)

    # Embed paragraphs
    db = FAISS.from_documents(chunks, embeddings)
    db.save_local(index_path)
    return db


def _build_prompt(query, docs):
    """Fill ``template`` with *query* and the newline-joined text of *docs*."""
    return template.format(
        query=query,
        docs="\n".join(doc.page_content for doc in docs),
    )


def process_file(file_path):
    """Build and save the FAISS index for *file_path* unless it already exists.

    The index location is derived with :func:`get_index_name`. When a path
    with that name is already present, nothing is done. Always returns
    ``None``.
    """
    index_path = get_index_name(file_path)
    if not os.path.exists(index_path):
        _build_index(file_path, index_path)


def get_index_name(file_path):
    """Return the index path for *file_path*.

    The result is the file's base name without its extension, followed by
    ``"_faiss_index"``. Directory components are discarded, so the returned
    path is relative and files sharing a base name share an index name.
    """
    basename = os.path.splitext(os.path.basename(file_path))[0]
    return basename + "_faiss_index"


def ask_question_all(history):
    """Answer the pending question in a chat history using all uploaded files.

    *history* is an iterable of ``(user, bot)`` pairs, handled as follows:

    * ``user`` is not a string: treated as a file entry whose first element
      is a file path; the corresponding index name is remembered.
    * ``user`` is a string and ``bot`` is truthy: a completed exchange,
      forwarded to the model as a user/assistant message pair.
    * ``user`` is a string and ``bot`` is falsy: a pending question. Every
      index remembered so far is searched and the matching paragraphs are
      sent together with the question via ``template``.

    Returns whatever ``chat`` returns for the assembled message list.
    """
    indices = []
    messages = []
    for user, bot in history:
        if not isinstance(user, str):
            indices.append(get_index_name(user[0]))
        elif bot:
            messages.append({"role": "user", "content": user})
            messages.append({"role": "assistant", "content": bot})
        else:
            # Handle new message. Retrieved paragraphs are collected per
            # question so that context fetched for one pending question is
            # never mixed into the prompt of another.
            docs = []
            for index_path in indices:
                db = FAISS.load_local(index_path, embeddings)
                docs.extend(db.similarity_search(user))
            messages.append(
                {"role": "user", "content": _build_prompt(user, docs)}
            )

    # send similar paragraphs with question to model
    return chat(messages, model=_CHAT_MODEL)


def ask_question(query, upload_file, history=None):
    """Answer *query* about a single uploaded PDF.

    Args:
        query: The question text.
        upload_file: Object whose ``name`` attribute is the path of the PDF.
        history: Accepted for interface compatibility; currently unused.

    The FAISS index for the file is loaded when it already exists and is
    otherwise built and saved first. The paragraphs most similar to *query*
    are then sent to the chat model together with the question.

    Returns whatever ``chat`` returns for the single-message conversation.
    """
    file_path = upload_file.name
    index_path = get_index_name(file_path)

    if os.path.exists(index_path):
        db = FAISS.load_local(index_path, embeddings)
    else:
        db = _build_index(file_path, index_path)

    docs = db.similarity_search(query)
    messages = [{"role": "user", "content": _build_prompt(query, docs)}]

    # send similar paragraphs with question to model
    return chat(messages, model=_CHAT_MODEL)