# PDF question answering: index uploaded PDFs into per-file FAISS stores and
# answer user queries with an OpenAI chat model, citing retrieved passages.
import os

from langchain.document_loaders import PyPDFium2Loader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

from sllim import chat
# Prompt template: the retrieved document chunks are inlined under DOCUMENTS
# and the user's question under QUERY, asking the model to answer with citations.
template = """I will give you a couple of paragraphs from a PDF document along with a question about the document. You will provide an answer as accurately as possible and provide citations for why that answer is correct.
DOCUMENTS:
{docs}
---
QUERY:
{query}
"""
# Single shared embeddings instance, reused by every indexing and search call.
embeddings = OpenAIEmbeddings()
def process_file(file_path):
    """Build and persist a FAISS index for the PDF at *file_path*.

    No-op if an index for this file already exists on disk (so repeated
    uploads of the same file are cheap). The index is saved under the name
    produced by get_index_name().
    """
    index_path = get_index_name(file_path)
    if os.path.exists(index_path):
        # Already indexed — nothing to do.
        return

    loader = PyPDFium2Loader(file_path)
    data = loader.load()

    # Split the extracted text into overlapping ~1000-char chunks so each
    # chunk fits comfortably in the prompt while keeping local context.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=50,
        length_function=len,
    )
    docs = text_splitter.split_documents(data)

    # Embed the chunks and persist the vector store for later searches.
    db = FAISS.from_documents(docs, embeddings)
    db.save_local(index_path)
def get_index_name(file_path):
    """Return the on-disk FAISS index directory name for *file_path*.

    Derived from the file's base name with its extension stripped, e.g.
    "/tmp/report.pdf" -> "report_faiss_index". Note: two files with the
    same base name in different directories map to the same index.
    """
    basename = os.path.splitext(os.path.basename(file_path))[0]
    return basename + "_faiss_index"
def ask_question_all(history):
    """Answer the latest message in *history* using every uploaded file's index.

    *history* is a list of (user, bot) pairs in chatbot style — presumably the
    Gradio chatbot format (TODO confirm against the caller):
      - a non-str ``user`` entry is a file upload tuple whose first element is
        the file path: its index name is collected for retrieval;
      - a pair with a truthy ``bot`` is a completed exchange, replayed into the
        message list verbatim;
      - a pair with falsy ``bot`` is the new, unanswered question: similar
        chunks are retrieved from every collected index and sent with it.

    Returns the chat-model response for the assembled conversation.
    """
    indices = []
    docs = []
    messages = []
    for user, bot in history:
        if not isinstance(user, str):
            # File-upload entry: remember which index to search later.
            indices.append(get_index_name(user[0]))
        elif bot:
            # Completed exchange: replay it so the model keeps context.
            messages.append({"role": "user", "content": user})
            messages.append({"role": "assistant", "content": bot})
        else:
            # New message: retrieve relevant chunks from every known index.
            # NOTE(review): ``docs`` accumulates across multiple unanswered
            # messages if more than one exists — confirm that is intended.
            for index_path in indices:
                db = FAISS.load_local(index_path, embeddings)
                docs.extend(db.similarity_search(user))
            messages.append(
                {
                    "role": "user",
                    "content": template.format(
                        query=user,
                        docs="\n".join(map(lambda x: x.page_content, docs)),
                    ),
                }
            )
    # Send the grounded conversation to the model.
    return chat(messages, model="gpt-3.5-turbo")
def ask_question(query, upload_file, history=None):
    """Answer *query* about a single uploaded PDF.

    Ensures a FAISS index exists for *upload_file* (building it on first
    use), retrieves the chunks most similar to *query*, and asks the chat
    model to answer with citations.

    ``history`` is accepted for interface compatibility with the chat UI
    callback signature but is not used here.
    """
    file_path = upload_file.name
    index_path = get_index_name(file_path)

    # Build the index on first use; process_file() is a no-op when the
    # index already exists, so this replaces the previous duplicated
    # load/split/embed code with a single code path.
    process_file(file_path)

    db = FAISS.load_local(index_path, embeddings)
    docs = db.similarity_search(query)

    messages = [
        {
            "role": "user",
            "content": template.format(
                query=query,
                docs="\n".join(map(lambda x: x.page_content, docs)),
            ),
        }
    ]
    # Send the retrieved chunks together with the question to the model.
    return chat(messages, model="gpt-3.5-turbo")