cloud-sean's picture
Update app.py
fd611a8
raw
history blame
3.92 kB
import gradio as gr
import openai, os
import tqdm
import time
from langchain.vectorstores import Chroma
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain import VectorDBQA
from langchain.llms import AzureOpenAI
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
# from langchain.chat_models import AzureChatOpenAI
# Point both the `openai` module and the process environment (LangChain reads
# the env vars) at the Azure OpenAI service.
os.environ["OPENAI_API_TYPE"] = openai.api_type = "azure"
os.environ["OPENAI_API_VERSION"] = openai.api_version = "2023-03-15-preview"
os.environ["OPENAI_API_BASE"] = openai.api_base = "https://openai-endpoint.openai.azure.com/"
# SECURITY FIX: the API key was hard-coded in source (a leaked credential).
# Require it from the environment instead; set OPENAI_API_KEY before launch.
openai.api_key = os.environ["OPENAI_API_KEY"]
def _embed_chunk(text):
    """Embed one text chunk via Azure, retrying once after 8s on failure.

    The broad `except Exception` is deliberate: the openai client raises
    several error types (rate limits being the common one here); the pause
    lets an Azure rate-limit window pass before the single retry.
    """
    try:
        response = openai.Embedding.create(
            input=text,
            engine="text-embedding-ada-002")
    except Exception as e:
        print(e)
        time.sleep(8)
        response = openai.Embedding.create(
            input=text,
            engine="text-embedding-ada-002")
    return response['data'][0]['embedding']

def upload_pdf(file, pdf_text, embeddings, vectorstore, azure_embeddings, qa, progress = gr.Progress(track_tqdm=True)):
    """Index an uploaded PDF and build the retrieval-QA chain.

    Extracts text from every page, splits it into overlapping chunks, embeds
    each chunk, stores chunks + embeddings in an in-memory Chroma collection,
    and constructs a RetrievalQA chain over it.

    Returns a 9-tuple: (text for the output widget, pdf_text state,
    embeddings state, vectorstore state, azure_embeddings state, qa state,
    show chat_row, show submit_row, hide upload_column).
    """
    # Pull the raw text out of every page of the PDF.
    reader = PdfReader(file)
    pdf_text = "".join(page.extract_text() for page in reader.pages)

    # Chunk the text so each piece fits comfortably in one embedding call.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(pdf_text)

    # Embed every chunk (the retry logic lives in _embed_chunk; previously
    # the call + retry was duplicated inline).
    for text in tqdm.tqdm(texts):
        embeddings.append(_embed_chunk(text))

    # Store chunks with their precomputed embeddings in a Chroma collection.
    # Query-time embedding goes through the LangChain wrapper below.
    azure_embeddings = OpenAIEmbeddings(
        document_model_name="text-embedding-ada-002",
        query_model_name="text-embedding-ada-002")
    vectorstore = Chroma("collection", embedding_function=azure_embeddings)
    vectorstore._collection.add(
        ids=[f"doc_{i}" for i in range(len(texts))],
        documents=texts,
        embeddings=embeddings,
        metadatas=[{"source": "source"} for _ in texts])

    # "stuff" chain: retrieved chunks are stuffed straight into the prompt of
    # the Azure chat deployment named "Bartos" (gpt-35-turbo).
    qa = RetrievalQA.from_chain_type(
        llm=AzureChatOpenAI(deployment_name="Bartos", model_name='gpt-35-turbo'),
        chain_type="stuff",
        retriever=vectorstore.as_retriever())

    # Reveal the chat UI, hide the upload column.
    return (pdf_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa,
            gr.update(visible=True), gr.update(visible=True), gr.update(visible=False))
def add_text(chatstate, query, qa):
    """Run *query* through the QA chain and append the Q/A pair to history.

    Returns the updated history twice (once for the Chatbot widget, once for
    the State component) along with the QA chain, unchanged.
    """
    answer = qa.run(query)
    updated = [*chatstate, (query, answer)]
    return updated, updated, qa
with gr.Blocks(css="footer {visibility: hidden}", title='PDF - Q&A') as demo:
    # BUG FIX: these five names were previously bound by chained assignment to
    # a SINGLE gr.State object, so every state slot aliased the same value.
    # Each piece of state gets its own component.
    qa = gr.State([])
    pdf_text = gr.State([])
    embeddings = gr.State([])
    vectorstore = gr.State([])
    azure_embeddings = gr.State([])

    # Chat UI — hidden until a PDF has been indexed by upload_pdf.
    with gr.Row(visible=False) as chat_row:
        chatbot = gr.Chatbot()
    with gr.Row(visible=False) as submit_row:
        text = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)
        chatstate = gr.State([])
        text.submit(add_text, [chatstate, text, qa], [chatbot, chatstate, qa])

    # Upload UI — hidden once indexing succeeds (upload_pdf toggles visibility).
    with gr.Column() as upload_column:
        file = gr.File()
        upload_btn = gr.Button("Upload")
        output_text = gr.TextArea()
        upload_btn.click(
            upload_pdf,
            inputs=[file, pdf_text, embeddings, vectorstore, azure_embeddings, qa],
            outputs=[output_text, pdf_text, embeddings, vectorstore,
                     azure_embeddings, qa, chat_row, submit_row, upload_column])

    with gr.Row():
        gr.Markdown("`now with GPT-3.5 Turbo`")

demo.launch(enable_queue=True)