# app.py — PDF Q&A Gradio app (Azure OpenAI embeddings + Chroma + LangChain RetrievalQA)
# (Hugging Face Space file; web-UI residue removed so the file parses as Python.)
import gradio as gr
import openai, os
import tqdm
import time
from langchain.vectorstores import Chroma
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain import VectorDBQA
# from langchain.llms import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
# from langchain.chat_models import AzureChatOpenAI
# Configure the openai SDK for Azure OpenAI, and mirror each setting into the
# environment so LangChain components (which read the OPENAI_* env vars) see
# the same configuration as the openai module globals.
os.environ["OPENAI_API_TYPE"] = openai.api_type = "azure"
os.environ["OPENAI_API_VERSION"] = openai.api_version = "2023-03-15-preview"
os.environ["OPENAI_API_BASE"] = openai.api_base = "https://eastus-openai-sean.openai.azure.com/"
# NOTE(review): the API key is presumably injected via the OPENAI_API_KEY env
# var (e.g. a Space secret) — confirm, since it is never set explicitly here.
# openai.api_key = os.environ["OPENAI_API_KEY"]
def upload_pdf(file, pdf_text, embeddings, vectorstore, azure_embeddings, qa, progress = gr.Progress(track_tqdm=True)):
    """Extract text from an uploaded PDF, embed it, and build a RetrievalQA chain.

    Args:
        file: the uploaded PDF (gradio File value, accepted by PyPDF2.PdfReader).
        pdf_text, embeddings, vectorstore, azure_embeddings, qa: gr.State values;
            all are rebuilt from scratch here and returned so gradio persists them.
        progress: gradio progress tracker; track_tqdm=True surfaces the tqdm
            embedding loop below in the UI.

    Returns:
        Tuple of (text for the output TextArea, pdf_text state, embeddings state,
        vectorstore state, azure_embeddings state, qa state, plus three
        gr.update() visibility toggles: show chat_row, show submit_row,
        hide upload_column).
    """
    # Concatenate the text of every page.
    reader = PdfReader(file)
    pdf_text = "".join(page.extract_text() for page in reader.pages)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 200,
        length_function = len,)
    texts = text_splitter.split_text(pdf_text)

    # Reset rather than append: the incoming state list may hold embeddings
    # from a previous upload, which would desynchronize it from `texts` and
    # break the ids/documents/embeddings length match in _collection.add below.
    embeddings = []
    for text in tqdm.tqdm(texts):
        try:
            response = openai.Embedding.create(
                input=text,
                engine="text-embedding-ada-002")
        except Exception as e:
            # Likely a rate-limit/transient error: back off once and retry.
            print(e)
            time.sleep(8)
            response = openai.Embedding.create(
                input=text,
                engine="text-embedding-ada-002")
        embeddings.append(response['data'][0]['embedding'])

    azure_embeddings = OpenAIEmbeddings(
        deployment="text-embedding-ada-002",
        model="text-embedding-ada-002",  # fixed typo: was "ytext-embedding-ada-002"
    )
    vectorstore = Chroma("collection", embedding_function=azure_embeddings)
    # Insert the chunks with the embeddings computed above (avoids re-embedding
    # through the Chroma wrapper).
    vectorstore._collection.add(
        ids=[f"doc_{i}" for i in range(len(texts))],
        documents=texts,
        embeddings=embeddings,
        metadatas=[{"source": "source"} for _ in texts])
    qa = RetrievalQA.from_chain_type(
        llm=AzureChatOpenAI(model_name='gpt-35-turbo', deployment_name="gpt-35-turbo"),
        chain_type="stuff",
        retriever=vectorstore.as_retriever())
    return pdf_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa, gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)
def add_text(chatstate, query, qa):
    """Answer `query` with the RetrievalQA chain and append the turn to chat history.

    Returns the updated history twice (for the Chatbot component and the
    chatstate gr.State) plus the unchanged qa chain state.
    """
    answer = qa.run(query)
    updated_history = chatstate + [(query, answer)]
    return updated_history, updated_history, qa
with gr.Blocks(css="footer {visibility: hidden}", title='PDF - Q&A') as demo:
    # Each piece of session state needs its OWN gr.State component: the
    # original chained assignment bound all five names to a single shared
    # State object, so every input read — and every output overwrote — the
    # same value.
    pdf_text = gr.State("")
    embeddings = gr.State([])
    vectorstore = gr.State(None)
    azure_embeddings = gr.State(None)
    qa = gr.State(None)

    # Chat UI starts hidden; upload_pdf reveals it once a document is indexed.
    with gr.Row(visible=False) as chat_row:
        chatbot = gr.Chatbot()
    with gr.Row(visible=False) as submit_row:
        text = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)
        chatstate = gr.State([])
        text.submit(add_text, [chatstate, text, qa], [chatbot, chatstate, qa])
    with gr.Column() as upload_column:
        file = gr.File()
        upload_btn = gr.Button("Upload")
        output_text = gr.TextArea()
        upload_btn.click(
            upload_pdf,
            inputs=[file, pdf_text, embeddings, vectorstore, azure_embeddings, qa],
            outputs=[output_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa, chat_row, submit_row, upload_column])

demo.launch(enable_queue=True)