# Hugging Face Spaces page header (scrape artifact) — status: Running
import os
import time

import gradio as gr
import openai
import tqdm
from PyPDF2 import PdfReader
from langchain import VectorDBQA
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import AzureOpenAI
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Azure OpenAI configuration. Both the `openai` client globals and the
# environment variables are set so LangChain (which reads the env vars)
# and direct `openai.*` calls agree on the same endpoint.
os.environ["OPENAI_API_TYPE"] = openai.api_type = "azure"
os.environ["OPENAI_API_VERSION"] = openai.api_version = "2023-03-15-preview"
os.environ["OPENAI_API_BASE"] = openai.api_base = "https://openai-endpoint.openai.azure.com/"
# SECURITY: the API key was previously hard-coded here (a leaked secret).
# It must be supplied via the environment; a missing key fails fast with KeyError.
openai.api_key = os.environ["OPENAI_API_KEY"]
def upload_pdf(file, pdf_text, embeddings, vectorstore, azure_embeddings, qa, progress=gr.Progress(track_tqdm=True)):
    """Extract text from an uploaded PDF, embed it chunk-by-chunk via Azure
    OpenAI, build a Chroma vector store over the chunks, and create a
    RetrievalQA chain.

    Returns (pdf_text twice for the TextArea + State, embeddings, vectorstore,
    azure_embeddings, qa, then three gr.update()s that reveal the chat rows and
    hide the upload column).
    """
    reader = PdfReader(file)
    # extract_text() may return None for image-only pages; coalesce to "".
    pdf_text = "".join(page.extract_text() or "" for page in reader.pages)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(pdf_text)

    def _embed(chunk):
        # One retry after a short sleep — crude backoff for Azure rate limits.
        try:
            response = openai.Embedding.create(
                input=chunk,
                engine="text-embedding-ada-002")
        except Exception as e:
            print(e)
            time.sleep(8)
            response = openai.Embedding.create(
                input=chunk,
                engine="text-embedding-ada-002")
        return response['data'][0]['embedding']

    # Reset rather than append to the incoming state: on a second upload the
    # old behavior left stale embeddings in the list, so len(embeddings) no
    # longer matched len(texts) and the _collection.add call below broke.
    embeddings = [_embed(chunk) for chunk in tqdm.tqdm(texts)]

    azure_embeddings = OpenAIEmbeddings(document_model_name="text-embedding-ada-002", query_model_name="text-embedding-ada-002")
    vectorstore = Chroma("collection", embedding_function=azure_embeddings)
    vectorstore._collection.add(
        ids=[f"doc_{i}" for i in range(len(texts))],
        documents=texts,
        embeddings=embeddings,
        metadatas=[{"source": "source"} for _ in texts])
    qa = RetrievalQA.from_chain_type(llm=AzureChatOpenAI(deployment_name="Bartos", model_name='gpt-35-turbo'), chain_type="stuff", retriever=vectorstore.as_retriever())
    return pdf_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa, gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)
def add_text(chatstate, query, qa):
    """Run the query through the QA chain and append the (question, answer)
    pair to the chat history. Returns the history twice (Chatbot + State)
    plus the unchanged qa chain."""
    answer = qa.run(query)
    history = chatstate + [(query, answer)]
    return history, history, qa
with gr.Blocks(css="footer {visibility: hidden}", title='PDF - Q&A') as demo:
    # One gr.State per value. The original chained assignment
    # (qa = pdf_text = ... = gr.State([])) bound a SINGLE State component to
    # all five names, so every "state" silently shared one storage slot.
    qa = gr.State([])
    pdf_text = gr.State([])
    embeddings = gr.State([])
    vectorstore = gr.State([])
    azure_embeddings = gr.State([])
    # Chat UI starts hidden; upload_pdf reveals it once a document is indexed.
    with gr.Row(visible=False) as chat_row:
        chatbot = gr.Chatbot()
    with gr.Row(visible=False) as submit_row:
        text = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)
        chatstate = gr.State([])
        text.submit(add_text, [chatstate, text, qa], [chatbot, chatstate, qa])
    with gr.Column() as upload_column:
        file = gr.File()
        upload_btn = gr.Button("Upload")
        output_text = gr.TextArea()
        upload_btn.click(upload_pdf, inputs=[file, pdf_text, embeddings, vectorstore, azure_embeddings, qa], outputs=[output_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa, chat_row, submit_row, upload_column])
    with gr.Row():
        gr.Markdown("`now with GPT-3.5 Turbo`")
demo.launch(enable_queue=True)