# app.py — Gradio demo: upload a PDF, embed it with Azure OpenAI, store the
# chunks in Chroma, and answer questions over them with a LangChain QA chain.
# (Hugging Face file-viewer chrome removed; revision 2d2e179.)
import gradio as gr
from PyPDF2 import PdfReader
import tqdm
import os
import openai
import time
import gradio as gr
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.document_loaders import TextLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import AzureOpenAI
from chromadb.utils import embedding_functions
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain import VectorDBQA
from langchain.llms import AzureOpenAI
import openai
# Azure OpenAI configuration — mirrored into both os.environ (for langchain)
# and the openai module globals (for direct openai.* calls below).
os.environ["OPENAI_API_TYPE"] = openai.api_type = "azure"
os.environ["OPENAI_API_VERSION"] = openai.api_version = "2022-12-01"
os.environ["OPENAI_API_BASE"] = openai.api_base = os.environ.get(
    "OPENAI_API_BASE", "https://openai-endpoint.openai.azure.com/"
)
# SECURITY FIX: an API key was previously hard-coded on this line and committed
# to source control. It is now read from the environment; the leaked key must
# be rotated in the Azure portal regardless.
os.environ["OPENAI_API_KEY"] = openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Module-level accumulator of embedding vectors (shared across calls).
embeddings = []
def pdf_to_text(file_obj, pdf_text, vectorstore, progress = gr.Progress(track_tqdm=True)):
    """Extract text from an uploaded PDF, embed each chunk, and build a Chroma store.

    Parameters
    ----------
    file_obj : file-like
        The uploaded PDF (from gr.UploadButton).
    pdf_text : Any
        Incoming gr.State value; ignored and overwritten with the extracted text.
    vectorstore : Any
        Incoming gr.State value; ignored and overwritten with a new Chroma store.
    progress : gr.Progress
        Surfaces the tqdm embedding loop as a progress bar in the UI.

    Returns
    -------
    tuple[str, Chroma]
        The full extracted text and the populated vector store.
    """
    reader = PdfReader(file_obj)
    pdf_text = "".join(page.extract_text() for page in reader.pages)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(pdf_text)

    # BUG FIX: embed into a LOCAL list. The original appended to the
    # module-level `embeddings`, so every upload after the first passed a
    # vector list that was longer than (and misaligned with) `texts`.
    doc_embeddings = []
    for chunk in tqdm.tqdm(texts):
        for attempt in range(2):  # one retry after a short back-off, as before
            try:
                response = openai.Embedding.create(
                    input=chunk,
                    engine="text-embedding-ada-002")
                doc_embeddings.append(response['data'][0]['embedding'])
                break
            except Exception as e:  # e.g. rate limit; log, wait, retry once
                print(e)
                if attempt == 1:
                    raise  # second failure propagates, matching the original flow
                time.sleep(5)

    azure_embeddings = OpenAIEmbeddings(
        document_model_name="text-embedding-ada-002",
        query_model_name="text-embedding-ada-002")
    vectorstore = Chroma("collection", embedding_function=azure_embeddings)
    # Insert chunks with precomputed vectors directly into the raw collection.
    vectorstore._collection.add(
        ids=[f"doc_{i}" for i in range(len(texts))],
        documents=texts,
        embeddings=doc_embeddings,
        metadatas=[{"source": "source"} for _ in texts],
    )
    return pdf_text, vectorstore
def add_text(state, query, vectorstore):
    """Answer *query* against *vectorstore* and append the turn to the chat.

    Returns (chat_history, chat_history, vectorstore) so Gradio can update
    the Chatbot widget, the state, and pass the store through unchanged.
    """
    # A fresh "stuff" retrieval-QA chain over the store, backed by Azure OpenAI.
    chain = VectorDBQA.from_chain_type(
        llm=AzureOpenAI(deployment_name="davinci003", model_name="text-davinci-003"),
        chain_type="stuff",
        vectorstore=vectorstore,
    )
    answer = chain.run(query)
    history = state + [(query, answer)]
    return history, history, vectorstore
# ---- Gradio UI -------------------------------------------------------------
with gr.Blocks(title="AOAI") as demo:
    # Session state holding the extracted PDF text and the Chroma store.
    pdf_text = gr.State([])
    vectorstore = gr.State([])

    preview_area = gr.TextArea()
    uploader = gr.UploadButton("Click to Upload a File", file_types=["pdf"])
    # On upload: parse and embed the PDF, refreshing both pieces of state.
    uploader.upload(
        pdf_to_text,
        inputs=[uploader, pdf_text, vectorstore],
        outputs=[pdf_text, vectorstore],
    )

    with gr.Row():
        chat_display = gr.Chatbot()

    chat_history = gr.State([])
    query_box = gr.Textbox(
        show_label=False,
        placeholder="Enter text and press enter",
    ).style(container=False)
    # On submit: run retrieval-QA and append the turn to the transcript.
    query_box.submit(
        add_text,
        [chat_history, query_box, vectorstore],
        [chat_display, chat_history, vectorstore],
    )