Spaces:
Running
Running
File size: 3,480 Bytes
74748ba 3be2bfb 2d2e179 3657970 2d2e179 3be2bfb 2d2e179 74748ba 2d2e179 9fa297f 74748ba 4fe0e78 3be2bfb 2d2e179 74748ba 2d2e179 bfc97c8 2d2e179 74748ba 3be2bfb 2d2e179 3be2bfb 2d2e179 3be2bfb 2d2e179 3be2bfb 2d2e179 74748ba 3be2bfb 678e471 3be2bfb 2d2e179 3be2bfb 2d2e179 3be2bfb 2d2e179 8ff6f4e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
import gradio as gr
import openai, os
import tqdm
import time
from langchain.vectorstores import Chroma
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain import VectorDBQA
from langchain.llms import AzureOpenAI
# Mirror the Azure OpenAI connection settings into both the process
# environment (read by langchain's wrappers) and the openai module globals,
# so the two libraries agree on endpoint, API flavor, and version.
os.environ["OPENAI_API_TYPE"] = openai.api_type = "azure"
os.environ["OPENAI_API_VERSION"] = openai.api_version = "2022-12-01"
os.environ["OPENAI_API_BASE"] = openai.api_base = "https://openai-endpoint.openai.azure.com/"
# The key must already be set in the environment; raises KeyError if missing.
openai.api_key =os.environ["OPENAI_API_KEY"]
def _embed_with_retry(text, max_attempts=2, retry_delay=8):
    """Embed one text chunk with Azure OpenAI, retrying on transient errors.

    The original code duplicated the Embedding.create call in the except
    branch and left the retry itself unprotected; here a bounded loop
    sleeps between attempts and re-raises after the final failure.
    """
    for attempt in range(max_attempts):
        try:
            response = openai.Embedding.create(
                input=text,
                engine="text-embedding-ada-002")
            return response['data'][0]['embedding']
        except Exception as e:
            print(e)
            if attempt == max_attempts - 1:
                raise  # out of attempts — surface the error to the caller
            time.sleep(retry_delay)  # likely rate-limited; back off


def upload_pdf(file, pdf_text, embeddings, vectorstore, azure_embeddings, qa, progress = gr.Progress(track_tqdm=True)):
    """Extract text from an uploaded PDF, embed it, and build a QA chain.

    Parameters mirror the gr.State inputs wired in the UI; the returned
    tuple feeds the corresponding outputs plus three visibility updates
    (show chat row, show submit row, hide the upload column).

    Returns:
        (pdf_text for the textbox, pdf_text state, embeddings state,
         vectorstore state, azure_embeddings state, qa state,
         gr.update x3 for row/column visibility)
    """
    reader = PdfReader(file)
    # extract_text() may return None for image-only pages; treat as empty.
    pdf_text = "".join(page.extract_text() or "" for page in reader.pages)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(pdf_text)
    # Build a fresh embedding list: appending to the incoming state list
    # (as before) would desync embeddings from ids/documents on re-upload.
    embeddings = [_embed_with_retry(text) for text in tqdm.tqdm(texts)]
    azure_embeddings = OpenAIEmbeddings(document_model_name="text-embedding-ada-002",query_model_name="text-embedding-ada-002")
    vectorstore = Chroma("collection", embedding_function=azure_embeddings)
    # Insert pre-computed embeddings directly via the underlying collection
    # so Chroma does not re-embed the documents.
    vectorstore._collection.add(
        ids=[f"doc_{i}" for i in range(len(texts))],
        documents=texts,
        embeddings=embeddings,
        metadatas=[{"source": "source"} for _ in texts])
    qa = VectorDBQA.from_chain_type(llm= AzureOpenAI(deployment_name="davinci003", model_name="text-davinci-003"), chain_type="stuff", vectorstore=vectorstore)
    return pdf_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa, gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)
def add_text(chatstate, query, qa):
    """Answer *query* with the QA chain and extend the chat history.

    Returns a new history list (for both the Chatbot display and the
    chat state) along with the unchanged QA chain state.
    """
    answer = qa.run(query)
    updated_history = [*chatstate, (query, answer)]
    return updated_history, updated_history, qa
with gr.Blocks(css="footer {visibility: hidden}") as demo:
    # One gr.State per piece of session data. The original chained
    # assignment bound all five names to the SAME State object, so they
    # silently shared a single value slot.
    pdf_text = gr.State([])
    embeddings = gr.State([])
    vectorstore = gr.State([])
    azure_embeddings = gr.State([])
    qa = gr.State([])
    # Chat UI starts hidden; upload_pdf reveals it once a PDF is indexed.
    with gr.Row(visible=False) as chat_row:
        chatbot = gr.Chatbot()
    with gr.Row(visible=False) as submit_row:
        text = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)
        chatstate = gr.State([])
        text.submit(add_text, [chatstate, text, qa], [chatbot, chatstate, qa])
    with gr.Column() as upload_column:
        file = gr.File()
        upload_btn = gr.Button("Upload")
        output_text = gr.TextArea()
        upload_btn.click(upload_pdf, inputs=[file, pdf_text, embeddings, vectorstore, azure_embeddings, qa], outputs=[output_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa, chat_row, submit_row, upload_column])
demo.launch(enable_queue=True)