"""Gradio app: upload a PDF, embed its text with Azure OpenAI, and chat over it
with a LangChain RetrievalQA chain backed by a Chroma vector store."""

import os
import time

import gradio as gr
import openai
import tqdm
from langchain import VectorDBQA
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import AzureOpenAI
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from PyPDF2 import PdfReader

# Azure OpenAI configuration (mirrored into os.environ so LangChain picks it up).
os.environ["OPENAI_API_TYPE"] = openai.api_type = "azure"
os.environ["OPENAI_API_VERSION"] = openai.api_version = "2023-03-15-preview"
os.environ["OPENAI_API_BASE"] = openai.api_base = "https://openai-endpoint.openai.azure.com/"
# SECURITY: the key was previously hard-coded in source (a committed secret).
# Read it from the environment instead; set OPENAI_API_KEY before launching.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")


def _embed_with_retry(text, retries=1, backoff=8):
    """Embed one text chunk with ada-002; retry once after a pause on failure.

    Preserves the original behavior: print the error, sleep 8 s, try again
    (a second failure propagates).
    """
    for attempt in range(retries + 1):
        try:
            response = openai.Embedding.create(
                input=text, engine="text-embedding-ada-002")
            return response["data"][0]["embedding"]
        except Exception as e:  # NOTE(review): broad on purpose — rate limits, transient 5xx
            print(e)
            if attempt == retries:
                raise
            time.sleep(backoff)


def upload_pdf(file, pdf_text, embeddings, vectorstore, azure_embeddings, qa,
               progress=gr.Progress(track_tqdm=True)):
    """Read the uploaded PDF, chunk it, embed the chunks, and build the QA chain.

    Parameters mirror the gr.State inputs wired in the UI below; the function
    returns the refreshed state values plus three gr.update() toggles that
    reveal the chat rows and hide the upload column.
    """
    reader = PdfReader(file)
    # extract_text() may return None for image-only pages — coerce to "".
    pdf_text = "".join(page.extract_text() or "" for page in reader.pages)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(pdf_text)

    # Reset rather than append: re-uploading previously accumulated stale
    # vectors, making len(embeddings) disagree with len(texts) in the add() below.
    embeddings = []
    for text in tqdm.tqdm(texts):
        embeddings.append(_embed_with_retry(text))

    azure_embeddings = OpenAIEmbeddings(
        document_model_name="text-embedding-ada-002",
        query_model_name="text-embedding-ada-002",
    )
    vectorstore = Chroma("collection", embedding_function=azure_embeddings)
    vectorstore._collection.add(
        ids=[f"doc_{i}" for i in range(len(texts))],
        documents=texts,
        embeddings=embeddings,
        metadatas=[{"source": "source"} for _ in texts],
    )
    qa = RetrievalQA.from_chain_type(
        llm=AzureChatOpenAI(deployment_name="Bartos", model_name="gpt-35-turbo"),
        chain_type="stuff",
        retriever=vectorstore.as_retriever(),
    )
    # qa = RetrievalQA.from_chain_type(llm=AzureOpenAI(deployment_name="davinci003", model_name="text-davinci-003"), chain_type="stuff", vectorstore=vectorstore)
    return (pdf_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa,
            gr.update(visible=True), gr.update(visible=True), gr.update(visible=False))


def add_text(chatstate, query, qa):
    """Run the query through the QA chain and append (query, answer) to history."""
    chatstate = chatstate + [(query, qa.run(query))]
    return chatstate, chatstate, qa


with gr.Blocks(css="footer {visibility: hidden}", title='PDF - Q&A') as demo:
    # BUGFIX: the original bound ONE gr.State object to all five names
    # (qa = pdf_text = ... = gr.State([])), so every state slot shared a
    # single value. Each piece of session state needs its own State.
    qa = gr.State([])
    pdf_text = gr.State([])
    embeddings = gr.State([])
    vectorstore = gr.State([])
    azure_embeddings = gr.State([])

    with gr.Row(visible=False) as chat_row:
        chatbot = gr.Chatbot()
    with gr.Row(visible=False) as submit_row:
        text = gr.Textbox(
            show_label=False,
            placeholder="Enter text and press enter",
        ).style(container=False)
        chatstate = gr.State([])
        text.submit(add_text, [chatstate, text, qa], [chatbot, chatstate, qa])
    with gr.Column() as upload_column:
        file = gr.File()
        upload_btn = gr.Button("Upload")
        output_text = gr.TextArea()
        upload_btn.click(
            upload_pdf,
            inputs=[file, pdf_text, embeddings, vectorstore, azure_embeddings, qa],
            outputs=[output_text, pdf_text, embeddings, vectorstore,
                     azure_embeddings, qa, chat_row, submit_row, upload_column],
        )
    with gr.Row():
        gr.Markdown("`now with GPT-3.5 Turbo`")

demo.launch(enable_queue=True)