"""Gradio demo: upload a PDF, embed its text with Azure OpenAI, and chat over it."""
import os
import time

import gradio as gr
import openai
import tqdm
from PyPDF2 import PdfReader
from chromadb.utils import embedding_functions
from langchain import VectorDBQA
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import AzureOpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# --- Azure OpenAI configuration -------------------------------------------
# Both the env vars (read by langchain) and the openai module globals are set.
# SECURITY: a live-looking API key is hardcoded below. Rotate this key and
# load it from the environment / a secrets manager before sharing this file.
os.environ["OPENAI_API_TYPE"] = openai.api_type = "azure"
os.environ["OPENAI_API_VERSION"] = openai.api_version = "2022-12-01"
os.environ["OPENAI_API_BASE"] = openai.api_base = "https://openai-endpoint.openai.azure.com/"
os.environ["OPENAI_API_KEY"] = openai.api_key = "f056ead909e54ea0a2fb570e2febad2b"


def _embed_with_retry(text, retries=1, delay=5):
    """Return the ada-002 embedding for *text*.

    Retries once after *delay* seconds on any error (rate limits are the
    common case); re-raises if the final attempt also fails, matching the
    original inline try/except behavior.
    """
    for attempt in range(retries + 1):
        try:
            response = openai.Embedding.create(
                input=text, engine="text-embedding-ada-002"
            )
            return response["data"][0]["embedding"]
        except Exception as e:  # transient API/rate-limit errors
            print(e)
            if attempt == retries:
                raise
            time.sleep(delay)


def pdf_to_text(file_obj, pdf_text, vectorstore, progress=gr.Progress(track_tqdm=True)):
    """Extract text from an uploaded PDF, chunk it, embed every chunk, and
    load the chunks + vectors into a fresh Chroma collection.

    Parameters mirror the Gradio wiring: *file_obj* is the uploaded file,
    *pdf_text* / *vectorstore* are State values that are overwritten here.
    Returns (full_pdf_text, vectorstore) for the State outputs.
    """
    reader = PdfReader(file_obj)
    pdf_text = "".join(page.extract_text() for page in reader.pages)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = splitter.split_text(pdf_text)

    # BUG FIX: `embeddings` used to be a module-level list, so a second
    # upload appended to the first upload's vectors and its length no
    # longer matched len(texts) in _collection.add(). Keep it per-call.
    embeddings = [_embed_with_retry(chunk) for chunk in tqdm.tqdm(texts)]

    azure_embeddings = OpenAIEmbeddings(
        document_model_name="text-embedding-ada-002",
        query_model_name="text-embedding-ada-002",
    )
    vectorstore = Chroma("collection", embedding_function=azure_embeddings)
    # Vectors were computed above, so add them directly to the underlying
    # collection instead of letting Chroma re-embed the documents.
    vectorstore._collection.add(
        ids=[f"doc_{i}" for i in range(len(texts))],
        documents=texts,
        embeddings=embeddings,
        metadatas=[{"source": "source"} for _ in texts],
    )
    return pdf_text, vectorstore


def add_text(state, query, vectorstore):
    """Answer *query* against the uploaded PDF and append the turn to chat state.

    Returns (chatbot_history, state, vectorstore) for the Gradio outputs —
    the history is returned twice because Chatbot and State share it.
    """
    qa = VectorDBQA.from_chain_type(
        llm=AzureOpenAI(deployment_name="davinci003", model_name="text-davinci-003"),
        chain_type="stuff",
        vectorstore=vectorstore,
    )
    answer = qa.run(query)
    state = state + [(query, answer)]
    return state, state, vectorstore


with gr.Blocks(title="AOAI") as demo:
    pdf_text = gr.State([])
    vectorstore = gr.State([])
    # NOTE(review): this TextArea is never wired to any event, so the
    # extracted PDF text is stored in State but never displayed — confirm
    # whether it should be added to the upload outputs.
    text_box = gr.TextArea()
    upload_button = gr.UploadButton("Click to Upload a File", file_types=["pdf"])
    upload_button.upload(
        pdf_to_text,
        inputs=[upload_button, pdf_text, vectorstore],
        outputs=[pdf_text, vectorstore],
    )
    with gr.Row():
        chatbot = gr.Chatbot()
        state = gr.State([])
        text = gr.Textbox(
            show_label=False, placeholder="Enter text and press enter"
        ).style(container=False)
        text.submit(add_text, [state, text, vectorstore], [chatbot, state, vectorstore])

if __name__ == "__main__":
    # Guarded so importing this module no longer starts the server as a
    # side effect; running the script behaves exactly as before.
    demo.launch(enable_queue=True)