# (Hugging Face Spaces status banner — "Spaces: Running" — left over from
# scraping the hosted page; kept as a comment so the file parses as Python.)
import os
import time

import gradio as gr
import openai
import tqdm
from PyPDF2 import PdfReader
from chromadb.utils import embedding_functions
from langchain import VectorDBQA
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import AzureOpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Azure OpenAI connection settings, mirrored into both the process
# environment (read by langchain) and the openai module globals.
os.environ["OPENAI_API_TYPE"] = openai.api_type = "azure"
os.environ["OPENAI_API_VERSION"] = openai.api_version = "2022-12-01"
os.environ["OPENAI_API_BASE"] = openai.api_base = "https://openai-endpoint.openai.azure.com/"
# SECURITY: hard-coded API key committed to source. Rotate this key and load
# it from the environment / a secret store instead of embedding it here.
os.environ["OPENAI_API_KEY"] = openai.api_key = "f056ead909e54ea0a2fb570e2febad2b"

# NOTE(review): module-level accumulator shared across calls — embeddings from
# one upload would leak into the next; pdf_to_text should use a local list.
embeddings = []
def pdf_to_text(file_obj, pdf_text, vectorstore, progress = gr.Progress(track_tqdm=True)):
    """Extract text from an uploaded PDF, embed it, and index it in Chroma.

    Args:
        file_obj: uploaded file accepted by ``PdfReader`` (path or file object).
        pdf_text: previous gradio state value (ignored; replaced wholesale).
        vectorstore: previous gradio state value (ignored; replaced wholesale).
        progress: gradio progress tracker fed by the ``tqdm`` loop below.

    Returns:
        Tuple ``(pdf_text, vectorstore)``: the concatenated page text and a
        Chroma store containing one embedded document per text chunk.
    """
    reader = PdfReader(file_obj)
    pdf_text = "".join(page.extract_text() for page in reader.pages)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(pdf_text)

    # Use a local list — the original appended to the module-level `embeddings`
    # global, so chunks from a previous upload leaked into the next upload's
    # collection and the ids/documents/embeddings lists went out of sync.
    chunk_embeddings = []
    for text in tqdm.tqdm(texts):
        for attempt in range(2):
            try:
                response = openai.Embedding.create(
                    input=text,
                    engine="text-embedding-ada-002")
                chunk_embeddings.append(response['data'][0]['embedding'])
                break
            except Exception as e:
                # Presumably rate limiting — back off once, then re-raise so
                # a persistent failure is not silently swallowed.
                print(e)
                if attempt == 1:
                    raise
                time.sleep(5)

    azure_embeddings = OpenAIEmbeddings(document_model_name="text-embedding-ada-002",
                                        query_model_name="text-embedding-ada-002")
    vectorstore = Chroma("collection", embedding_function=azure_embeddings)
    vectorstore._collection.add(
        ids=[f"doc_{i}" for i in range(len(texts))],
        documents=texts,
        embeddings=chunk_embeddings,
        metadatas=[{"source": "source"} for _ in texts],
    )
    return pdf_text, vectorstore
def add_text(state, query, vectorstore):
    """Answer ``query`` against ``vectorstore`` and append the turn to the chat.

    Args:
        state: chat history as a list of ``(question, answer)`` tuples.
        query: user question from the textbox.
        vectorstore: Chroma store produced by ``pdf_to_text``.

    Returns:
        ``(state, state, vectorstore)`` — the history twice (chatbot display
        and state component) plus the unchanged store, matching the gradio
        output wiring in the UI below.
    """
    qa_chain = VectorDBQA.from_chain_type(
        llm=AzureOpenAI(deployment_name="davinci003", model_name="text-davinci-003"),
        chain_type="stuff",
        vectorstore=vectorstore,
    )
    # Distinct names: the original reused `qa` for both the chain and answer.
    answer = qa_chain.run(query)
    state = state + [(query, answer)]
    return state, state, vectorstore
# Gradio UI: upload a PDF (indexed by pdf_to_text), then chat over it.
with gr.Blocks(title="AOAI") as demo:
    pdf_text = gr.State([])
    vectorstore = gr.State([])
    text_box = gr.TextArea()
    upload_button = gr.UploadButton("Click to Upload a File", file_types=["pdf"])
    upload_button.upload(pdf_to_text,
                         inputs=[upload_button, pdf_text, vectorstore],
                         outputs=[pdf_text, vectorstore])
    with gr.Row():
        chatbot = gr.Chatbot()
        state = gr.State([])
        text = gr.Textbox(show_label=False,
                          placeholder="Enter text and press enter").style(container=False)
    text.submit(add_text, [state, text, vectorstore], [chatbot, state, vectorstore])
# NOTE(review): no demo.launch()/demo.queue() is visible in this chunk —
# confirm the app is launched elsewhere (e.g. by the Spaces runtime).