Spaces:
Running
Running
File size: 3,679 Bytes
74748ba 2d2e179 74748ba 2d2e179 74748ba 2d2e179 74748ba 2d2e179 74748ba 2d2e179 14b60e8 2d2e179 74748ba 2d2e179 74748ba 2d2e179 74748ba 2d2e179 74748ba 2d2e179 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import gradio as gr
from PyPDF2 import PdfReader
import tqdm
import os
import openai
import time
import gradio as gr
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.document_loaders import TextLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import AzureOpenAI
from chromadb.utils import embedding_functions
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain import VectorDBQA
from langchain.llms import AzureOpenAI
import openai
# Azure OpenAI connection settings.
#
# Values already present in the process environment take precedence; the
# literals below are only fallback defaults. This lets a deployment supply or
# rotate its endpoint and key without editing the source.
os.environ.setdefault("OPENAI_API_TYPE", "azure")
os.environ.setdefault("OPENAI_API_VERSION", "2022-12-01")
os.environ.setdefault("OPENAI_API_BASE", "https://openai-endpoint.openai.azure.com/")
# SECURITY(review): this fallback is a credential committed to source control.
# It should be treated as leaked: rotate it, provide the replacement through
# the OPENAI_API_KEY environment variable / secret store, and then delete
# this default.
os.environ.setdefault("OPENAI_API_KEY", "f056ead909e54ea0a2fb570e2febad2b")

# Mirror the effective settings onto the openai module so direct
# `openai.*` calls and environment-driven clients agree.
openai.api_type = os.environ["OPENAI_API_TYPE"]
openai.api_version = os.environ["OPENAI_API_VERSION"]
openai.api_base = os.environ["OPENAI_API_BASE"]
openai.api_key = os.environ["OPENAI_API_KEY"]

# Shared module-level list of embedding vectors; nothing in this section
# ever clears it.
embeddings = []
def _embed_with_retry(text, attempts=3, delay_seconds=5):
    """Return the embedding vector for *text*, retrying failed API calls.

    Args:
        text: The chunk of text to embed.
        attempts: Total number of tries before giving up.
        delay_seconds: Pause between consecutive tries.

    Raises:
        openai.error.OpenAIError: If the final attempt also fails.
    """
    for attempt in range(1, attempts + 1):
        try:
            response = openai.Embedding.create(
                input=text,
                engine="text-embedding-ada-002",
            )
        except openai.error.OpenAIError as exc:
            if attempt == attempts:
                raise
            print(exc)
            time.sleep(delay_seconds)
        else:
            return response['data'][0]['embedding']


def pdf_to_text(file_obj, pdf_text, vectorstore, progress=gr.Progress(track_tqdm=True)):
    """Extract the text of an uploaded PDF and index it in a Chroma store.

    The text of every page is concatenated, split into overlapping chunks,
    each chunk is embedded with one `openai.Embedding.create` call, and the
    chunks plus their vectors are added to a Chroma collection.

    Args:
        file_obj: The uploaded file, passed straight to `PdfReader`.
        pdf_text: Previous text state; ignored and replaced.
        vectorstore: Previous vector store state; ignored and replaced.
        progress: Gradio progress object created with `track_tqdm=True`;
            not referenced in the body (presumably it lets Gradio mirror
            the tqdm loop below -- confirm against the Gradio docs).

    Returns:
        A `(pdf_text, vectorstore)` tuple: the full extracted text and the
        Chroma store holding the embedded chunks.
    """
    reader = PdfReader(file_obj)
    # extract_text() can yield None for pages without a text layer; treat
    # those as empty instead of failing on string concatenation.
    pdf_text = "".join(page.extract_text() or "" for page in reader.pages)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(pdf_text)

    # Keep the vectors local to this call. Accumulating them in a shared
    # module-level list makes a second upload pass more embeddings than
    # documents to the collection.
    chunk_embeddings = [_embed_with_retry(text) for text in tqdm.tqdm(texts)]

    azure_embeddings = OpenAIEmbeddings(
        document_model_name="text-embedding-ada-002",
        query_model_name="text-embedding-ada-002",
    )
    # NOTE(review): the collection name and the doc_<i> ids are fixed, so
    # whether successive uploads share one underlying collection depends on
    # the installed chromadb version -- confirm.
    vectorstore = Chroma("collection", embedding_function=azure_embeddings)

    # Nothing to index for a PDF with no extractable text.
    if texts:
        vectorstore._collection.add(
            ids=[f"doc_{i}" for i in range(len(texts))],
            documents=texts,
            embeddings=chunk_embeddings,
            metadatas=[{"source": "source"} for _ in texts],
        )
    return pdf_text, vectorstore
def add_text(state, query, vectorstore):
    """Answer *query* against the indexed PDF and append the turn to the chat.

    Args:
        state: Chat history as a list of `(question, answer)` tuples.
        query: The text the user submitted.
        vectorstore: The vector store produced by the upload step, or the
            initial placeholder (`None` / a list) when nothing was uploaded.

    Returns:
        A `(history, history, vectorstore)` tuple; the history is returned
        twice because the caller binds it to two outputs. The input `state`
        list is never mutated.
    """
    # Ignore blank submissions instead of spending an LLM call on them.
    if not query or not query.strip():
        return state, state, vectorstore

    # Before the first upload the store is still its placeholder value (the
    # UI initialises it to an empty list); building a QA chain on that fails,
    # so reply with a hint instead.
    if vectorstore is None or isinstance(vectorstore, list):
        state = state + [(query, "Please upload a PDF before asking a question.")]
        return state, state, vectorstore

    llm = AzureOpenAI(deployment_name="davinci003", model_name="text-davinci-003")
    qa_chain = VectorDBQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        vectorstore=vectorstore,
    )
    answer = qa_chain.run(query)

    state = state + [(query, answer)]
    return state, state, vectorstore
# Gradio UI: upload a PDF to index it, then ask questions in the chat box.
# NOTE(review): the original indentation of this section was lost, so which
# components sit inside the Row is a reconstruction -- confirm the layout.
with gr.Blocks(title="AOAI") as demo:
    # Per-session state: the extracted PDF text and the vector store.
    pdf_text = gr.State([])
    vectorstore = gr.State([])

    # Not wired to any event in this section.
    text_box = gr.TextArea()

    # File-type filters are extensions with a leading dot (or a category
    # such as "image"); a bare "pdf" is neither.
    upload_button = gr.UploadButton("Click to Upload a File", file_types=[".pdf"])
    upload_button.upload(
        pdf_to_text,
        inputs=[upload_button, pdf_text, vectorstore],
        outputs=[pdf_text, vectorstore],
    )

    with gr.Row():
        chatbot = gr.Chatbot()
        state = gr.State([])  # chat history as (question, answer) tuples
        text = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)

    # add_text returns the history twice: once for the visible chatbot and
    # once for the stored state.
    text.submit(add_text, [state, text, vectorstore], [chatbot, state, vectorstore])
|