cloud-sean committed on
Commit 3be2bfb · 1 Parent(s): a21259c

Update app.py

Files changed (1)
  1. app.py +37 -57

app.py CHANGED
@@ -1,55 +1,31 @@
  import gradio as gr
- from PyPDF2 import PdfReader
+ import openai, os
  import tqdm
- import os
- import openai
- import time
- import gradio as gr
- from langchain.embeddings.openai import OpenAIEmbeddings
- from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
  from langchain.vectorstores import Chroma
- from langchain.docstore.document import Document
- from langchain.prompts import PromptTemplate
- from langchain.document_loaders import TextLoader
- from langchain.chains.question_answering import load_qa_chain
- from langchain.llms import AzureOpenAI
- from chromadb.utils import embedding_functions
- from langchain.text_splitter import CharacterTextSplitter
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
  from langchain.embeddings.openai import OpenAIEmbeddings
- from langchain.vectorstores import Chroma
  from langchain import VectorDBQA
  from langchain.llms import AzureOpenAI
- import openai
-

  os.environ["OPENAI_API_TYPE"] = openai.api_type = "azure"
  os.environ["OPENAI_API_VERSION"] = openai.api_version = "2022-12-01"
  os.environ["OPENAI_API_BASE"] = openai.api_base = "https://openai-endpoint.openai.azure.com/"
  os.environ["OPENAI_API_KEY"] = openai.api_key = "f056ead909e54ea0a2fb570e2febad2b"

-
- embeddings = []
-
-
- def pdf_to_text(file_obj, pdf_text, vectorstore, progress = gr.Progress(track_tqdm=True)):
-     reader = PdfReader(file_obj)
+ def upload_pdf(file, pdf_text, embeddings, vectorstore, azure_embeddings, qa, progress = gr.Progress(track_tqdm=True)):
+     reader = PdfReader(file)
      number_of_pages = len(reader.pages)
      pdf_text = ""
      for page_number in range(number_of_pages):
          page = reader.pages[page_number]
          pdf_text += page.extract_text()
-
      text_splitter = RecursiveCharacterTextSplitter(
          chunk_size = 1000,
          chunk_overlap = 200,
          length_function = len,)
      texts = text_splitter.split_text(pdf_text)
-
-
-
-
      for text in tqdm.tqdm(texts):
-
          try:
              response = openai.Embedding.create(
                  input=text,
@@ -64,46 +40,50 @@ def pdf_to_text(file_obj, pdf_text, vectorstore, progress = gr.Progress(track_tq
                  engine="text-embedding-ada-002")
              emb = response['data'][0]['embedding']
              embeddings.append(emb)
-

+
      azure_embeddings = OpenAIEmbeddings(document_model_name="text-embedding-ada-002",query_model_name="text-embedding-ada-002")
      vectorstore = Chroma("collection", embedding_function=azure_embeddings)
+
      vectorstore._collection.add(
-         ids= [f"doc_{i}" for i in range(len(texts))],
-         documents=texts,
-         embeddings=embeddings,
-         metadatas=[{"source": "source"} for text in texts]
-     )
+         ids= [f"doc_{i}" for i in range(len(texts))],
+         documents=texts,
+         embeddings=embeddings,
+         metadatas=[{"source": "source"} for text in texts])
+     qa = VectorDBQA.from_chain_type(llm= AzureOpenAI(deployment_name="davinci003", model_name="text-davinci-003"), chain_type="stuff", vectorstore=vectorstore)

-
+     return pdf_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa, gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)


-     return pdf_text, vectorstore
+ def add_text(chatstate, query, qa):
+     # chain.run(input_documents=docs, question=query)
+     chatstate = chatstate + [(query, qa.run(query))]
+
+     return chatstate, chatstate, qa
+
+ with gr.Blocks() as demo:
+     qa = pdf_text = embeddings = vectorstore = azure_embeddings = gr.State([])
+     with gr.Row(visible=False) as chat_row:
+         chatbot = gr.Chatbot()
+     with gr.Row(visible=False) as submit_row:
+         text = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)
+         chatstate = gr.State([])
+         text.submit(add_text, [chatstate, text, qa], [chatbot, chatstate, qa])

- def add_text(state, query, vectorstore):

-     # state = state + [(text, text + "?")]
-     qa = VectorDBQA.from_chain_type(llm= AzureOpenAI(deployment_name="davinci003", model_name="text-davinci-003"), chain_type="stuff", vectorstore=vectorstore)
-     qa = qa.run(query)
-     # chain.run(input_documents=docs, question=query)
-     state = state + [(query, qa)]
-     return state, state, vectorstore


- with gr.Blocks(title="AOAI") as demo:
-     pdf_text = gr.State([])
-     vectorstore = gr.State([])
-     text_box = gr.TextArea()
-     upload_button = gr.UploadButton("Click to Upload a File", file_types=["pdf"])
-     upload_button.upload(pdf_to_text, inputs=[upload_button, pdf_text, vectorstore], outputs=[pdf_text, vectorstore])
+     # set state
+     with gr.Column() as upload_column:
+
+         file = gr.File()
+         upload_btn = gr.Button("Upload")
+         output_text = gr.TextArea()
+         upload_btn.click(upload_pdf, inputs=[file, pdf_text, embeddings, vectorstore, azure_embeddings, qa], outputs=[output_text, pdf_text, embeddings, vectorstore, azure_embeddings, qa, chat_row, submit_row, upload_column])

-     with gr.Row():
-         chatbot = gr.Chatbot()
-         state = gr.State([])

-
-     text = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)
-
-     text.submit(add_text, [state, text, vectorstore], [chatbot, state, vectorstore])
+
+
+

  demo.launch(enable_queue=True)
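
For reference, a minimal sketch of driving the retrieval chain built in upload_pdf outside the Gradio UI, assuming the same legacy LangChain APIs, Azure OpenAI environment settings, and deployment names used in the diff above; the sample question is hypothetical and this code is an illustration, not part of the commit.

    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.vectorstores import Chroma
    from langchain.llms import AzureOpenAI
    from langchain import VectorDBQA

    # Same embedding model and collection name as app.py; assumes the
    # OPENAI_* environment variables are already configured as above.
    azure_embeddings = OpenAIEmbeddings(document_model_name="text-embedding-ada-002",
                                        query_model_name="text-embedding-ada-002")
    vectorstore = Chroma("collection", embedding_function=azure_embeddings)
    # ... populate the collection the way upload_pdf() does ...

    # Build the retrieval QA chain over the populated store and ask a question.
    qa = VectorDBQA.from_chain_type(
        llm=AzureOpenAI(deployment_name="davinci003", model_name="text-davinci-003"),
        chain_type="stuff",
        vectorstore=vectorstore)
    print(qa.run("What is this document about?"))  # hypothetical query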