hfwittmann committed on
Commit
6a9b66f
·
1 Parent(s): 23c2d7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +215 -4
app.py CHANGED
@@ -1,7 +1,218 @@
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- iface.launch()
 
1
+ import os
2
+ from typing import Any
3
+
4
  import gradio as gr
5
+ import openai
6
+ import pandas as pd
7
+ from IPython.display import Markdown, display
8
+ from langchain.document_loaders import PyPDFLoader
9
+ from langchain.embeddings import OpenAIEmbeddings
10
+ from langchain.indexes import VectorstoreIndexCreator
11
+ from langchain.text_splitter import CharacterTextSplitter
12
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
13
+ from langchain.llms import OpenAI
14
+ from langchain.vectorstores import DocArrayInMemorySearch
15
+ from uuid import uuid4
16
+
17
# Custom CSS passed to ``gr.Blocks(css=...)``.  A generic ``monospace``
# fallback is listed after the named font so the page still renders in a
# monospaced face when "IBM Plex Mono" is not available in the browser.
css_style = """
.gradio-container {
    font-family: "IBM Plex Mono", monospace;
}
"""
22
+
23
+
24
class myClass:
    """Per-session state of the document question-answering app.

    Tracks whether an API key and documents have been supplied, keeps the
    langchain embedding / LLM clients built from the key, and owns the
    vector index that questions are answered against.

    Attributes:
        openai_api_key: The stripped key last passed to ``validate_key``.
        valid_key: True once a non-empty key has been supplied.
        docs_ready: True once a non-empty document table has been supplied.
        status: Human-readable status string (one of the ``STATUS_*`` values).
        dataset: The last DataFrame passed to ``validate_dataset`` (or None).
        index: The built vector index, or None when it must be (re)built.
        uuid: Random identifier created for this instance.
    """

    STATUS_READY = "✨Ready✨"
    STATUS_NEED_KEY = "⚠️Waiting for key⚠️"
    STATUS_NEED_DOCS = "⚠️Waiting for documents⚠️"
    STATUS_NEED_BOTH = "⚠️Waiting for documents and key⚠️"

    # Column layout of the document table shown in the UI.
    COLUMNS = ["filepath", "citation string", "key"]

    def __init__(self) -> None:
        # Not read anywhere in this class; kept so existing attribute
        # access from outside keeps working.
        self.openapi = ""
        self.openai_api_key = ""
        self.embedding = None
        self.llm = None
        self.dataset = None
        self.index = None
        self.valid_key = False
        self.docs_ready = False
        self.status = self.STATUS_NEED_BOTH
        self.uuid = uuid4()

    def check_status(self):
        """Recompute ``self.status`` from ``docs_ready`` and ``valid_key``."""
        if self.docs_ready and self.valid_key:
            out = self.STATUS_READY
        elif self.docs_ready:
            out = self.STATUS_NEED_KEY
        elif self.valid_key:
            out = self.STATUS_NEED_DOCS
        else:
            out = self.STATUS_NEED_BOTH

        self.status = out

    def validate_key(self, myin):
        """Store the API key and (re)create the clients that depend on it.

        The key counts as valid when it is non-empty after stripping
        whitespace; this method itself performs no further verification.

        Args:
            myin: The key text as entered by the user.

        Returns:
            A one-element list holding the updated status string.

        Raises:
            TypeError: If ``myin`` is not a string.
        """
        if not isinstance(myin, str):
            raise TypeError(
                f"API key must be a string, got {type(myin).__name__}"
            )

        key = myin.strip()
        if key != self.openai_api_key:
            # An index built earlier holds clients created from the previous
            # key; drop it so it is rebuilt with the new one on next use.
            self.index = None

        self.openai_api_key = key
        self.valid_key = bool(key)
        if self.valid_key:
            self.embedding = OpenAIEmbeddings(openai_api_key=key)
            self.llm = OpenAI(openai_api_key=key)
        else:
            self.embedding = None
            self.llm = None

        self.check_status()
        return [self.status]

    def request_pathname(self, files, data):
        """Merge newly uploaded files into ``data`` and refresh the status.

        Args:
            files: Iterable of uploaded file objects exposing ``.name``,
                or None when the upload widget has been cleared.
            data: List of ``[filepath, citation string, key]`` rows; new
                rows are appended to this list in place.

        Returns:
            A ``(DataFrame, status)`` tuple: the document table and the
            current status string.
        """
        if files is None:
            self.docs_ready = False
            self.check_status()
            return (
                pd.DataFrame(data, columns=self.COLUMNS),
                self.status,
            )

        # Track known paths in a set so duplicates (against earlier uploads
        # and within this upload) are skipped without rescanning ``data``.
        known_paths = {row[0] for row in data}
        for file in files:
            if file.name in known_paths:
                continue
            data.append([file.name, None, None])
            known_paths.add(file.name)

        mydataset = pd.DataFrame(data, columns=self.COLUMNS)
        validation_button = self.validate_dataset(mydataset)

        return mydataset, validation_button

    def validate_dataset(self, dataset):
        """Record ``dataset``, update readiness and build the index if ready.

        Args:
            dataset: DataFrame whose first column holds the file paths.

        Returns:
            The updated status string.
        """
        self.dataset = dataset
        # An empty table has no last row to inspect; treat it as "no docs"
        # instead of letting ``iloc[-1, 0]`` raise IndexError.
        self.docs_ready = (not dataset.empty) and bool(dataset.iloc[-1, 0] != "")

        self.check_status()

        if self.status == self.STATUS_READY:
            self.get_index()

        return self.status

    def get_index(self):
        """Build the vector index over every file listed in ``self.dataset``.

        Does nothing unless both documents and a key are available.
        """
        if not (self.docs_ready and self.valid_key):
            return

        loaders = [PyPDFLoader(path) for path in self.dataset["filepath"]]

        # Split on sentence ends into chunks of up to ~1000 characters with a
        # 20-character overlap.  ``separators`` is passed as a list.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=20,
            length_function=len,
            separators=["."],
        )

        self.index = VectorstoreIndexCreator(
            vectorstore_cls=DocArrayInMemorySearch,
            embedding=self.embedding,
            text_splitter=splitter,
        ).from_loaders(loaders=loaders)

    def do_ask(self, question):
        """Answer ``question`` against the indexed documents.

        This is a generator that always yields exactly one string: the
        answer when the instance is ready, otherwise the current status
        text so the caller still has something to display.

        Args:
            question: The question text.

        Yields:
            The answer, or the status string when not ready.
        """
        if self.status != self.STATUS_READY:
            yield self.status
            return

        if self.index is None:
            # Covers the "documents first, key afterwards" order (and a
            # changed key), where no usable index exists yet.
            self.get_index()

        yield self.index.query(question=question, llm=self.llm)
121
+
122
+
123
def validate_key(myInstance: myClass, openai_api_key):
    """Gradio callback: hand the entered API key to the session object.

    When ``myInstance`` is None a fresh ``myClass`` is created first.  The
    key is forwarded to ``myClass.validate_key``; the session object is
    returned followed by every value that call produced.
    """
    session = myClass() if myInstance is None else myInstance
    status_values = session.validate_key(openai_api_key)
    return (session, *status_values)
129
+
130
+
131
def request_pathname(myInstance: myClass, files, data):
    """Gradio callback: register uploaded files with the session object.

    When ``myInstance`` is None a fresh ``myClass`` is created first.
    ``files`` and ``data`` are forwarded to ``myClass.request_pathname``;
    the session object is returned followed by every value that call
    produced.
    """
    session = myClass() if myInstance is None else myInstance
    table_and_status = session.request_pathname(files, data)
    return (session, *table_and_status)
136
+
137
+
138
def do_ask(myInstance: "myClass", question):
    """Gradio callback: answer ``question`` using the session object.

    Always returns exactly two values so they line up with the two outputs
    this callback is wired to.

    Args:
        myInstance: The session object, or None when no key or document
            has been supplied yet (a fresh ``myClass`` is created then,
            matching the other callbacks).
        question: The question text.

    Returns:
        ``(myInstance, answer)``.  When the session produces no answer,
        ``answer`` is the session's current status text instead.
    """
    if myInstance is None:
        myInstance = myClass()

    # ``myClass.do_ask`` is a generator; drain it and keep the last value.
    answers = list(myInstance.do_ask(question))
    answer = answers[-1] if answers else myInstance.status
    return myInstance, answer
141
+
142
+
143
# Build the UI.  Only the two State components that the event handlers
# below actually use are created: the per-session ``myClass`` object and the
# list of document rows.
with gr.Blocks(css=css_style) as demo:
    myInstance = gr.State()
    data = gr.State([])

    gr.Markdown(
        """
    # Document Question and Answer
    *By D8a.ai*
    Idea based on https://huggingface.co/spaces/whitead/paper-qa
    Significant advances in langchain have made it possible to simplify the code.
    This tool allows you to ask questions of your uploaded text, PDF documents.
    It uses OpenAI's GPT models, so you need to enter your API key below. This
    tool is under active development and currently uses a lot of tokens - up to 10,000
    for a single query. This is $0.10-0.20 per query, so please be careful!
    * [langchain](https://github.com/hwchase17/langchain) is the main library this tool utilizes.
    1. Enter API Key ([What is that?](https://platform.openai.com/account/api-keys))
    2. Upload your documents
    3. Ask questions
    """
    )

    openai_api_key = gr.Textbox(
        label="OpenAI API Key", placeholder="sk-...", type="password"
    )
    with gr.Tab("File upload"):
        uploaded_files = gr.File(
            label="Upload your pdf Dokument", file_count="multiple"
        )

    with gr.Accordion("See Docs:", open=False):
        dataset = gr.Dataframe(
            headers=["filepath", "citation string", "key"],
            datatype=["str", "str", "str"],
            col_count=(3, "fixed"),
            interactive=False,
            label="Documents and Citations",
            overflow_row_behaviour="paginate",
            max_rows=5,
        )

    # Status box; the initial text matches the status string the session
    # object reports while both the key and the documents are missing.
    buildb = gr.Textbox(
        "⚠️Waiting for documents and key⚠️",
        label="Status",
        interactive=False,
        show_label=True,
        max_lines=1,
    )

    query = gr.Textbox(placeholder="Enter your question here...", label="Question")
    ask = gr.Button("Ask Question")
    answer = gr.Markdown(label="Answer")

    # Key edits update the session object and the status box.
    openai_api_key.change(
        validate_key, inputs=[myInstance, openai_api_key], outputs=[myInstance, buildb]
    )

    # Upload changes update the session object, the document table and the
    # status box.
    uploaded_files.change(
        request_pathname,
        inputs=[myInstance, uploaded_files, data],
        outputs=[myInstance, dataset, buildb],
    )

    # Clicking "Ask Question" writes the reply into the answer area.
    ask.click(
        do_ask,
        inputs=[myInstance, query],
        outputs=[myInstance, answer],
    )


demo.queue(concurrency_count=20)
demo.launch(show_error=True)