hfwittmann committed on
Commit
708e7b3
·
1 Parent(s): 806ab1d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -100
app.py CHANGED
@@ -1,7 +1,15 @@
 
 
 
1
  import gradio as gr
 
2
  import pandas as pd
3
- from pathlib import Path
4
- import os
 
 
 
 
5
 
6
  css_style = """
7
  .gradio-container {
@@ -10,91 +18,101 @@ css_style = """
10
  """
11
 
12
 
13
- def request_pathname(files, data, openai_api_key, index):
14
- if files is None:
15
- return [[]]
16
- for file in files:
17
- # make sure we're not duplicating things in the dataset
18
- if file.name in [x[0] for x in data]:
19
- continue
20
- data.append([file.name, None, None])
21
-
22
- mydataset = pd.DataFrame(data, columns=["filepath", "citation string", "key"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- validation, index = validate_dataset(mydataset, openai_api_key, index)
25
-
26
- return (
27
- [[len(data), 0]],
28
- data,
29
- data,
30
- validation,
31
- index
32
- )
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- def validate_dataset(dataset, openapi, index):
36
- docs_ready = dataset.iloc[-1, 0] != ""
37
-
38
- if docs_ready and type(openapi) is str and len(openapi) > 0:
39
- os.environ["OPENAI_API_KEY"] = openapi.strip()
40
- index = get_index(dataset, openapi, index)
41
- return "✨Ready✨", index
42
- elif docs_ready:
43
- return "⚠️Waiting for key⚠️", index
44
- elif type(openapi) is str and len(openapi) > 0:
45
- return "⚠️Waiting for documents⚠️", index
46
- else:
47
- return "⚠️Waiting for documents and key⚠️", index
48
-
49
-
50
- def get_index(dataset, openapi, index):
51
-
52
- docs_ready = dataset.iloc[-1, 0] != ""
53
 
54
- if docs_ready and type(openapi) is str and len(openapi) > 0:
55
- from langchain.document_loaders import PyPDFLoader
56
- from langchain.vectorstores import DocArrayInMemorySearch
57
- from IPython.display import display, Markdown
58
- from langchain.indexes import VectorstoreIndexCreator
59
 
60
- # myfile = "Angela Merkel - Wikipedia.pdf"
61
- # loader = PyPDFLoader(file_path=myfile)
 
62
 
63
- loader = PyPDFLoader(file_path=dataset["filepath"][0])
64
 
65
- index = VectorstoreIndexCreator(
66
- vectorstore_cls=DocArrayInMemorySearch
67
- ).from_loaders([loader])
68
 
69
- return index
70
 
 
 
 
 
71
 
72
- def make_stats(docs):
73
- return [[len(docs.doc_previews), sum([x[0] for x in docs.doc_previews])]]
74
 
 
 
 
75
 
76
- def do_ask(question, button, openapi, dataset, index):
77
- passages = ""
78
- docs_ready = dataset.iloc[-1, 0] != ""
79
- out = ''
80
- if button == "✨Ready✨" and type(openapi) is str and len(openapi) > 0 and docs_ready:
81
 
 
82
 
83
- # "Please provide a summary of signifcant personal life events of Angela Merkel. Of that summary extract all events with dates and put these into a markdown table."
84
- # limit = f' Limit your answer to a maxmium of {length} words.'
85
-
86
- query = question # + limit
87
-
88
- response = index.query(query)
89
- out = response
90
-
91
- yield out, index
 
 
92
 
93
 
94
  with gr.Blocks(css=css_style) as demo:
95
  docs = gr.State()
96
  data = gr.State([])
97
  openai_api_key = gr.State("")
 
 
 
98
 
99
  gr.Markdown(
100
  """
@@ -102,7 +120,7 @@ with gr.Blocks(css=css_style) as demo:
102
 
103
  *By D8a.ai*
104
 
105
- Based on https://huggingface.co/spaces/whitead/paper-qa
106
 
107
  Significant advances in langchain have made it possible to simplify the code.
108
 
@@ -115,17 +133,16 @@ with gr.Blocks(css=css_style) as demo:
115
  * [langchain](https://github.com/hwchase17/langchain) is the main library this tool utilizes.
116
  1. Enter API Key ([What is that?](https://platform.openai.com/account/api-keys))
117
  2. Upload your documents
118
- 3. Ask a questions
119
  """
120
  )
121
 
122
  openai_api_key = gr.Textbox(
123
  label="OpenAI API Key", placeholder="sk-...", type="password"
124
  )
125
- with gr.Tab("File Upload"):
126
  uploaded_files = gr.File(
127
- label="Your Documents Upload (PDF or txt)",
128
- file_count="multiple",
129
  )
130
 
131
  with gr.Accordion("See Docs:", open=False):
@@ -139,7 +156,6 @@ with gr.Blocks(css=css_style) as demo:
139
  max_rows=5,
140
  )
141
 
142
-
143
  buildb = gr.Textbox(
144
  "⚠️Waiting for documents and key...",
145
  label="Status",
@@ -147,40 +163,27 @@ with gr.Blocks(css=css_style) as demo:
147
  show_label=True,
148
  max_lines=1,
149
  )
150
-
151
- index = gr.State()
152
-
153
- stats = gr.Dataframe(
154
- headers=["Docs", "Chunks"],
155
- datatype=["number", "number"],
156
- col_count=(2, "fixed"),
157
- interactive=False,
158
- label="Doc Stats",
159
- )
160
- openai_api_key.change(
161
- validate_dataset, inputs=[dataset, openai_api_key], outputs=[buildb, index]
162
- )
163
- dataset.change(validate_dataset, inputs=[dataset, openai_api_key, index], outputs=[buildb, index])
164
-
165
-
166
- uploaded_files.change(
167
- request_pathname,
168
- inputs=[uploaded_files, data, openai_api_key, index],
169
- outputs=[stats, data, dataset, buildb, index],
170
- )
171
 
172
  query = gr.Textbox(placeholder="Enter your question here...", label="Question")
173
-
174
- # with gr.Row():
175
- # length = gr.Slider(25, 200, value=100, step=5, label="Words in answer")
176
  ask = gr.Button("Ask Question")
177
  answer = gr.Markdown(label="Answer")
178
 
 
 
 
 
 
 
 
 
 
 
179
  ask.click(
180
- do_ask,
181
- inputs=[query, buildb, openai_api_key, dataset, index],
182
- outputs=[answer, index],
183
  )
184
 
 
185
  demo.queue(concurrency_count=20)
186
  demo.launch(show_error=True)
 
1
+ import os
2
+ from typing import Any
3
+
4
  import gradio as gr
5
+ import openai
6
  import pandas as pd
7
+ from IPython.display import Markdown, display
8
+ from langchain.document_loaders import PyPDFLoader
9
+ from langchain.indexes import VectorstoreIndexCreator
10
+ from langchain.vectorstores import DocArrayInMemorySearch
11
+ from langchain.embeddings import OpenAIEmbeddings
12
+
13
 
14
  css_style = """
15
  .gradio-container {
 
18
  """
19
 
20
 
21
+ class myClass:
22
+ def __init__(self) -> None:
23
+ self.openapi = ""
24
+ self.valid_key = False
25
+ self.docs_ready = False
26
+ self.status = "⚠️Waiting for documents and key⚠️"
27
+ pass
28
+
29
+ def check_status(self):
30
+ if self.docs_ready and self.valid_key:
31
+ out = "✨Ready✨"
32
+ elif self.docs_ready:
33
+ out = "⚠️Waiting for key⚠️"
34
+ elif self.valid_key:
35
+ out = "⚠️Waiting for documents⚠️"
36
+ else:
37
+ out = "⚠️Waiting for documents and key⚠️"
38
+
39
+ self.status = out
40
+
41
+ def validate_key(self, myin):
42
+ assert isinstance(myin, str)
43
+ self.valid_key = True
44
+ self.openai_api_key = myin.strip()
45
 
46
+ self.check_status()
47
+ return self.status
 
 
 
 
 
 
 
48
 
49
+ def request_pathname(self, files, data):
50
+ if files is None:
51
+ self.docs_ready = False
52
+ self.check_status()
53
+ return (
54
+ pd.DataFrame(data, columns=["filepath", "citation string", "key"]),
55
+ self.status,
56
+ )
57
+ for file in files:
58
+ # make sure we're not duplicating things in the dataset
59
+ if file.name in [x[0] for x in data]:
60
+ continue
61
+ data.append([file.name, None, None])
62
 
63
+ mydataset = pd.DataFrame(data, columns=["filepath", "citation string", "key"])
64
+ validation_button = self.validate_dataset(mydataset)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
+ return mydataset, validation_button
 
 
 
 
67
 
68
+ def validate_dataset(self, dataset):
69
+ self.docs_ready = dataset.iloc[-1, 0] != ""
70
+ self.dataset = dataset
71
 
72
+ self.check_status()
73
 
74
+ if self.status == "✨Ready✨":
75
+ self.get_index()
 
76
 
77
+ return self.status
78
 
79
+ def get_index(self):
80
+ if self.docs_ready and self.valid_key:
81
+ # openai = OpenAIEmbeddings(openai_api_key=self.openai_api_key)
82
+ os.environ["OPENAI_API_KEY"] = self.openai_api_key
83
 
 
 
84
 
85
+ # myfile = "Angela Merkel - Wikipedia.pdf"
86
+ # loader = PyPDFLoader(file_path=myfile)
87
+ loader = PyPDFLoader(file_path=self.dataset["filepath"][0])
88
 
89
+ self.index = VectorstoreIndexCreator(
90
+ vectorstore_cls=DocArrayInMemorySearch
91
+ ).from_loaders([loader])
92
+ del os.environ["OPENAI_API_KEY"]
 
93
 
94
+ pass
95
 
96
+ def do_ask(self, question):
97
+ # os.environ["OPENAI_API_KEY"] = self.openai_api_key
98
+ # openai.api_key = self.openai_api_key
99
+ if self.status == "✨Ready✨":
100
+ # openai = OpenAIEmbeddings(openai_api_key=self.openai_api_key)
101
+ os.environ["OPENAI_API_KEY"] = self.openai_api_key
102
+
103
+ response = self.index.query(question=question)
104
+ del os.environ["OPENAI_API_KEY"]
105
+ yield response
106
+ pass
107
 
108
 
109
  with gr.Blocks(css=css_style) as demo:
110
  docs = gr.State()
111
  data = gr.State([])
112
  openai_api_key = gr.State("")
113
+ index = gr.State()
114
+ myInstance = gr.State()
115
+ myInstance = myClass()
116
 
117
  gr.Markdown(
118
  """
 
120
 
121
  *By D8a.ai*
122
 
123
+ Idea based on https://huggingface.co/spaces/whitead/paper-qa
124
 
125
  Significant advances in langchain have made it possible to simplify the code.
126
 
 
133
  * [langchain](https://github.com/hwchase17/langchain) is the main library this tool utilizes.
134
  1. Enter API Key ([What is that?](https://platform.openai.com/account/api-keys))
135
  2. Upload your documents
136
+ 3. Ask questions
137
  """
138
  )
139
 
140
  openai_api_key = gr.Textbox(
141
  label="OpenAI API Key", placeholder="sk-...", type="password"
142
  )
143
+ with gr.Tab("File upload"):
144
  uploaded_files = gr.File(
145
+ label="Upload your pdf Dokument", file_count="multiple"
 
146
  )
147
 
148
  with gr.Accordion("See Docs:", open=False):
 
156
  max_rows=5,
157
  )
158
 
 
159
  buildb = gr.Textbox(
160
  "⚠️Waiting for documents and key...",
161
  label="Status",
 
163
  show_label=True,
164
  max_lines=1,
165
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
  query = gr.Textbox(placeholder="Enter your question here...", label="Question")
 
 
 
168
  ask = gr.Button("Ask Question")
169
  answer = gr.Markdown(label="Answer")
170
 
171
+ openai_api_key.change(
172
+ myInstance.validate_key, inputs=openai_api_key, outputs=buildb
173
+ )
174
+
175
+ uploaded_files.change(
176
+ myInstance.request_pathname,
177
+ inputs=[uploaded_files, data],
178
+ outputs=[dataset, buildb],
179
+ )
180
+
181
  ask.click(
182
+ myInstance.do_ask,
183
+ inputs=[query],
184
+ outputs=answer,
185
  )
186
 
187
+
188
  demo.queue(concurrency_count=20)
189
  demo.launch(show_error=True)