hfwittmann committed on
Commit
806ab1d
·
0 Parent(s):

Duplicate from hfwittmann/simple-paper-qa

Browse files
Files changed (5) hide show
  1. .gitattributes +34 -0
  2. .vscode/launch.json +16 -0
  3. README.md +14 -0
  4. app.py +186 -0
  5. requirements.txt +10 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.vscode/launch.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+ {
8
+ "name": "Python: Current File",
9
+ "type": "python",
10
+ "request": "launch",
11
+ "program": "${file}",
12
+ "console": "integratedTerminal",
13
+ "justMyCode": true
14
+ }
15
+ ]
16
+ }
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Simple Paper Qa
3
+ emoji: 🏃
4
+ colorFrom: blue
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 3.34.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: hfwittmann/simple-paper-qa
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ import os
5
+
6
# Custom CSS injected into gr.Blocks(css=...) below:
# renders the whole Gradio app in the IBM Plex Mono monospace font.
css_style = """
.gradio-container {
    font-family: "IBM Plex Mono";
}
"""
11
+
12
+
13
def request_pathname(files, data, openai_api_key, index):
    """Register newly uploaded files in the shared dataset and re-validate.

    Gradio `uploaded_files.change` callback.

    Args:
        files: list of uploaded tempfile objects (each with a ``.name`` path),
            or None when the upload widget is cleared.
        data: accumulated rows ``[filepath, citation string, key]`` (gr.State).
        openai_api_key: the API key textbox value.
        index: the current vector-store index (gr.State), possibly None.

    Returns:
        A 5-tuple matching the wired outputs
        ``[stats, data, dataset, buildb, index]``.
    """
    if files is None:
        # BUGFIX: the original returned a single [[]] here, but this callback
        # is wired to 5 outputs; return a value for every output so Gradio
        # does not raise on a cleared upload widget.
        return [[]], data, data, "⚠️Waiting for documents and key⚠️", index

    for file in files:
        # make sure we're not duplicating things in the dataset
        if file.name in [x[0] for x in data]:
            continue
        data.append([file.name, None, None])

    mydataset = pd.DataFrame(data, columns=["filepath", "citation string", "key"])

    # Re-run validation so the status box and the index reflect the new files.
    validation, index = validate_dataset(mydataset, openai_api_key, index)

    return (
        [[len(data), 0]],  # stats: [docs, chunks] (chunk count not tracked)
        data,
        data,
        validation,
        index,
    )
33
+
34
+
35
def validate_dataset(dataset, openapi, index):
    """Report readiness and (re)build the index when docs and key are present.

    Args:
        dataset: pandas DataFrame with a ``filepath`` first column.
        openapi: the OpenAI API key string (may be empty or non-str).
        index: current vector-store index, passed through when not rebuilt.

    Returns:
        (status_message, index) tuple for the ``[buildb, index]`` outputs.
    """
    # BUGFIX: guard against an empty dataframe — the original
    # `dataset.iloc[-1, 0]` raised IndexError before any upload.
    docs_ready = len(dataset) > 0 and dataset.iloc[-1, 0] != ""
    key_ready = type(openapi) is str and len(openapi) > 0

    if docs_ready and key_ready:
        # langchain reads the key from the environment.
        os.environ["OPENAI_API_KEY"] = openapi.strip()
        index = get_index(dataset, openapi, index)
        return "✨Ready✨", index
    elif docs_ready:
        return "⚠️Waiting for key⚠️", index
    elif key_ready:
        return "⚠️Waiting for documents⚠️", index
    else:
        return "⚠️Waiting for documents and key⚠️", index
48
+
49
+
50
def get_index(dataset, openapi, index):
    """Build an in-memory vector-store index over the uploaded documents.

    Args:
        dataset: pandas DataFrame whose ``filepath`` column lists PDF paths.
        openapi: the OpenAI API key string.
        index: the current index; returned unchanged when prerequisites
            are missing.

    Returns:
        A freshly built VectorstoreIndex when docs and key are ready,
        otherwise the ``index`` argument unchanged.
    """
    # Guard against an empty dataframe (iloc[-1] would raise IndexError).
    docs_ready = len(dataset) > 0 and dataset.iloc[-1, 0] != ""

    if docs_ready and type(openapi) is str and len(openapi) > 0:
        # Imported lazily so the app can start before the key is entered.
        from langchain.document_loaders import PyPDFLoader
        from langchain.indexes import VectorstoreIndexCreator
        from langchain.vectorstores import DocArrayInMemorySearch

        # BUGFIX: the original indexed only dataset["filepath"][0] even
        # though the UI accepts multiple uploads — index every document.
        loaders = [PyPDFLoader(file_path=path) for path in dataset["filepath"]]

        index = VectorstoreIndexCreator(
            vectorstore_cls=DocArrayInMemorySearch
        ).from_loaders(loaders)

    return index
70
+
71
+
72
def make_stats(docs):
    """Summarize a doc collection as [[number_of_previews, sum_of_first_fields]]."""
    previews = docs.doc_previews
    total = sum(entry[0] for entry in previews)
    return [[len(previews), total]]
74
+
75
+
76
def do_ask(question, button, openapi, dataset, index):
    """Gradio generator callback: answer `question` against the built index.

    Args:
        question: the user's question text.
        button: the status textbox value; must equal "✨Ready✨" to proceed.
        openapi: the OpenAI API key string.
        dataset: pandas DataFrame of uploaded documents.
        index: the vector-store index built by `get_index`.

    Yields:
        (answer_markdown, index) for the ``[answer, index]`` outputs; the
        answer is "" when prerequisites are not met.
    """
    out = ""
    # BUGFIX: guard against an empty dataframe — the original evaluated
    # `dataset.iloc[-1, 0]` unconditionally and crashed when a question was
    # asked before any upload. (Unused `passages` variable removed.)
    docs_ready = len(dataset) > 0 and dataset.iloc[-1, 0] != ""

    if button == "✨Ready✨" and type(openapi) is str and len(openapi) > 0 and docs_ready:
        out = index.query(question)

    yield out, index
92
+
93
+
94
# ---------------------------------------------------------------------------
# UI definition and event wiring.
# ---------------------------------------------------------------------------
with gr.Blocks(css=css_style) as demo:
    # Shared state across callbacks. (The dead `openai_api_key = gr.State("")`
    # from the original was removed: it was immediately shadowed by the
    # Textbox component defined below and never used.)
    docs = gr.State()
    data = gr.State([])

    gr.Markdown(
        """
# Document Question and Answer

*By D8a.ai*

Based on https://huggingface.co/spaces/whitead/paper-qa

Significant advances in langchain have made it possible to simplify the code.

This tool allows you to ask questions of your uploaded text, PDF documents.

It uses OpenAI's GPT models, so you need to enter your API key below. This
tool is under active development and currently uses a lot of tokens - up to 10,000
for a single query. This is $0.10-0.20 per query, so please be careful!

* [langchain](https://github.com/hwchase17/langchain) is the main library this tool utilizes.
1. Enter API Key ([What is that?](https://platform.openai.com/account/api-keys))
2. Upload your documents
3. Ask a questions
"""
    )

    openai_api_key = gr.Textbox(
        label="OpenAI API Key", placeholder="sk-...", type="password"
    )
    with gr.Tab("File Upload"):
        uploaded_files = gr.File(
            label="Your Documents Upload (PDF or txt)",
            file_count="multiple",
        )

    with gr.Accordion("See Docs:", open=False):
        dataset = gr.Dataframe(
            headers=["filepath", "citation string", "key"],
            datatype=["str", "str", "str"],
            col_count=(3, "fixed"),
            interactive=False,
            label="Documents and Citations",
            overflow_row_behaviour="paginate",
            max_rows=5,
        )

    buildb = gr.Textbox(
        "⚠️Waiting for documents and key...",
        label="Status",
        interactive=False,
        show_label=True,
        max_lines=1,
    )

    index = gr.State()

    stats = gr.Dataframe(
        headers=["Docs", "Chunks"],
        datatype=["number", "number"],
        col_count=(2, "fixed"),
        interactive=False,
        label="Doc Stats",
    )

    # BUGFIX: `validate_dataset` takes (dataset, openapi, index); the original
    # key-change hook passed only two inputs, raising a TypeError at runtime.
    # Now consistent with the `dataset.change` wiring below.
    openai_api_key.change(
        validate_dataset,
        inputs=[dataset, openai_api_key, index],
        outputs=[buildb, index],
    )
    dataset.change(
        validate_dataset,
        inputs=[dataset, openai_api_key, index],
        outputs=[buildb, index],
    )

    uploaded_files.change(
        request_pathname,
        inputs=[uploaded_files, data, openai_api_key, index],
        outputs=[stats, data, dataset, buildb, index],
    )

    query = gr.Textbox(placeholder="Enter your question here...", label="Question")

    ask = gr.Button("Ask Question")
    answer = gr.Markdown(label="Answer")

    ask.click(
        do_ask,
        inputs=[query, buildb, openai_api_key, dataset, index],
        outputs=[answer, index],
    )

demo.queue(concurrency_count=20)
demo.launch(show_error=True)
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ python-dotenv == 1.0.0
2
+ openai==0.27.8
3
+ langchain==0.0.194
4
+ tiktoken==0.4.0
5
+ pandas==2.0.2
6
+ pypdf==3.9.1
7
+ docarray==0.32.1
8
+ gradio == 3.34.0
9
+ jupyter == 1.0.0
10
+ ipykernel == 6.23.1