Spaces:
Runtime error
Runtime error
Commit
·
2b9b8a4
1
Parent(s):
fabd612
Update app.py
Browse files
app.py
CHANGED
@@ -26,6 +26,10 @@ from haystack.pipelines import ExtractiveQAPipeline, Pipeline
|
|
26 |
from markdown import markdown
|
27 |
from sentence_transformers import SentenceTransformer
|
28 |
|
|
|
|
|
|
|
|
|
29 |
index_name = "qa_demo"
|
30 |
|
31 |
|
@@ -36,6 +40,7 @@ pinecone.init(
|
|
36 |
)
|
37 |
index_name = "qa-demo"
|
38 |
|
|
|
39 |
preprocessor = PreProcessor(
|
40 |
clean_empty_lines=True,
|
41 |
clean_whitespace=True,
|
@@ -54,7 +59,7 @@ if index_name not in pinecone.list_indexes():
|
|
54 |
# create the index if it does not exist
|
55 |
pinecone.create_index(
|
56 |
index_name,
|
57 |
-
dimension=
|
58 |
metric="cosine"
|
59 |
)
|
60 |
|
@@ -69,7 +74,10 @@ def create_doc_store():
|
|
69 |
api_key= st.secrets["pinecone_apikey"],
|
70 |
index=index_name,
|
71 |
similarity="cosine",
|
72 |
-
embedding_dim=768
|
|
|
|
|
|
|
73 |
)
|
74 |
return document_store
|
75 |
|
@@ -83,20 +91,68 @@ def create_doc_store():
|
|
83 |
# reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
|
84 |
# pipe = ExtractiveQAPipeline(reader, retriever)
|
85 |
# return pipe
|
|
|
86 |
|
87 |
-
def query
|
88 |
-
res =
|
89 |
-
query
|
|
|
90 |
)
|
91 |
-
|
92 |
-
#
|
93 |
-
|
94 |
-
|
95 |
-
#
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
document_store = create_doc_store()
|
102 |
# pipe = create_pipe(document_store)
|
@@ -107,11 +163,11 @@ embedding_model=retriever_model,
|
|
107 |
model_format="sentence_transformers",
|
108 |
)
|
109 |
# load the retriever model from huggingface model hub
|
110 |
-
sentence_encoder = SentenceTransformer(retriever_model)
|
111 |
-
|
112 |
-
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
|
113 |
-
pipe = ExtractiveQAPipeline(reader, retriever)
|
114 |
|
|
|
|
|
|
|
115 |
|
116 |
indexing_pipeline_with_classification = Pipeline()
|
117 |
indexing_pipeline_with_classification.add_node(
|
@@ -205,7 +261,18 @@ if len(ALL_FILES) > 0:
|
|
205 |
# extract batch
|
206 |
batch = [doc.content for doc in docs[i:i_end]]
|
207 |
# generate embeddings for batch
|
208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
# get metadata
|
210 |
meta = [doc.meta for doc in docs[i:i_end]]
|
211 |
# create unique IDs
|
@@ -215,22 +282,22 @@ if len(ALL_FILES) > 0:
|
|
215 |
# upsert/insert these records to pinecone
|
216 |
_ = index.upsert(vectors=to_upsert)
|
217 |
|
218 |
-
top_k_reader = st.sidebar.slider(
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
)
|
226 |
-
top_k_retriever = st.sidebar.slider(
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
)
|
234 |
# data_files = st.file_uploader(
|
235 |
# "upload", type=["csv"], accept_multiple_files=True, label_visibility="hidden"
|
236 |
# )
|
@@ -267,7 +334,7 @@ if run_pressed:
|
|
267 |
):
|
268 |
try:
|
269 |
st.session_state.results = query(
|
270 |
-
pipe, question, top_k_reader=
|
271 |
)
|
272 |
except JSONDecodeError as je:
|
273 |
st.error("👓 An error occurred reading the results. Is the document store working?")
|
@@ -283,21 +350,29 @@ if st.session_state.results:
|
|
283 |
|
284 |
st.write("## Results:")
|
285 |
|
286 |
-
for
|
287 |
-
answer, context = result.answer, result.context
|
288 |
-
start_idx = context.find(answer)
|
289 |
-
end_idx = start_idx + len(answer)
|
290 |
# Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
|
291 |
try:
|
292 |
-
source = f"[{result.meta['Title']}]({result.meta['link']})"
|
|
|
|
|
|
|
|
|
293 |
st.write(
|
294 |
-
markdown(f
|
295 |
unsafe_allow_html=True,
|
296 |
-
|
297 |
except:
|
298 |
-
filename = result.meta.get('filename', "")
|
|
|
|
|
|
|
|
|
299 |
st.write(
|
300 |
-
markdown(f
|
301 |
unsafe_allow_html=True,
|
302 |
)
|
303 |
|
|
|
26 |
from markdown import markdown
|
27 |
from sentence_transformers import SentenceTransformer
|
28 |
|
29 |
+
import openai
|
30 |
+
|
31 |
+
# get API key from top-right dropdown on OpenAI website
|
32 |
+
openai.api_key = st.secrets["OPENAI_API_KEY"]
|
33 |
index_name = "qa_demo"
|
34 |
|
35 |
|
|
|
40 |
)
|
41 |
index_name = "qa-demo"
|
42 |
|
43 |
+
embed_model = "text-embedding-ada-002"
|
44 |
preprocessor = PreProcessor(
|
45 |
clean_empty_lines=True,
|
46 |
clean_whitespace=True,
|
|
|
59 |
# create the index if it does not exist
|
60 |
pinecone.create_index(
|
61 |
index_name,
|
62 |
+
dimension=1536,
|
63 |
metric="cosine"
|
64 |
)
|
65 |
|
|
|
74 |
api_key= st.secrets["pinecone_apikey"],
|
75 |
index=index_name,
|
76 |
similarity="cosine",
|
77 |
+
embedding_dim=768,
|
78 |
+
metadata_config={
|
79 |
+
'indexed': ['filename']
|
80 |
+
}
|
81 |
)
|
82 |
return document_store
|
83 |
|
|
|
91 |
# reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
|
92 |
# pipe = ExtractiveQAPipeline(reader, retriever)
|
93 |
# return pipe
|
94 |
+
limit = 3750
|
95 |
|
96 |
+
def _build_prompt(question, contexts, max_chars):
    """Assemble the completion prompt from as many leading contexts as fit.

    Contexts are joined with a ``---`` separator. The longest prefix of
    ``contexts`` whose joined length stays strictly below ``max_chars`` is
    used; this may be an empty prefix when there are no contexts or when
    the first one alone is too long.
    """
    separator = "\n\n---\n\n"
    selected = []
    used = 0
    for ctx in contexts:
        # Every context after the first also costs one separator.
        extra = len(ctx) + (len(separator) if selected else 0)
        if used + extra >= max_chars:
            break
        selected.append(ctx)
        used += extra
    return (
        "Answer the question based on the context below.\n\n"
        "Context:\n"
        + separator.join(selected)
        + f"\n\nQuestion: {question}\nAnswer:"
    )


def retrieve(query):
    """Embed ``query``, fetch matching contexts and build the prompt.

    The query is embedded with ``openai.Embedding.create`` using the
    module-level ``embed_model``; the module-level ``index`` is then queried
    for the top 3 matches, and the ``text`` metadata field of each match is
    collected as a context.

    Returns:
        A ``(prompt, contexts)`` tuple. ``contexts`` holds every retrieved
        text, including any that were left out of the prompt because of the
        module-level ``limit`` on joined context length.
    """
    res = openai.Embedding.create(
        input=[query],
        engine=embed_model
    )

    # retrieve from Pinecone
    xq = res['data'][0]['embedding']

    # get relevant contexts
    res = index.query(xq, top_k=3, include_metadata=True)
    contexts = [
        x['metadata']['text'] for x in res['matches']
    ]

    # Always produce a prompt: the previous loop left `prompt` unassigned
    # when fewer than two contexts were returned, and never checked the
    # full set of contexts against the limit.
    prompt = _build_prompt(query, contexts, limit)
    return prompt, contexts
|
135 |
+
|
136 |
+
|
137 |
+
# first let's make it simpler to get answers
|
138 |
+
def complete(prompt):
    """Send ``prompt`` to the completion endpoint and return the answer text.

    The request goes through ``openai.Completion.create`` with the
    ``text-davinci-003`` engine, temperature 0 and at most 400 tokens.

    Returns:
        The ``text`` of the first returned choice, with surrounding
        whitespace stripped.
    """
    # query text-davinci-003
    request = dict(
        engine='text-davinci-003',
        prompt=prompt,
        temperature=0,
        max_tokens=400,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    response = openai.Completion.create(**request)
    first_choice = response['choices'][0]
    return first_choice['text'].strip()
|
151 |
+
|
152 |
+
def query(pipe, question, top_k_reader, top_k_retriever):
    """Answer ``question`` using retrieved contexts.

    ``pipe``, ``top_k_reader`` and ``top_k_retriever`` are accepted but not
    used in this body.

    Returns:
        An ``(answer, contexts)`` tuple: the text returned by ``complete``
        and the contexts returned by ``retrieve``.
    """
    # first we retrieve relevant items from Pinecone
    prompt, contexts = retrieve(question)
    answer = complete(prompt)
    return answer, contexts
|
156 |
|
157 |
document_store = create_doc_store()
|
158 |
# pipe = create_pipe(document_store)
|
|
|
163 |
model_format="sentence_transformers",
|
164 |
)
|
165 |
# load the retriever model from huggingface model hub
|
166 |
+
# sentence_encoder = SentenceTransformer(retriever_model)
|
|
|
|
|
|
|
167 |
|
168 |
+
# reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
|
169 |
+
# pipe = ExtractiveQAPipeline(reader, retriever)
|
170 |
+
# now query text-davinci-003 WITHOUT context
|
171 |
|
172 |
indexing_pipeline_with_classification = Pipeline()
|
173 |
indexing_pipeline_with_classification.add_node(
|
|
|
261 |
# extract batch
|
262 |
batch = [doc.content for doc in docs[i:i_end]]
|
263 |
# generate embeddings for batch
|
264 |
+
# Embed the current batch, retrying every 5 seconds until the call succeeds.
# The texts to embed live in `batch` (built just above); the previous code
# passed an undefined `texts`, and its bare `except:` swallowed the
# resulting NameError, so the retry loop could never finish.
while True:
    try:
        res = openai.Embedding.create(input=batch, engine=embed_model)
        break
    except Exception:
        # Likely a transient API failure (e.g. rate limiting): back off and
        # retry. `Exception` rather than a bare except, so that
        # KeyboardInterrupt/SystemExit still stop the app.
        sleep(5)
embeds = [record['embedding'] for record in res['data']]
|
276 |
# get metadata
|
277 |
meta = [doc.meta for doc in docs[i:i_end]]
|
278 |
# create unique IDs
|
|
|
282 |
# upsert/insert these records to pinecone
|
283 |
_ = index.upsert(vectors=to_upsert)
|
284 |
|
285 |
+
# top_k_reader = st.sidebar.slider(
|
286 |
+
# "Max. number of answers",
|
287 |
+
# min_value=1,
|
288 |
+
# max_value=10,
|
289 |
+
# value=DEFAULT_NUMBER_OF_ANSWERS,
|
290 |
+
# step=1,
|
291 |
+
# on_change=reset_results,
|
292 |
+
# )
|
293 |
+
# top_k_retriever = st.sidebar.slider(
|
294 |
+
# "Max. number of documents from retriever",
|
295 |
+
# min_value=1,
|
296 |
+
# max_value=10,
|
297 |
+
# value=DEFAULT_DOCS_FROM_RETRIEVER,
|
298 |
+
# step=1,
|
299 |
+
# on_change=reset_results,
|
300 |
+
# )
|
301 |
# data_files = st.file_uploader(
|
302 |
# "upload", type=["csv"], accept_multiple_files=True, label_visibility="hidden"
|
303 |
# )
|
|
|
334 |
):
|
335 |
try:
|
336 |
st.session_state.results = query(
|
337 |
+
pipe, question, top_k_reader=None, top_k_retriever=None
|
338 |
)
|
339 |
except JSONDecodeError as je:
|
340 |
st.error("👓 An error occurred reading the results. Is the document store working?")
|
|
|
350 |
|
351 |
st.write("## Results:")
|
352 |
|
353 |
+
# query() stores a single (answer, contexts) pair in session state, not a
# list of results, so unpack it directly. Iterating over the pair tried to
# unpack the answer string itself into two names and raised ValueError.
result, contexts = st.session_state.results
# Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
try:
    st.write(
        markdown(f"Answer: {result} \n Extracted from context {contexts}"),
        unsafe_allow_html=True,
    )
except Exception:
    # Fall back to showing the answer alone if rendering it together with
    # the contexts fails.
    st.write(
        markdown(f"Answer: {result}"),
        unsafe_allow_html=True,
    )
|
378 |
|