Spaces:
Sleeping
Sleeping
Delete utils/semantic_search.py
Browse files- utils/semantic_search.py +0 -582
utils/semantic_search.py
DELETED
@@ -1,582 +0,0 @@
|
|
1 |
-
from haystack.nodes import TransformersQueryClassifier, Docs2Answers
|
2 |
-
from haystack.nodes import EmbeddingRetriever, FARMReader
|
3 |
-
from haystack.nodes.base import BaseComponent
|
4 |
-
from haystack.document_stores import InMemoryDocumentStore
|
5 |
-
from markdown import markdown
|
6 |
-
from annotated_text import annotation
|
7 |
-
from haystack.schema import Document
|
8 |
-
from typing import List, Text, Union
|
9 |
-
from typing_extensions import Literal
|
10 |
-
from utils.preprocessing import processingpipeline
|
11 |
-
from utils.streamlitcheck import check_streamlit
|
12 |
-
from haystack.pipelines import Pipeline
|
13 |
-
import pandas as pd
|
14 |
-
import logging
|
15 |
-
try:
|
16 |
-
from termcolor import colored
|
17 |
-
except:
|
18 |
-
pass
|
19 |
-
try:
|
20 |
-
import streamlit as st
|
21 |
-
except ImportError:
|
22 |
-
logging.info("Streamlit not installed")
|
23 |
-
|
24 |
-
|
25 |
-
@st.cache(allow_output_mutation=True)
def loadQueryClassifier():
    """
    Build the Haystack query classifier used by QueryCheck.

    The checkpoint loaded is
    shahrukhx01/bert-mini-finetune-question-detection. The function is
    decorated with st.cache, so Streamlit caches the returned object.

    Return
    ------
    TransformersQueryClassifier initialised with the model named above.
    """
    model_id = "shahrukhx01/bert-mini-finetune-question-detection"
    return TransformersQueryClassifier(model_name_or_path=model_id)
|
35 |
-
|
36 |
-
class QueryCheck(BaseComponent):
    """
    Custom Haystack node that classifies a query and, when the classifier does
    not route it to "output_1", rewrites it into a question.

    The classifier is not very good at recognising statements, so statements
    may get rewritten too. Example: "List water related issues" is identified
    by the model as keywords and is therefore turned into
    "what are the List water related issues related issues and discussions?".
    This shortcoming is accepted for now, as it does not affect the semantic
    search much.

    For several queries at once use run_batch. Example: to find relevant
    passages for water, food security and poverty use
    querylist = ["water", "food security", "poverty"] and execute
    QueryCheck.run_batch(queries = querylist)

    1. https://docs.haystack.deepset.ai/docs/query_classifier
    """

    outgoing_edges = 1

    @staticmethod
    def _is_question(classifier, query: str) -> bool:
        """True when the classifier routes the query to its "output_1" edge."""
        return classifier.run(query=query)[1] == "output_1"

    @staticmethod
    def _keyword_to_question(query: str) -> str:
        """Wrap a keyword/statement query into the question template."""
        # A single literal is used on purpose: a backslash continuation inside
        # the string would embed the next line's indentation in the query.
        return "what are the {} related issues and discussions?".format(query)

    def run(self, query:str):
        """
        Mandatory method for a custom node. Determines the query type and, if
        the query is a keyword/statement, rewrites it to make it more useful
        for sentence transformers.

        Params
        --------
        query: query/statement/keywords in form of string

        Return
        ------
        output: dictionary with keys 'query' (the possibly rewritten query)
            and 'query_type' ('question/statement' or 'statements/keyword').

        output_1: as there is only one outgoing edge, the string 'output_1'.
        """
        query_classifier = loadQueryClassifier()

        if self._is_question(query_classifier, query):
            output = {"query": query,
                      "query_type": 'question/statement'}
        else:
            output = {"query": self._keyword_to_question(query),
                      "query_type": 'statements/keyword'}
        logging.info(output)
        return output, "output_1"

    def run_batch(self, queries:List[str]):
        """
        Process multiple queries in one go; the queries must be passed as a
        list of strings. Example: to find relevant passages for water, food
        security and poverty use querylist = ["water", "food security",
        "poverty"] and execute QueryCheck.run_batch(queries = querylist)

        Params
        --------
        queries: queries/statements/keywords as strings within a list

        Return
        ------
        output: dictionary with key 'queries' holding the list of (possibly
            rewritten) queries, in input order.

        output_1: as there is only one outgoing edge, the string 'output_1'.
        """
        query_classifier = loadQueryClassifier()
        query_list = [
            query if self._is_question(query_classifier, query)
            else self._keyword_to_question(query)
            for query in queries
        ]
        output = {'queries': query_list}
        logging.info(output)
        return output, "output_1"
|
117 |
-
|
118 |
-
|
119 |
-
@st.cache(allow_output_mutation=True)
def runSemanticPreprocessingPipeline(file_path:str, file_name:str,
                    split_by: Literal["sentence", "word"] = 'sentence',
                    split_length:int = 2, split_overlap:int = 0,
                    split_respect_sentence_boundary:bool = False,
                    remove_punc:bool = False)->List[Document]:
    """
    Create the preprocessing pipeline and run it on a single file.

    Params
    ------------
    file_name: filename, in case of streamlit application use
        st.session_state['filename']
    file_path: filepath, in case of streamlit application use
        st.session_state['filepath']
    split_by: document splitting strategy, either 'word' or 'sentence'
    split_length: length of the paragraphs that are synthetically created
        from the document.
    split_overlap: number of words or sentences that overlap between
        consecutive paragraphs, so that a sentence or 'some words' can be
        read together with their neighbours.
    split_respect_sentence_boundary: used with the 'word' splitting strategy.
    remove_punc: whether to remove all punctuation including ',' and '.'

    Return
    --------------
    The value returned by the pipeline's run() call, unchanged. For the
    Haystack implementation of semantic search the list of Haystack Documents
    is needed, which is fetched with key = 'documents' on this output.
    NOTE(review): the List[Document] annotation does not match this; the
    whole pipeline output is returned, not only the documents.
    """
    converter_params = {"file_path": file_path,
                        "file_name": file_name}
    preprocessor_params = {
        "remove_punc": remove_punc,
        "split_by": split_by,
        "split_length": split_length,
        "split_overlap": split_overlap,
        "split_respect_sentence_boundary": split_respect_sentence_boundary,
    }

    pipeline = processingpipeline()
    return pipeline.run(
        file_paths=file_path,
        params={"FileConverter": converter_params,
                "UdfPreProcessor": preprocessor_params},
    )
|
166 |
-
|
167 |
-
|
168 |
-
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
          allow_output_mutation=True)
def loadRetriever(embedding_model:Text=None, embedding_model_format:Text = None,
                  embedding_layer:int = None, retriever_top_k:int = 10,
                  max_seq_len:int=512, document_store:InMemoryDocumentStore=None):
    """
    Returns the Retriever model based on params provided.
    1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
    2. https://www.sbert.net/examples/applications/semantic-search/README.html
    3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py

    Params
    ---------
    embedding_model: name of the model to be used for embedding. Check the
        links above.
    embedding_model_format: passed to EmbeddingRetriever as model_format;
        check the Haystack github link above.
    embedding_layer: passed to EmbeddingRetriever as emb_extraction_layer;
        check the Haystack github link above.
    retriever_top_k: number of top results to be returned by the retriever.
    max_seq_len: every model has a max sequence length it can handle, check
        the model card. Needed to handle the edge cases.
    document_store: InMemoryDocumentStore; write the Haystack Document list
        to a DocumentStore and pass it here. Can be done using
        createDocumentStore from utils.

    Return
    -------
    retriever: the EmbeddingRetriever, or None (after logging a warning) when
        no document_store was given.
    """
    logging.info("loading retriever")
    if document_store is None:
        logging.warning("Retriever initialization requires the DocumentStore")
        return

    retriever = EmbeddingRetriever(
        embedding_model=embedding_model, top_k=retriever_top_k,
        document_store=document_store,
        emb_extraction_layer=embedding_layer, scale_score=True,
        model_format=embedding_model_format, use_gpu=True,
        max_seq_len=max_seq_len)
    # check_streamlit is a function and must be called; the bare name is
    # always truthy and made this branch unconditional.
    if check_streamlit():
        st.session_state['retriever'] = retriever
    return retriever
|
212 |
-
|
213 |
-
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
          allow_output_mutation=True)
def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
                        embedding_dim:int = 768):
    """
    Build an InMemoryDocumentStore and write the given Haystack Documents to
    it. A document store is a mandatory component for the Retriever in the
    Haystack framework.

    Params
    -------
    documents: list of Haystack Documents. If using the preprocessing
        pipeline, fetch them with key = 'documents' on the pipeline output.
    similarity: scoring function, either 'cosine' or 'dot_product'.
    embedding_dim: the document store defaults to an embedding size of 768
        and update_embeddings cannot infer the retriever's embedding size
        automatically, so set this value as per the model card.

    Return
    -------
    document_store: the populated InMemoryDocumentStore.
    """
    store = InMemoryDocumentStore(similarity=similarity,
                                  embedding_dim=embedding_dim)
    store.write_documents(documents)
    return store
|
240 |
-
|
241 |
-
|
242 |
-
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
          allow_output_mutation=True)
def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
                embedding_model_format:Text = None,embedding_layer:int = None,
                embedding_dim:int = 768,retriever_top_k:int = 10,
                reader_model:str = None, reader_top_k:int = 10,
                max_seq_len:int =512,useQueryCheck = True,
                top_k_per_candidate:int = 1):
    """
    Create the semantic search pipeline and the document store from a list of
    Haystack Documents.

    The top_k of Reader and Retriever are meant to be kept the same so that
    every retrieved result is used, while the Reader extracts the context for
    each retrieved result. QueryCheck is added as a node to process the
    query. The pipeline is suited for keyword search and, to some extent,
    extractive QA; the Reader is there to highlight context, not strictly for
    QA.

    Four pipeline variants can be returned:
    1. QueryCheck > Retriever > Reader
    2. Retriever > Reader
    3. QueryCheck > Retriever > Docs2Answers : if no reader model is given,
       Docs2Answers is used to keep the pipeline output structurally the same.
    4. Retriever > Docs2Answers

    Links

    1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
    2. https://www.sbert.net/examples/applications/semantic-search/README.html
    3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
    4. https://docs.haystack.deepset.ai/docs/reader

    Params
    ----------
    documents: list of Haystack Documents, returned by preprocessing pipeline.
    embedding_model: name of the model to be used for embedding.
    embedding_model_format: check the Haystack github link above.
    embedding_layer: check the Haystack github link above.
    embedding_dim: the document store defaults to an embedding size of 768
        and update_embeddings cannot infer the retriever's embedding size
        automatically, so set this value as per the model card.
    retriever_top_k: number of top results to be returned by the retriever.
    reader_model: name of the model for the Reader node. When falsy,
        Docs2Answers is used as the final node instead.
    reader_top_k: top_k passed to the Reader; intended to equal
        retriever_top_k as the Reader is used to extract context.
    max_seq_len: max sequence length the model can handle, see model card.
    useQueryCheck: whether to add the QueryCheck node, which may modify the
        query.
    top_k_per_candidate: how many answers to extract for each candidate doc
        coming from the retriever.

    Return
    ---------
    semanticsearch_pipeline: Haystack Pipeline with the nodes
        [QueryCheck (optional), EmbeddingRetriever, FARMReader/Docs2Answers].

    document_store: the InMemoryDocumentStore built from the documents, with
        embeddings updated by the retriever.
    """
    document_store = createDocumentStore(documents=documents,
                                         embedding_dim=embedding_dim)
    retriever = loadRetriever(embedding_model=embedding_model,
                              embedding_model_format=embedding_model_format,
                              embedding_layer=embedding_layer,
                              retriever_top_k=retriever_top_k,
                              document_store=document_store,
                              max_seq_len=max_seq_len)
    document_store.update_embeddings(retriever)

    search_pipeline = Pipeline()

    # Decision 1: does the query pass through QueryCheck before retrieval?
    retriever_inputs = ["Query"]
    if useQueryCheck:
        search_pipeline.add_node(component=QueryCheck(),
                                 name="QueryCheck", inputs=["Query"])
        retriever_inputs = ["QueryCheck.output_1"]
    search_pipeline.add_node(component=retriever,
                             name="EmbeddingRetriever",
                             inputs=retriever_inputs)

    # Decision 2: which node closes the pipeline?
    if reader_model:
        final_name = "FARMReader"
        final_node = FARMReader(model_name_or_path=reader_model,
                                top_k=reader_top_k, use_gpu=True,
                                top_k_per_candidate=top_k_per_candidate)
    else:
        final_name = "Docs2Answers"
        final_node = Docs2Answers()
    search_pipeline.add_node(component=final_node, name=final_name,
                             inputs=["EmbeddingRetriever"])

    logging.info(search_pipeline.components)
    return search_pipeline, document_store
|
358 |
-
|
359 |
-
def runSemanticPipeline(pipeline:Pipeline, queries:Union[list,str])->dict:
    """
    Run the pipeline with run() for a single query string or run_batch() for
    a list of queries.

    Params
    -------
    pipeline: Haystack pipeline, as returned by semanticSearchPipeline.

    queries: either a single query (str) or a list of queries. Subclasses of
        list and str are accepted as well.

    Return
    -------
    results: whatever pipeline.run / pipeline.run_batch returns, or None
        (after logging a warning) when queries is neither a list nor a str.
    """
    if isinstance(queries, list):
        return pipeline.run_batch(queries=queries)
    if isinstance(queries, str):
        return pipeline.run(query=queries)

    logging.warning("Please check the input type for the queries")
    return None
|
387 |
-
|
388 |
-
def process_query_output(results:dict)->pd.DataFrame:
    """
    Build a dataframe with one row per retrieved document and the columns
    ['query','answer','answer_offset','context_offset','context','content',
    'reader_score','retriever_score','id'].

    Designed for the output of the semantic search pipeline for a single
    query. Multiple queries, or a pipeline ending in Docs2Answers, are handled
    by process_semantic_output, which calls this function per query.

    Params
    ---------
    results: dictionary with keys 'query' and 'documents', and optionally
        'answers'. Documents are read through .id, .score and .content;
        answers through .document_id and .to_dict(). When 'answers' is
        missing or empty, the answer-related columns are None for every row.

    Return
    --------
    df: dataframe with the columns listed above (object dtype), in the order
        of results['documents'].
    """
    columns = ['query', 'answer', 'answer_offset', 'context_offset',
               'context', 'content', 'reader_score', 'retriever_score', 'id']
    query_text = results['query']

    # Keyed by document id; if several answers point at the same document the
    # last one wins.
    answers_by_doc = {answer.document_id: answer.to_dict()
                      for answer in (results.get('answers') or [])}

    rows = []
    for doc in results['documents']:
        row = {'query': query_text,
               'retriever_score': doc.score,
               'id': doc.id,
               'content': doc.content,
               'answer': None,
               'context': None,
               'reader_score': None,
               'answer_offset': None,
               'context_offset': None}
        answer = answers_by_doc.get(doc.id)
        if answer is not None:
            context = answer['context']
            span = answer['offsets_in_document'][0]
            # str.find gives -1 when the context is not a verbatim substring
            # of the content; the offsets are then not meaningful.
            start_idx = doc.content.find(context)
            row['answer'] = answer['answer']
            row['context'] = context
            row['reader_score'] = answer['score']
            row['answer_offset'] = [span['start'], span['end']]
            row['context_offset'] = [start_idx, start_idx + len(context)]
        rows.append(row)

    # Built once instead of concatenating row by row; object dtype keeps None
    # values as None.
    return pd.DataFrame(rows, columns=columns, dtype=object)
|
449 |
-
|
450 |
-
def process_semantic_output(results):
    """
    Build the dataframe with the columns
    ['query','answer','answer_offset','context_offset','context','content',
    'reader_score','retriever_score','id'] from the raw pipeline output.

    Whether the output belongs to a single query or to multiple queries is
    decided by its keys ('query' vs 'queries'). process_query_output is used
    to build the dataframe per query and the per-query frames are combined
    into one. When the final node is Docs2Answers the answers are not passed
    on. See the documentation of process_query_output.

    Params
    ---------
    results: raw output of runSemanticPipeline.

    Return
    --------
    df: dataframe with the columns listed above. An empty dataframe with
        those columns is returned when there are no queries, or when results
        has neither a 'query' nor a 'queries' key.
    """
    columns = ['query', 'answer', 'answer_offset', 'context_offset',
               'context', 'content', 'reader_score', 'retriever_score', 'id']

    if 'query' in results:
        output = {'query': results['query'],
                  'documents': results['documents']}
        if results['node_id'] != 'Docs2Answers':
            output['answers'] = results['answers']
        return process_query_output(output)

    if 'queries' in results:
        keep_answers = results['node_id'] != 'Docs2Answers'
        frames = []
        for query, answers, documents in zip(results['queries'],
                                             results['answers'],
                                             results['documents']):
            output = {'query': query, 'documents': documents}
            if keep_answers:
                output['answers'] = answers
            frames.append(process_query_output(output))
        # Concatenate once; pd.concat raises on an empty list, hence the guard.
        if frames:
            return pd.concat(frames, ignore_index=True)
        return pd.DataFrame(columns=columns)

    logging.warning("Pipeline output has neither 'query' nor 'queries'")
    return pd.DataFrame(columns=columns)
|
498 |
-
|
499 |
-
def semanticsearchAnnotator(matches:List[List[int]], document:Text):
    """
    Highlight the spans of the document given as a list of
    [start index, end index] and display the result.

    Indices are character offsets into the document string. Example: for
    "How are you today", matches = [[0,3]] highlights "How". (With a spacy
    matcher the indices would be token based, so [[0,3]] would mean
    "How are you" and [[0,1]] would mean "How"; that is not what is used here.)

    Under Streamlit the spans are rendered with annotated_text and written
    with st.write; otherwise they are coloured with termcolor and printed.

    Params
    ---------
    matches: list of [start index, end index] pairs. The text between spans
        is taken from the end of the previous span, so spans are expected in
        document order. An empty list displays the document unchanged.
    document: text to annotate.

    Return
    --------
    None; the annotated text is displayed.
    """
    in_streamlit = check_streamlit()

    pieces = []
    cursor = 0  # end of the last highlighted span
    for match in matches:
        start_idx = match[0]
        end_idx = match[1]
        pieces.append(document[cursor:start_idx])
        span = document[start_idx:end_idx]
        if in_streamlit:
            pieces.append(str(annotation(body=span, label="Context",
                                         background="#964448",
                                         color='#ffffff')))
        else:
            pieces.append(colored(span, "green", attrs=['bold']))
        cursor = end_idx

    # cursor (not end_idx) so that an empty matches list does not hit an
    # unbound variable.
    pieces.append(document[cursor:])
    annotated_text = "".join(pieces)

    if in_streamlit:
        st.write(
            markdown(annotated_text),
            unsafe_allow_html=True,
        )
    else:
        print(annotated_text)
|
533 |
-
|
534 |
-
|
535 |
-
def semantic_keywordsearch(query:Text,documents:List[Document],
                           embedding_model:Text,
                           embedding_model_format:Text,
                           embedding_layer:int, reader_model:str,
                           retriever_top_k:int = 10, reader_top_k:int = 10,
                           return_results:bool = False, embedding_dim:int = 768,
                           max_seq_len:int = 512,top_k_per_candidate:int =1,
                           sort_by:Literal["retriever", "reader"] = 'retriever'):
    """
    Perform the semantic search on the list of Haystack documents returned by
    the preprocessing pipeline.

    Params
    -------
    query: keywords that need to be searched in documents.
    documents: list of Haystack documents returned by preprocessing pipeline.
    embedding_model, embedding_model_format, embedding_layer, reader_model,
    retriever_top_k, reader_top_k, embedding_dim, max_seq_len,
    top_k_per_candidate: forwarded to semanticSearchPipeline.
    return_results: when True the results dataframe is returned instead of
        being displayed.
    sort_by: 'retriever' sorts by retriever_score; any other value sorts by
        reader_score. Sorting is descending.

    Return
    -------
    The sorted results dataframe when return_results is True; otherwise None,
    after displaying the results (Streamlit or stdout).
    """
    semanticsearch_pipeline, doc_store = semanticSearchPipeline(
        documents=documents,
        embedding_model=embedding_model,
        embedding_layer=embedding_layer,
        embedding_model_format=embedding_model_format,
        reader_model=reader_model, retriever_top_k=retriever_top_k,
        reader_top_k=reader_top_k, embedding_dim=embedding_dim,
        max_seq_len=max_seq_len,
        top_k_per_candidate=top_k_per_candidate)

    raw_output = runSemanticPipeline(semanticsearch_pipeline, query)
    results_df = process_semantic_output(raw_output)
    sort_column = 'retriever_score' if sort_by == 'retriever' else 'reader_score'
    results_df = results_df.sort_values(by=[sort_column], ascending=False)

    if return_results:
        return results_df

    # check_streamlit is a function; testing the bare name is always truthy.
    in_streamlit = check_streamlit()
    if in_streamlit:
        st.markdown("##### Top few semantic search results #####")
    else:
        print("Top few semantic search results")

    # Iterate in sorted (positional) order. Looking rows up with .loc[i] would
    # go by index label and so ignore the sort above.
    for rank, (_, row) in enumerate(results_df.iterrows(), start=1):
        heading = "Result {}".format(rank)
        if in_streamlit:
            st.write(heading)
        else:
            print(heading)

        offset = row['context_offset']
        if isinstance(offset, (list, tuple)):
            semanticsearchAnnotator([offset], row['content'])
        # Rows without an extracted context have no offsets to highlight;
        # show the content as plain text.
        elif in_streamlit:
            st.write(row['content'])
        else:
            print(row['content'])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|