leavoigt committed on
Commit
9acfc38
·
1 Parent(s): 6234e7f

Delete utils/lexical_search.py

Browse files
Files changed (1) hide show
  1. utils/lexical_search.py +0 -251
utils/lexical_search.py DELETED
@@ -1,251 +0,0 @@
1
- from haystack.nodes import TfidfRetriever
2
- from haystack.document_stores import InMemoryDocumentStore
3
- import spacy
4
- import re
5
- from spacy.matcher import Matcher
6
- from markdown import markdown
7
- from annotated_text import annotation
8
- from haystack.schema import Document
9
- from typing import List, Text, Tuple
10
- from typing_extensions import Literal
11
- from utils.preprocessing import processingpipeline
12
- from utils.streamlitcheck import check_streamlit
13
- import logging
14
- try:
15
- from termcolor import colored
16
- except:
17
- pass
18
-
19
- try:
20
- import streamlit as st
21
- except ImportError:
22
- logging.info("Streamlit not installed")
23
-
24
-
25
def runLexicalPreprocessingPipeline(file_name:str, file_path:str,
                    split_by: Literal["sentence", "word"] = 'word',
                    split_length:int = 80, split_overlap:int = 0,
                    remove_punc:bool = False,)->dict:
    """
    Builds the preprocessing pipeline and runs it on a single file, returning
    the pipeline output unchanged.

    The defaults (split_by = 'word', split_overlap = 0) reflect that lexical
    search is not affected by paragraph overlap.

    Params
    ------------

    file_name: filename, in case of streamlit application use
                st.session_state['filename']
    file_path: filepath, in case of streamlit application use
                st.session_state['filepath']
    split_by: document splitting strategy, either 'word' or 'sentence'.
    split_length: length of each synthetically created paragraph, counted in
                the unit chosen by split_by.
    split_overlap: number of words or sentences shared between consecutive
                paragraphs, so that text which only makes sense together
                with its neighbours is not cut apart.
    remove_punc: whether to remove all punctuation, including ',' and '.'.

    Return
    --------------
    The output of the pipeline run. For the lexical search with
    TfidfRetriever the caller needs the List of Haystack Document, which is
    expected under key = 'documents' of this output.
    NOTE(review): the 'documents' key comes from the pipeline defined in
    utils.preprocessing - confirm against that module.

    """

    lexical_processing_pipeline = processingpipeline()

    # Node names must match the ones registered in the pipeline.
    node_params = {
        "FileConverter": {"file_path": file_path,
                          "file_name": file_name},
        "UdfPreProcessor": {"remove_punc": remove_punc,
                            "split_by": split_by,
                            "split_length": split_length,
                            "split_overlap": split_overlap},
    }

    output_lexical_pre = lexical_processing_pipeline.run(
                            file_paths = file_path, params = node_params)

    return output_lexical_pre
72
-
73
-
74
def tokenize_lexical_query(query:str)-> List[str]:
    """
    Extracts the important keywords from a query by dropping stop words and
    punctuation. The TFIDF retriever works on the raw query, but highlighting
    the hits needs the query in tokenized form.

    Params
    --------
    query: string which is either a list of keywords the user is looking for
            or a query phrased as a question.

    Return
    -----------
    list of the lower-cased keywords found in the query.

    """
    pipeline = spacy.load("en_core_web_sm")

    keywords = []
    for tok in pipeline(query):
        # stop words and punctuation carry no search value
        if tok.is_stop or tok.is_punct:
            continue
        keywords.append(tok.text.lower())

    return keywords
95
-
96
def runSpacyMatcher(token_list:List[str], document:Text
                    )->Tuple[List[List[int]],spacy.tokens.doc.Doc]:
    """
    Finds the keywords in the document using the Matcher class from spacy.
    Each keyword is matched against the lower-cased form of a single token,
    so the search is case-insensitive as long as the keywords are lower-case.

    Params
    -------
    token_list: the token list which tokenize_lexical_query function returns
    document: text in which we need to find the tokens

    Return
    --------
    matches: List of [start_index, end_index] in the spacydoc (at word level,
            not character level) for the keywords in token list.

    spacydoc: the parsed document. The indices in matches are token offsets
            into this object, so the annotator needs it and not the raw text.

    """
    nlp = spacy.load("en_core_web_sm")
    spacydoc = nlp(document)

    # Nothing to look for: skip registering an empty pattern set under an
    # empty key and report no matches.
    if not token_list:
        return [], spacydoc

    matcher = Matcher(nlp.vocab)
    token_pattern = [[{"LOWER": token}] for token in token_list]
    matcher.add(",".join(token_list), token_pattern)

    # keep only the token offsets, the match id is not needed by the annotator
    matches = [[start, end] for _match_id, start, end in matcher(spacydoc)]

    return matches, spacydoc
130
-
131
def runRegexMatcher(token_list:List[str], document:Text
                    )->Tuple[List[List[int]], Text]:
    """
    Finds the keywords in the document using regex. Every keyword is searched
    as literal text (case-sensitive), not as a regular expression.

    Params
    -------
    token_list: the token list which tokenize_lexical_query function returns

    document: text in which we need to find the tokens

    Return
    --------
    matches: List of [start_index, end_index] in the document for the keywords
            in token list at character level, grouped by keyword in the order
            of token_list.

    document: the keyword indices returned by regex are at character level,
            therefore the text is returned back for the annotator.

    """
    matches = []
    for token in token_list:
        # An empty keyword would match at every position of the document.
        if not token:
            continue
        # Escape so that keywords such as "c++" or "a.b" are taken literally
        # instead of being parsed as a pattern.
        pattern = re.escape(token)
        matches.extend([val.start(), val.end()]
                       for val in re.finditer(pattern, document))

    return matches, document
157
-
158
def spacyAnnotator(matches: List[List[int]], document:spacy.tokens.doc.Doc):
    """
    This is spacy Annotator and needs spacy.doc
    Highlights the parts of the document given by a list of
    [start index, end index] and displays the result.
    The indices are token offsets, not character offsets. Example: for
    "How are you today" the keyword "How" is matches = [[0,1]], while
    matches = [[0,3]] would highlight "How are you".

    Params
    -----------
    matches: list of [start, end] token offsets. Example [[0,1],[10,13]].
            Expected in document order; an empty list displays the document
            without any highlight.
    document: the spacy doc the offsets refer to.


    Return
    --------
    None. The annotated text is sent either to the app front end using
    streamlit or written directly to the output screen.

    """
    # decide the output channel once, it is used for every match
    use_streamlit = check_streamlit()

    pieces = []
    start = 0
    for match in matches:
        start_idx = match[0]
        end_idx = match[1]

        keyword_span = document[start_idx:end_idx]
        keyword = keyword_span.text

        if use_streamlit:
            highlighted = str(annotation(body=keyword,
                        label="ANSWER", background="#964448", color='#ffffff'))
        else:
            highlighted = colored(keyword, "green", attrs = ['bold'])

        # Span.text drops the whitespace after the last token of the span, so
        # the whitespace is re-attached explicitly; otherwise the highlighted
        # keyword is glued to the words around it.
        pieces.append(document[start:start_idx].text_with_ws)
        pieces.append(highlighted)
        pieces.append(keyword_span.text_with_ws[len(keyword):])

        start = end_idx

    # 'start' is the end of the last match, or 0 when there was no match, so
    # the remainder is well defined even for an empty matches list.
    pieces.append(document[start:].text)
    annotated_text = "".join(pieces)

    if use_streamlit:
        st.write(
            markdown(annotated_text),
            unsafe_allow_html=True,
        )
    else:
        print(annotated_text)
208
-
209
def lexical_search(query:Text, documents:List[Document],top_k:int):
    """
    Runs a TFIDF search for the query over the given haystack documents and
    displays every retrieved paragraph that contains at least one of the query
    keywords, with the keywords highlighted.

    Params
    -------
    query: Keywords that need to be searched in documents.
    documents: List of Haystack documents returned by preprocessing pipeline.
    top_k: Number of Top results to be fetched.

    Return
    --------
    None. Output goes to streamlit when available, otherwise to stdout.

    """

    # Haystack Retriever works with document stores only.
    store = InMemoryDocumentStore()
    store.write_documents(documents)
    tfidf = TfidfRetriever(store)

    hits = tfidf.retrieve(query=query, top_k = top_k)
    keywords = tokenize_lexical_query(query)

    # stays True until the first hit with a highlighted keyword is shown
    header_pending = True
    for position, hit in enumerate(hits, start=1):
        spans, parsed = runSpacyMatcher(keywords, hit.content)

        # retrieved paragraph without any literal keyword: nothing to show
        if len(spans) == 0:
            continue

        if header_pending:
            header_pending = False
            if check_streamlit():
                st.markdown("##### Top few lexical search (TFIDF) hits #####")
            else:
                print("Top few lexical search (TFIDF) hits")

        if check_streamlit():
            st.write("Result {}".format(position))
        else:
            print("Results {}".format(position))
        spacyAnnotator(spans, parsed)

    if header_pending:
        if check_streamlit():
            st.info("🤔 No relevant result found. Please try another keyword.")
        else:
            print("No relevant result found. Please try another keyword.")