HarryLee committed on
Commit
f4b5cf8
·
1 Parent(s): 915ac92

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -1
app.py CHANGED
@@ -71,6 +71,16 @@ from tqdm.autonotebook import tqdm
71
  import numpy as np
72
  import re
73
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  # We lower case our text and remove stop-words from indexing
76
  def bm25_tokenizer(text):
@@ -88,6 +98,9 @@ for passage in tqdm(passages):
88
 
89
  bm25 = BM25Okapi(tokenized_corpus)
90
 
 
 
 
91
  # This function will search all wikipedia articles for passages that
92
  # answer the query
93
  def search(query):
@@ -163,7 +176,24 @@ def search(query):
163
  #st.write(rs_final.strip())
164
  res.append(rs_final.strip())
165
 
166
- st.write(res[0:maxtags_sidebar])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
  st.write("## Results:")
169
  if st.button('Generated Expansion'):
 
71
  import numpy as np
72
  import re
73
 
74
+ import yake
75
+
76
+ language = "en"
77
+ max_ngram_size = 3
78
+ deduplication_threshold = 0.9
79
+ deduplication_algo = 'seqm'
80
+ windowSize = 3
81
+ numOfKeywords = 3
82
+
83
+ custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)
84
 
85
  # We lower case our text and remove stop-words from indexing
86
  def bm25_tokenizer(text):
 
98
 
99
  bm25 = BM25Okapi(tokenized_corpus)
100
 
101
+ def word_len(s):
102
+ return len([i for i in s.split(' ') if i])
103
+
104
  # This function will search all wikipedia articles for passages that
105
  # answer the query
106
  def search(query):
 
176
  #st.write(rs_final.strip())
177
  res.append(rs_final.strip())
178
 
179
+ #st.write(res[0:maxtags_sidebar])
180
+
181
+ res_clean = []
182
+ for out in result:
183
+ if len(out) > 20:
184
+ keywords = custom_kw_extractor.extract_keywords(out)
185
+ for key in keywords:
186
+ res_clean.append(key[0])
187
+ else:
188
+ res_clean.append(out)
189
+
190
+ show_out = []
191
+ for i in res_clean:
192
+ num = word_len(i)
193
+ if num > 1:
194
+ show_out.append(i)
195
+
196
+ st.write(show_out[0:maxtags_sidebar])
197
 
198
  st.write("## Results:")
199
  if st.button('Generated Expansion'):