cakiki committed
Commit 30e2235 · 1 Parent(s): 08d9321

Update app.py

Files changed (1)
  1. app.py +9 -0
app.py CHANGED
@@ -2,10 +2,18 @@ import gradio as gr
 from datasets import load_from_disk
 from pyserini.search.lucene import LuceneSearcher
 from pyserini.analysis import JWhiteSpaceAnalyzer
+from itertools import chain
+from nltk.util import everygrams
 
 searcher = LuceneSearcher("index")
 searcher.set_analyzer(JWhiteSpaceAnalyzer())
 
+def tokenize_word(word, min_len=2, max_len=4):
+    return [''.join(ngram) for ngram in list(everygrams(word, min_len=min_len, max_len=max_len))]
+
+def tokenize_sentence(sentence, min_len=2, max_len=4):
+    return " ".join(chain(*[tokenize_word(word, min_len=min_len, max_len=max_len) for word in sentence.split()]))
+
 ds = load_from_disk("data")
 NUM_PAGES = 10 # STATIC. THIS CAN'T CHANGE BECAUSE GRADIO CAN'T DYNAMICALLY CREATE COMPONENTS.
 RESULTS_PER_PAGE = 5
@@ -23,6 +31,7 @@ def format_results(results):
     return "\n".join([result_html(result, meta) for result,meta in zip(results[TEXT_FIELD], results[METADATA_FIELD])])
 
 def page_0(query):
+    query = tokenize_sentence(query)
     hits = searcher.search(query, k=NUM_PAGES*RESULTS_PER_PAGE)
     ix = [int(hit.docid) for hit in hits]
     results = ds.select(ix).shard(num_shards=NUM_PAGES, index=0, contiguous=True) # no need to shard. split ix in batches instead. (would make sense if results was cacheable)
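
The added helpers expand every whitespace-separated query term into overlapping character n-grams (length 2 to 4) before the query reaches the searcher. A minimal sketch of what that expansion produces, assuming nltk is installed; the two helpers are copied from the diff above, and the sample query is hypothetical:

from itertools import chain
from nltk.util import everygrams

def tokenize_word(word, min_len=2, max_len=4):
    # Character n-grams of the word, from min_len up to max_len characters long.
    return [''.join(ngram) for ngram in list(everygrams(word, min_len=min_len, max_len=max_len))]

def tokenize_sentence(sentence, min_len=2, max_len=4):
    # Expand each whitespace-separated word into its n-grams and re-join with
    # spaces, so the result is still a plain whitespace-delimited query string.
    return " ".join(chain(*[tokenize_word(word, min_len=min_len, max_len=max_len) for word in sentence.split()]))

print(tokenize_sentence("solar"))  # hypothetical query
# Yields the 2-4 character n-grams of "solar" joined by spaces:
# so, ol, la, ar, sol, ola, lar, sola, olar (ordering depends on the NLTK version).

With the change in page_0, this expanded string is what searcher.search receives, so the JWhiteSpaceAnalyzer on the query side sees sub-word n-gram tokens rather than whole words.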