Update app.py
app.py CHANGED
@@ -2,10 +2,18 @@ import gradio as gr
 from datasets import load_from_disk
 from pyserini.search.lucene import LuceneSearcher
 from pyserini.analysis import JWhiteSpaceAnalyzer
+from itertools import chain
+from nltk.util import everygrams
 
 searcher = LuceneSearcher("index")
 searcher.set_analyzer(JWhiteSpaceAnalyzer())
 
+def tokenize_word(word, min_len=2, max_len=4):
+    return [''.join(ngram) for ngram in list(everygrams(word, min_len=min_len, max_len=max_len))]
+
+def tokenize_sentence(sentence, min_len=2, max_len=4):
+    return " ".join(chain(*[tokenize_word(word, min_len=min_len, max_len=max_len) for word in sentence.split()]))
+
 ds = load_from_disk("data")
 NUM_PAGES = 10 # STATIC. THIS CAN'T CHANGE BECAUSE GRADIO CAN'T DYNAMICALLY CREATE COMPONENTS.
 RESULTS_PER_PAGE = 5
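The helpers added in this hunk break every whitespace-separated term into overlapping character n-grams of length 2-4, so a query tokenized this way can match partial words in an index that was presumably built with the same scheme (the JWhiteSpaceAnalyzer only splits on spaces and does no further analysis). A minimal sketch of what they return, using made-up query strings that are not part of the commit:

# Illustration only, not part of the commit.
tokenize_word("paris")
# nine n-grams of length 2-4: 'pa', 'ar', 'ri', 'is', 'par', 'ari', 'ris',
# 'pari', 'aris' (the exact ordering depends on the NLTK version)

tokenize_sentence("eiffel tower")
# a single space-separated string containing the n-grams of "eiffel"
# followed by the n-grams of "tower", ready for the whitespace analyzer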
@@ -23,6 +31,7 @@ def format_results(results):
     return "\n".join([result_html(result, meta) for result,meta in zip(results[TEXT_FIELD], results[METADATA_FIELD])])
 
 def page_0(query):
+    query = tokenize_sentence(query)
     hits = searcher.search(query, k=NUM_PAGES*RESULTS_PER_PAGE)
     ix = [int(hit.docid) for hit in hits]
     results = ds.select(ix).shard(num_shards=NUM_PAGES, index=0, contiguous=True) # no need to shard. split ix in batches instead. (would make sense if results was cacheable)
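The inline comment on the last changed line notes that sharding is unnecessary and that ix could simply be split into per-page batches. A rough sketch of that alternative, using a hypothetical page_results helper with an explicit page index argument (not part of this commit):

# Sketch only: slice the hit ids for one page instead of ds.select(ix).shard(...).
def page_results(query, page_index):
    query = tokenize_sentence(query)
    hits = searcher.search(query, k=NUM_PAGES * RESULTS_PER_PAGE)
    ix = [int(hit.docid) for hit in hits]
    start = page_index * RESULTS_PER_PAGE
    page_ix = ix[start:start + RESULTS_PER_PAGE]  # this page's doc ids only
    return format_results(ds.select(page_ix))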