cakiki committed
Commit 30e2235 · 1 Parent(s): 08d9321

Update app.py

Files changed (1)
  1. app.py +9 -0
app.py CHANGED
@@ -2,10 +2,18 @@ import gradio as gr
 from datasets import load_from_disk
 from pyserini.search.lucene import LuceneSearcher
 from pyserini.analysis import JWhiteSpaceAnalyzer
+from itertools import chain
+from nltk.util import everygrams
 
 searcher = LuceneSearcher("index")
 searcher.set_analyzer(JWhiteSpaceAnalyzer())
 
+def tokenize_word(word, min_len=2, max_len=4):
+    return [''.join(ngram) for ngram in list(everygrams(word, min_len=min_len, max_len=max_len))]
+
+def tokenize_sentence(sentence, min_len=2, max_len=4):
+    return " ".join(chain(*[tokenize_word(word, min_len=min_len, max_len=max_len) for word in sentence.split()]))
+
 ds = load_from_disk("data")
 NUM_PAGES = 10 # STATIC. THIS CAN'T CHANGE BECAUSE GRADIO CAN'T DYNAMICALLY CREATE COMPONENTS.
 RESULTS_PER_PAGE = 5
@@ -23,6 +31,7 @@ def format_results(results):
     return "\n".join([result_html(result, meta) for result,meta in zip(results[TEXT_FIELD], results[METADATA_FIELD])])
 
 def page_0(query):
+    query = tokenize_sentence(query)
     hits = searcher.search(query, k=NUM_PAGES*RESULTS_PER_PAGE)
     ix = [int(hit.docid) for hit in hits]
     results = ds.select(ix).shard(num_shards=NUM_PAGES, index=0, contiguous=True) # no need to shard. split ix in batches instead. (would make sense if results was cacheable)
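
The added helpers expand every whitespace-separated query term into overlapping character n-grams (length 2 to 4) before the query reaches the searcher. A minimal sketch of what that expansion produces, assuming nltk is installed; the two helpers are copied from the diff above, and the sample query is hypothetical:

from itertools import chain
from nltk.util import everygrams

def tokenize_word(word, min_len=2, max_len=4):
    # Character n-grams of the word, from min_len up to max_len characters long.
    return [''.join(ngram) for ngram in list(everygrams(word, min_len=min_len, max_len=max_len))]

def tokenize_sentence(sentence, min_len=2, max_len=4):
    # Expand each whitespace-separated word into its n-grams and re-join with
    # spaces, so the result is still a plain whitespace-delimited query string.
    return " ".join(chain(*[tokenize_word(word, min_len=min_len, max_len=max_len) for word in sentence.split()]))

print(tokenize_sentence("solar"))  # hypothetical query
# Yields the 2-4 character n-grams of "solar" joined by spaces:
# so, ol, la, ar, sol, ola, lar, sola, olar (ordering depends on the NLTK version).

With the change in page_0, this expanded string is what searcher.search receives, so the JWhiteSpaceAnalyzer on the query side sees sub-word n-gram tokens rather than whole words.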