"""Gradio demo for searching a document collection with character n-grams.

A query is expanded into character n-grams, searched against the Lucene index
in ``index``, and the hits' doc ids are used as row numbers into the dataset
stored in ``data``. Results are rendered as HTML, ``RESULTS_PER_PAGE`` at a
time, with a fixed row of ``NUM_PAGES`` page buttons.
"""
import html
import re
from itertools import chain

import gradio as gr
from datasets import load_from_disk
from nltk.util import everygrams
from pyserini.analysis import JWhiteSpaceAnalyzer
from pyserini.search.lucene import LuceneSearcher

searcher = LuceneSearcher("index")
# Queries are sent as space-separated n-grams (see tokenize_sentence), so the
# searcher is configured with a whitespace analyzer.
searcher.set_analyzer(JWhiteSpaceAnalyzer())

ds = load_from_disk("data")

# Static: the page buttons are created once when the UI is built; Gradio
# cannot create components dynamically, so this cannot change at runtime.
NUM_PAGES = 10
RESULTS_PER_PAGE = 5
TEXT_FIELD = "content"
METADATA_FIELD = "docid"
# Number of characters shown before the rest of a result is folded away.
SNIPPET_CHARS = 250
NO_RESULTS_HTML = "<p>No results.</p>"


def tokenize_word(word, min_len=2, max_len=4):
    """Return every character n-gram of ``word`` with min_len <= n <= max_len."""
    return [
        "".join(ngram)
        for ngram in everygrams(word, min_len=min_len, max_len=max_len)
    ]


def tokenize_sentence(sentence, min_len=2, max_len=4):
    """Return the n-grams of each whitespace-separated word, joined by spaces.

    The result is an empty string when no word is at least ``min_len``
    characters long.
    """
    return " ".join(
        chain.from_iterable(
            tokenize_word(word, min_len=min_len, max_len=max_len)
            for word in sentence.split()
        )
    )


def _highlight_pattern(query):
    """Compile a regex matching any word of ``query``; None if it has no words."""
    words = set(query.split())
    if not words:
        return None
    # Longest first so that a word is preferred over its own prefixes.
    ordered = sorted(words, key=lambda word: (-len(word), word))
    return re.compile("|".join(re.escape(word) for word in ordered))


def _mark(text, pattern):
    """HTML-escape ``text`` and wrap every match of ``pattern`` in <mark>.

    Matching runs on the raw text and each piece is escaped separately, so the
    inserted tags and the escaped entities can never be matched themselves.
    """
    if pattern is None:
        return html.escape(text)
    pieces = []
    cursor = 0
    for match in pattern.finditer(text):
        pieces.append(html.escape(text[cursor:match.start()]))
        pieces.append(f"<mark>{html.escape(match.group())}</mark>")
        cursor = match.end()
    pieces.append(html.escape(text[cursor:]))
    return "".join(pieces)


def result_html(result, meta, pattern=None):
    """Render one search result as an HTML fragment.

    Args:
        result: Raw (unescaped) text of the document.
        meta: Value shown as the result's heading.
        pattern: Optional compiled regex whose matches are highlighted.

    Returns:
        The heading, the first ``SNIPPET_CHARS`` characters of the text and,
        only if the text is longer, the remainder inside a collapsible
        ``<details>`` element.
    """
    # Slice the raw text before escaping/highlighting so that neither an HTML
    # entity nor a <mark> tag can be cut in half.
    head, tail = result[:SNIPPET_CHARS], result[SNIPPET_CHARS:]
    parts = [
        f"<p><b>{html.escape(str(meta))}</b></p>",
        f"<pre style='white-space: pre-wrap;'>{_mark(head, pattern)}</pre>",
    ]
    if tail:
        parts.append(
            "<details><summary>...</summary>"
            f"<pre style='white-space: pre-wrap;'>{_mark(tail, pattern)}</pre>"
            "</details>"
        )
    parts.append("<hr>")
    return "\n".join(parts)


def format_results(results, query):
    """Render a batch of rows as HTML with the words of ``query`` highlighted.

    ``results`` must provide the ``TEXT_FIELD`` and ``METADATA_FIELD`` columns
    as equally long sequences.
    """
    pattern = _highlight_pattern(query)
    return "\n".join(
        result_html(text, meta, pattern)
        for text, meta in zip(results[TEXT_FIELD], results[METADATA_FIELD])
    )


def _render_page(ix, page, query):
    """Return the HTML for zero-based page ``page`` of the ranked row ids ``ix``."""
    start = page * RESULTS_PER_PAGE
    # Slice the ranked ids rather than sharding the dataset: every page except
    # possibly the last one then holds exactly RESULTS_PER_PAGE results, and
    # only the rows that are displayed get selected.
    page_ix = ix[start:start + RESULTS_PER_PAGE]
    if not page_ix:
        return NO_RESULTS_HTML
    return format_results(ds.select(page_ix), query)


def _stored_ids(table):
    """Read the ranked row ids back from the hidden dataframe's value.

    NOTE(review): assumes the component returns a list of rows whose first row
    holds the ids (that is what page_0 stores); empty cells are skipped.
    """
    if not table:
        return []
    return [int(value) for value in table[0] if value is not None and value != ""]


def page_0(query):
    """Run a new search and render its first page.

    Returns the values for the results HTML, the hidden id table, the
    pagination row (hidden when there are no hits) and the query textbox.
    """
    untokenized_query = query or ""
    tokens = tokenize_sentence(untokenized_query)
    if not tokens:
        # Blank query, or only words too short to produce an n-gram.
        return NO_RESULTS_HTML, [[]], gr.update(visible=False), untokenized_query
    hits = searcher.search(tokens, k=NUM_PAGES * RESULTS_PER_PAGE)
    ix = [int(hit.docid) for hit in hits]
    results = _render_page(ix, 0, untokenized_query)
    return results, [ix], gr.update(visible=bool(ix)), untokenized_query


def page_i(i, ix, query):
    """Render zero-based page ``i`` of the ids stored by the last search."""
    ix = _stored_ids(ix)
    results = _render_page(ix, int(i), query)
    return results, [ix], query


# Unused CSS alternatives for the buttons: border:white;box-shadow:none;
with gr.Blocks(css="#b {min-width:15px;background:transparent;}") as demo:
    with gr.Row():
        gr.Markdown(value="##\n\nCode search\n\n")
    with gr.Row():
        with gr.Column(scale=1):
            # Hidden component that carries the ranked ids between callbacks.
            result_list = gr.Dataframe(type="array", visible=False, col_count=1)
        with gr.Column(scale=13):
            query = gr.Textbox(lines=1, max_lines=1, placeholder="Search…", label="")
        with gr.Column(scale=1):
            with gr.Row(scale=1):
                pass
            with gr.Row(scale=1):
                submit_btn = gr.Button("🔍", elem_id="b").style(full_width=False)
            with gr.Row(scale=1):
                pass
    with gr.Row():
        with gr.Column(scale=1):
            pass
        with gr.Column(scale=13):
            c = gr.HTML(label="Results")
            with gr.Row(visible=False) as pagination:
                page_buttons = [
                    gr.Button(value=str(number), elem_id="b").style(full_width=True)
                    for number in range(1, NUM_PAGES + 1)
                ]
        with gr.Column(scale=1):
            pass

    query.submit(fn=page_0, inputs=[query], outputs=[c, result_list, pagination, query])
    submit_btn.click(page_0, inputs=[query], outputs=[c, result_list, pagination, query])

    # Hidden constants 0..NUM_PAGES-1: each page button passes its own page
    # index to page_i through one of these inputs.
    with gr.Box(visible=False):
        nums = [gr.Number(i, visible=False, precision=0) for i in range(NUM_PAGES)]

    for page_button, page_number in zip(page_buttons, nums):
        page_button.click(
            fn=page_i,
            inputs=[page_number, result_list, query],
            outputs=[c, result_list, query],
        )

demo.launch(enable_queue=True, debug=True)