File size: 5,478 Bytes
10be4e3
 
 
08d9321
30e2235
 
10be4e3
 
08d9321
 
30e2235
 
 
 
 
 
10be4e3
 
 
 
 
 
 
 
 
 
 
 
 
1a2fbfc
 
 
 
11be492
1a2fbfc
10be4e3
 
d6cb72b
30e2235
10be4e3
 
 
1a2fbfc
d6cb72b
10be4e3
d6cb72b
10be4e3
 
1a2fbfc
d6cb72b
10be4e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6cb72b
 
10be4e3
 
d6cb72b
 
 
 
 
 
 
 
 
 
10be4e3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import html
import re
from itertools import chain

import gradio as gr
from datasets import load_from_disk
from nltk.util import everygrams
from pyserini.analysis import JWhiteSpaceAnalyzer
from pyserini.search.lucene import LuceneSearcher

# Open the Lucene index found at the relative path "index".
searcher = LuceneSearcher("index")
# Switch the searcher to a whitespace analyzer -- presumably so the
# space-separated n-grams produced by tokenize_sentence are matched as-is
# instead of being re-tokenized (TODO confirm against how the index was built).
searcher.set_analyzer(JWhiteSpaceAnalyzer())

def tokenize_word(word, min_len=2, max_len=4):
    """Return the character n-grams of ``word`` as a list of strings.

    ``everygrams`` is asked for every n-gram whose length lies between
    ``min_len`` and ``max_len``; each n-gram it yields is glued back into a
    single string with ``''.join``.
    """
    grams = everygrams(word, min_len=min_len, max_len=max_len)
    return list(map(''.join, grams))

def tokenize_sentence(sentence, min_len=2, max_len=4):
    """Turn ``sentence`` into one space-separated string of character n-grams.

    The sentence is split on whitespace, every word is expanded with
    ``tokenize_word`` (same ``min_len``/``max_len``), and the n-grams of all
    words are concatenated in word order.
    """
    all_grams = []
    for token in sentence.split():
        all_grams.extend(tokenize_word(token, min_len=min_len, max_len=max_len))
    return " ".join(all_grams)

# Dataset loaded from the local "data" directory; the search handlers below
# pick rows out of it by integer index (the docids returned by the searcher).
ds = load_from_disk("data")
NUM_PAGES = 10 # STATIC. THIS CAN'T CHANGE BECAUSE GRADIO CAN'T DYNAMICALLY CREATE COMPONENTS. 
# Together with NUM_PAGES this fixes how many hits a search requests
# (NUM_PAGES * RESULTS_PER_PAGE).
RESULTS_PER_PAGE = 5 

# Dataset column whose text is shown (and highlighted) for each result.
TEXT_FIELD = "content"
# Dataset column rendered as the heading above each result.
METADATA_FIELD = "docid"

# Opening tag wrapped around every highlighted query word.
_HIGHLIGHT_OPEN = "<b style=\"color:#2a5cb3;font-weight: 700\">"
# Number of raw characters shown in the always-visible <summary> part.
_SUMMARY_CHARS = 250


def _highlight_pattern(query_words):
    """Compile one regex matching any of ``query_words``; None if there are none."""
    # Longest first so that a longer word wins over a shorter word that is its prefix.
    unique_words = sorted(set(query_words), key=lambda w: (-len(w), w))
    if not unique_words:
        return None
    return re.compile("(" + "|".join(re.escape(w) for w in unique_words) + ")")


def _mark_up(text, pattern):
    """HTML-escape ``text`` and wrap every match of ``pattern`` in a highlight tag."""
    if pattern is None:
        return html.escape(text)
    # With a single capturing group, re.split alternates between the text
    # outside matches (even indices) and the matches themselves (odd indices).
    pieces = pattern.split(text)
    return "".join(
        f"{_HIGHLIGHT_OPEN}{html.escape(piece)}</b>" if idx % 2 else html.escape(piece)
        for idx, piece in enumerate(pieces)
    )


def result_html(result, meta, query_words=()):
    """Render one search result as an HTML snippet.

    Args:
        result: Raw (unescaped) text of the result.
        meta: Value shown as the heading above the result; converted with
            ``str`` and HTML-escaped.
        query_words: Words to highlight inside the text (case-sensitive).
            Defaults to no highlighting.

    Returns:
        An HTML string: the heading, then a ``<details>`` element whose
        summary holds the first 250 characters of ``result`` and whose body
        holds the remainder.

    The text is split into summary/remainder *before* escaping and
    highlighting, so the cut can never land inside a tag or an entity. A
    query word that straddles the cut is therefore not highlighted.
    """
    pattern = _highlight_pattern(query_words)
    summary = _mark_up(result[:_SUMMARY_CHARS], pattern)
    remainder = _mark_up(result[_SUMMARY_CHARS:], pattern)
    return (
        f"<div style=\"color:#2a5cb3;font-weight: 500\"><u>{html.escape(str(meta))}</u></div><br>"
        f"<div><details><summary>{summary}...</summary><p>{remainder}</p></details></div><br><hr><br>"
    )


def format_results(results, query):
    """Render a batch of results as HTML, highlighting the words of ``query``.

    Args:
        results: Mapping-like object whose ``TEXT_FIELD`` and
            ``METADATA_FIELD`` entries are parallel sequences (one item per
            result).
        query: The user's query as typed; it is split on whitespace and each
            word is highlighted wherever it occurs in the result text.

    Returns:
        The per-result HTML snippets joined with newlines.
    """
    query_words = query.split()
    return "\n".join(
        result_html(text, meta, query_words)
        for text, meta in zip(results[TEXT_FIELD], results[METADATA_FIELD])
    )
    
def _render_page(ix, page, query):
    """Return the HTML for page ``page`` (0-based) of the ranked row indices ``ix``.

    Pages are fixed-size slices of ``RESULTS_PER_PAGE`` indices, so a short
    hit list fills the first pages completely instead of being spread thinly
    over all ``NUM_PAGES`` pages. Only the rows of the requested page are
    selected from the dataset. A page past the end of ``ix`` renders as an
    empty string.
    """
    start = page * RESULTS_PER_PAGE
    page_ix = ix[start:start + RESULTS_PER_PAGE]
    if not page_ix:
        return ""
    return format_results(ds.select(page_ix), query)


def page_0(query):
    """Run a new search and render its first page.

    Args:
        query: The query text as typed by the user.

    Returns:
        A 4-tuple matching the outputs wired to this handler: the first
        page's HTML, the ranked row indices wrapped in a one-row list (stored
        in the hidden dataframe for later page clicks), an update toggling the
        pagination row, and the unmodified query text.
    """
    untokenized_query = query
    tokenized_query = tokenize_sentence(query)
    # A blank query, or one made only of one-character words, produces no
    # n-grams; there is nothing to look up in that case.
    if tokenized_query:
        hits = searcher.search(tokenized_query, k=NUM_PAGES * RESULTS_PER_PAGE)
    else:
        hits = []
    ix = [int(hit.docid) for hit in hits]
    results = _render_page(ix, 0, untokenized_query)
    # Page buttons are only useful when there is something to page through.
    return results, [ix], gr.update(visible=bool(ix)), untokenized_query


def page_i(i, ix, query):
    """Render page ``i`` (0-based) of a previous search.

    Args:
        i: Page number coming from the hidden number component.
        ix: Contents of the hidden dataframe; its first row holds the ranked
            row indices stored by ``page_0``.
        query: The query text, used for highlighting.

    Returns:
        A 3-tuple: the page's HTML, the row indices wrapped in a one-row list
        again, and the query text.
    """
    ix = ix[0] if len(ix) else []
    results = _render_page(ix, int(i), query)
    return results, [ix], query
    
# UI layout and event wiring. The page buttons and their hidden page-number
# inputs are both generated from NUM_PAGES so the two can never drift apart.
with gr.Blocks(css="#b {min-width:15px;background:transparent;border:white;box-shadow:none;}") as demo:
    with gr.Row():
        gr.Markdown(value="""## <p style="text-align: center;"> Code search </p>""")
    with gr.Row():
        with gr.Column(scale=1):
            # Invisible dataframe used to carry the ranked row indices of the
            # current search from page_0 to the page_i handlers.
            result_list = gr.Dataframe(type="array", visible=False, col_count=1)
        with gr.Column(scale=13):
            query = gr.Textbox(lines=1, max_lines=1, placeholder="Search…", label="")
        with gr.Column(scale=1):
            # Empty rows above and below the search button.
            with gr.Row(scale=1):
                pass
            with gr.Row(scale=1):
                submit_btn = gr.Button("🔍", elem_id="b").style(full_width=False)
            with gr.Row(scale=1):
                pass

    with gr.Row():
        # Empty side columns keep the results in the middle of the page.
        with gr.Column(scale=1):
            pass
        with gr.Column(scale=13):
            c = gr.HTML(label="Results")
            # Hidden until the first search returns.
            with gr.Row(visible=False) as pagination:
                # TODO: previous/next arrow buttons ("◀" / "▶") are not implemented yet.
                page_buttons = [
                    gr.Button(value=str(page + 1), elem_id="b").style(full_width=True)
                    for page in range(NUM_PAGES)
                ]
        with gr.Column(scale=1):
            pass

    # Pressing Enter in the textbox and clicking the search button do the same thing.
    search_outputs = [c, result_list, pagination, query]
    query.submit(fn=page_0, inputs=[query], outputs=search_outputs)
    submit_btn.click(fn=page_0, inputs=[query], outputs=search_outputs)

    # Each page button passes its 0-based page number to page_i through an
    # invisible Number component.
    with gr.Box(visible=False):
        nums = [gr.Number(i, visible=False, precision=0) for i in range(NUM_PAGES)]
    for page_button, page_num in zip(page_buttons, nums):
        page_button.click(
            fn=page_i,
            inputs=[page_num, result_list, query],
            outputs=[c, result_list, query],
        )
demo.launch(enable_queue=True, debug=True)