liujch1998 committed on
Commit
6477832
·
1 Parent(s): 0067690

Bug fix: find_result cache breaks down upon concurrent users

Browse files
Files changed (2) hide show
  1. README.md +1 -1
  2. app.py +39 -21
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 📖
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: gradio
7
- sdk_version: 4.36.0
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-nc-sa-4.0
 
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: gradio
7
+ sdk_version: 4.44.1
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-nc-sa-4.0
app.py CHANGED
@@ -150,16 +150,16 @@ def search_docs(index_desc, query, maxnum, max_disp_len, max_clause_freq, max_di
150
  docs.append([])
151
  return tuple([latency, tokenization_info, message] + metadatas + docs)
152
 
153
- find_result = None
154
-
155
- def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_tokens):
156
- global find_result
157
  if ' AND ' in query or ' OR ' in query: # CNF query
158
  find_result = process('find_cnf', index_desc, query=query, max_clause_freq=max_clause_freq, max_diff_tokens=max_diff_tokens)
159
  find_result['type'] = 'cnf'
160
  else: # simple query
161
  find_result = process('find', index_desc, query=query)
162
  find_result['type'] = 'simple'
 
 
 
163
  latency = '' if 'latency' not in find_result else f'{find_result["latency"]:.3f}'
164
  tokenization_info = format_tokenization_info(find_result)
165
  if 'error' in find_result:
@@ -167,7 +167,7 @@ def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_t
167
  idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
168
  metadata = ''
169
  doc = []
170
- return latency, tokenization_info, message, idx, metadata, doc
171
 
172
  if ' AND ' in query or ' OR ' in query: # CNF query
173
  ptrs_by_shard = find_result['ptrs_by_shard']
@@ -183,21 +183,20 @@ def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_t
183
  idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
184
  metadata = ''
185
  doc = []
186
- return latency, tokenization_info, message, idx, metadata, doc
187
  idx = random.randint(0, cnt_retrievable-1)
188
- metadata, doc = get_another_doc(index_desc, idx, max_disp_len)
189
  idx = gr.Number(minimum=0, maximum=cnt_retrievable-1, step=1, value=idx, interactive=True)
190
- return latency, tokenization_info, message, idx, metadata, doc
191
 
192
- def clear_search_docs_new():
193
- global find_result
194
- find_result = None
195
  idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
196
- return idx
197
 
198
- def get_another_doc(index_desc, idx, max_disp_len):
199
- global find_result
200
- if not (type(idx) == int and 0 <= idx and idx < find_result['cnt']):
201
  metadata = ''
202
  doc = []
203
  return metadata, doc
@@ -230,10 +229,10 @@ def get_another_doc(index_desc, idx, max_disp_len):
230
  with gr.Blocks() as demo:
231
  with gr.Column():
232
  gr.HTML(
233
- '''<h1 text-align="center">Infini-gram: An Engine for n-gram / ∞-gram Language Modeling with Trillion-Token Corpora</h1>
234
 
235
- <p style='font-size: 16px;'>This is an engine that processes n-gram / ∞-gram queries on massive text corpora. Please first select the corpus and the type of query, then enter your query and submit.</p>
236
- <p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng (Gary) Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
237
  <p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
238
  <p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
239
  '''
@@ -482,10 +481,29 @@ with gr.Blocks() as demo:
482
  search_docs_new_idx = gr.Slider(label='', minimum=0, maximum=0, step=1, value=0, interactive=False)
483
  search_docs_new_metadata = gr.Textbox(label='Metadata', lines=3, max_lines=3, interactive=False)
484
  search_docs_new_output = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
 
485
  search_docs_new_clear.add([search_docs_new_query, search_docs_new_latency, search_docs_new_tokenized, search_docs_new_message, search_docs_new_idx, search_docs_new_metadata, search_docs_new_output])
486
- search_docs_new_clear.click(clear_search_docs_new, inputs=[], outputs=[search_docs_new_idx], api_name=False)
487
- search_docs_new_submit.click(search_docs_new, inputs=[index_desc, search_docs_new_query, search_docs_new_max_disp_len, search_docs_new_max_clause_freq, search_docs_new_max_diff_tokens], outputs=[search_docs_new_latency, search_docs_new_tokenized, search_docs_new_message, search_docs_new_idx, search_docs_new_metadata, search_docs_new_output], api_name=False)
488
- search_docs_new_idx.input(get_another_doc, inputs=[index_desc, search_docs_new_idx, search_docs_new_max_disp_len], outputs=[search_docs_new_metadata, search_docs_new_output], api_name=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
 
490
  with gr.Row():
491
  gr.Markdown('''
 
150
  docs.append([])
151
  return tuple([latency, tokenization_info, message] + metadatas + docs)
152
 
153
+ def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_tokens, state):
 
 
 
154
  if ' AND ' in query or ' OR ' in query: # CNF query
155
  find_result = process('find_cnf', index_desc, query=query, max_clause_freq=max_clause_freq, max_diff_tokens=max_diff_tokens)
156
  find_result['type'] = 'cnf'
157
  else: # simple query
158
  find_result = process('find', index_desc, query=query)
159
  find_result['type'] = 'simple'
160
+
161
+ state = find_result
162
+
163
  latency = '' if 'latency' not in find_result else f'{find_result["latency"]:.3f}'
164
  tokenization_info = format_tokenization_info(find_result)
165
  if 'error' in find_result:
 
167
  idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
168
  metadata = ''
169
  doc = []
170
+ return latency, tokenization_info, message, idx, metadata, doc, state
171
 
172
  if ' AND ' in query or ' OR ' in query: # CNF query
173
  ptrs_by_shard = find_result['ptrs_by_shard']
 
183
  idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
184
  metadata = ''
185
  doc = []
186
+ return latency, tokenization_info, message, idx, metadata, doc, state
187
  idx = random.randint(0, cnt_retrievable-1)
188
+ metadata, doc = get_another_doc(index_desc, idx, max_disp_len, state)
189
  idx = gr.Number(minimum=0, maximum=cnt_retrievable-1, step=1, value=idx, interactive=True)
190
+ return latency, tokenization_info, message, idx, metadata, doc, state
191
 
192
+ def clear_search_docs_new(state):
193
+ state = None
 
194
  idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
195
+ return idx, state
196
 
197
+ def get_another_doc(index_desc, idx, max_disp_len, state):
198
+ find_result = state
199
+ if find_result is None or not (type(idx) == int and 0 <= idx and idx < find_result['cnt']):
200
  metadata = ''
201
  doc = []
202
  return metadata, doc
 
229
  with gr.Blocks() as demo:
230
  with gr.Column():
231
  gr.HTML(
232
+ '''<h1 text-align="center">Infini-gram: An Efficient Search Engine over the Massive Pretraining Datasets of Language Models</h1>
233
 
234
+ <p style='font-size: 16px;'>This engine does exact-match search over several open pretraining datasets of language models. Please first select the corpus and the type of query, then enter your query and submit.</p>
235
+ <p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
236
  <p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
237
  <p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
238
  '''
 
481
  search_docs_new_idx = gr.Slider(label='', minimum=0, maximum=0, step=1, value=0, interactive=False)
482
  search_docs_new_metadata = gr.Textbox(label='Metadata', lines=3, max_lines=3, interactive=False)
483
  search_docs_new_output = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
484
+ search_docs_state = gr.State(value=None)
485
  search_docs_new_clear.add([search_docs_new_query, search_docs_new_latency, search_docs_new_tokenized, search_docs_new_message, search_docs_new_idx, search_docs_new_metadata, search_docs_new_output])
486
+ search_docs_new_clear.click(
487
+ clear_search_docs_new,
488
+ inputs=[search_docs_state],
489
+ outputs=[search_docs_new_idx, search_docs_state]
490
+ )
491
+ search_docs_new_submit.click(
492
+ search_docs_new,
493
+ inputs=[index_desc, search_docs_new_query, search_docs_new_max_disp_len,
494
+ search_docs_new_max_clause_freq, search_docs_new_max_diff_tokens,
495
+ search_docs_state],
496
+ outputs=[search_docs_new_latency, search_docs_new_tokenized,
497
+ search_docs_new_message, search_docs_new_idx,
498
+ search_docs_new_metadata, search_docs_new_output,
499
+ search_docs_state]
500
+ )
501
+ search_docs_new_idx.input(
502
+ get_another_doc,
503
+ inputs=[index_desc, search_docs_new_idx, search_docs_new_max_disp_len,
504
+ search_docs_state],
505
+ outputs=[search_docs_new_metadata, search_docs_new_output]
506
+ )
507
 
508
  with gr.Row():
509
  gr.Markdown('''