Yehor committed on
Commit e6d2f62 · verified · 1 Parent(s): 0532bab

Update app.py

Files changed (1)
  1. app.py +71 -9
app.py CHANGED
@@ -198,6 +198,32 @@ def enhance_text(kenlm_model, text):
     return fixed_label


+def generate_files(results):
+    # Write words to a file
+    words = [r.split() for r in results]
+    words = list(set([w for r in words for w in r]))
+
+    with open("/tmp/model_vocab.txt", "w") as f:
+        f.write("\n".join(words))
+
+    # Generate tokens file
+    tokens = set()
+    for word in words:
+        tokens.update(list(word))
+    # add "|" token
+    tokens.add("|")
+
+    with open("/tmp/model_tokens.txt", "w") as f:
+        tokens_ordered = sorted(tokens)
+        f.write("\n".join(tokens_ordered))
+
+    # Generate lexicon file
+    with open("/tmp/model_lexicon.txt", "w") as f:
+        for word in words:
+            splitted_word = " ".join(list(word + "|"))
+            f.write(f"{word}\t{splitted_word}\n")
+
+
 def text_to_kenlm(
     _text_file,
     _order,
@@ -218,7 +244,7 @@ def text_to_kenlm(
         raise gr.Error("Please add an order.")

     gr.Info("Started to make the model, wait...")
-
+
     results = []

     # Read the file
@@ -229,14 +255,16 @@ def text_to_kenlm(
             line = line.lower()
             results.append(line)

+    generate_files(results)
+
     # Write to intermediate file
-    intermediate_file = f"/tmp/intermediate.txt"
+    intermediate_file = "/tmp/intermediate.txt"
     with open(intermediate_file, "w") as f:
         f.write(" ".join(results))

     # Commands to run in the container
     cmd = (
-        f"{kenlm_bin}/lmplz --temp_prefix /tmp --memory 90% --text {intermediate_file} --arpa /tmp/my_model.arpa -o {_order} --prune {_arpa_prune} --discount_fallback",
+        f"{kenlm_bin}/lmplz --temp_prefix {app_dir} --memory 90% --text {intermediate_file} --arpa /tmp/my_model.arpa -o {_order} --prune {_arpa_prune} --discount_fallback",
     )
     print(subprocess.run(cmd, shell=True))

@@ -281,6 +309,8 @@ def text_to_kenlm(
             )
         )

+        generate_files(vocab_str.split("\n"))
+
     if _do_quantize:
         file_name_quantized = (
             f"/tmp/my_model-{_binary_type}-{_topk_words}-words.bin"
@@ -298,8 +328,27 @@ def text_to_kenlm(
     print(subprocess.run(cmd, shell=True))

     gr.Success("Model created.")
-
-    return gr.DownloadButton(value=Path(file_name), label=f"Download: {file_name}")
+
+    model_file = gr.DownloadButton(
+        value=Path(file_name), label=f"Download: {file_name}"
+    )
+
+    vocab_file = gr.DownloadButton(
+        value=Path("/tmp/model_vocab.txt"),
+        label="Created model_vocab.txt",
+    )
+
+    lexicon_file = gr.DownloadButton(
+        value=Path("/tmp/model_lexicon.txt"),
+        label="Created model_lexicon.txt",
+    )
+
+    tokens_file = gr.DownloadButton(
+        value=Path("/tmp/model_tokens.txt"),
+        label="Created model_tokens.txt",
+    )
+
+    return [model_file, vocab_file, lexicon_file, tokens_file]


 with gr.Blocks(
@@ -401,9 +450,22 @@ with gr.Blocks(
                 value=False,
             )

-            kenlm_model = gr.DownloadButton(
-                label="Created KenLM model",
-            )
+            with gr.Column():
+                kenlm_model = gr.DownloadButton(
+                    label="Created KenLM model",
+                )
+
+                vocab_file = gr.DownloadButton(
+                    label="Created model_vocab.txt",
+                )
+
+                lexicon_file = gr.DownloadButton(
+                    label="Created model_lexicon.txt",
+                )
+
+                tokens_file = gr.DownloadButton(
+                    label="Created model_tokens.txt",
+                )

             gr.Button("Create").click(
                 text_to_kenlm,
@@ -420,7 +482,7 @@ with gr.Blocks(
                     topk_words,
                     do_limit_topk,
                 ],
-                outputs=kenlm_model,
+                outputs=[kenlm_model, vocab_file, lexicon_file, tokens_file],
             )

             with gr.Row():
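
For reference, the new generate_files helper (called on the corpus lines and again on vocab_str) writes three sidecar files next to the KenLM model: a word list (model_vocab.txt), a character inventory that also includes the "|" word separator (model_tokens.txt), and a lexicon mapping each word to its space-separated characters followed by "|" (model_lexicon.txt). Below is a minimal standalone sketch of that logic run against a toy in-memory corpus; the sample lines are hypothetical, and the words are sorted here only to make the output deterministic.

# Minimal sketch of the generate_files logic from this commit,
# applied to a toy corpus instead of the uploaded text file.
results = ["hello world", "world of models"]  # hypothetical sample lines

words = sorted({w for line in results for w in line.split()})
tokens = sorted(set("".join(words)) | {"|"})  # every character plus the "|" separator
lexicon = [f"{w}\t{' '.join(w + '|')}" for w in words]

print("\n".join(words))    # what model_vocab.txt would contain
print("\n".join(tokens))   # what model_tokens.txt would contain
print("\n".join(lexicon))  # e.g. "hello<TAB>h e l l o |"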