Update app.py
Browse files
app.py
CHANGED
@@ -198,6 +198,32 @@ def enhance_text(kenlm_model, text):
|
|
198 |
return fixed_label
|
199 |
|
200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
def text_to_kenlm(
|
202 |
_text_file,
|
203 |
_order,
|
@@ -218,7 +244,7 @@ def text_to_kenlm(
|
|
218 |
raise gr.Error("Please add an order.")
|
219 |
|
220 |
gr.Info("Started to make the model, wait...")
|
221 |
-
|
222 |
results = []
|
223 |
|
224 |
# Read the file
|
@@ -229,14 +255,16 @@ def text_to_kenlm(
|
|
229 |
line = line.lower()
|
230 |
results.append(line)
|
231 |
|
|
|
|
|
232 |
# Write to intermediate file
|
233 |
-
intermediate_file =
|
234 |
with open(intermediate_file, "w") as f:
|
235 |
f.write(" ".join(results))
|
236 |
|
237 |
# Commands to run in the container
|
238 |
cmd = (
|
239 |
-
f"{kenlm_bin}/lmplz --temp_prefix
|
240 |
)
|
241 |
print(subprocess.run(cmd, shell=True))
|
242 |
|
@@ -281,6 +309,8 @@ def text_to_kenlm(
|
|
281 |
)
|
282 |
)
|
283 |
|
|
|
|
|
284 |
if _do_quantize:
|
285 |
file_name_quantized = (
|
286 |
f"/tmp/my_model-{_binary_type}-{_topk_words}-words.bin"
|
@@ -298,8 +328,27 @@ def text_to_kenlm(
|
|
298 |
print(subprocess.run(cmd, shell=True))
|
299 |
|
300 |
gr.Success("Model created.")
|
301 |
-
|
302 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
303 |
|
304 |
|
305 |
with gr.Blocks(
|
@@ -401,9 +450,22 @@ with gr.Blocks(
|
|
401 |
value=False,
|
402 |
)
|
403 |
|
404 |
-
|
405 |
-
|
406 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
407 |
|
408 |
gr.Button("Create").click(
|
409 |
text_to_kenlm,
|
@@ -420,7 +482,7 @@ with gr.Blocks(
|
|
420 |
topk_words,
|
421 |
do_limit_topk,
|
422 |
],
|
423 |
-
outputs=kenlm_model,
|
424 |
)
|
425 |
|
426 |
with gr.Row():
|
|
|
198 |
return fixed_label
|
199 |
|
200 |
|
def generate_files(results):
    """Derive vocabulary, token, and lexicon files (under /tmp) from corpus lines.

    Args:
        results: iterable of text lines, each a whitespace-separated sentence.

    Side effects — writes the three companion files a CTC/KenLM decoder expects:
        /tmp/model_vocab.txt   one unique word per line
        /tmp/model_tokens.txt  sorted unique characters, plus the "|" separator
        /tmp/model_lexicon.txt "word<TAB>space-separated spelling ending in |"
    """
    # Unique words, sorted so the output files are reproducible across runs
    # (a bare set()'s iteration order varies with hash randomization).
    words = sorted({word for line in results for word in line.split()})

    with open("/tmp/model_vocab.txt", "w") as f:
        f.write("\n".join(words))

    # Token inventory: every character seen in any word, plus the "|"
    # word-boundary token required by lexicon-based CTC decoders.
    tokens = set("|")
    for word in words:
        tokens.update(word)

    with open("/tmp/model_tokens.txt", "w") as f:
        f.write("\n".join(sorted(tokens)))

    # Lexicon: map each word to its space-separated spelling ending in "|".
    with open("/tmp/model_lexicon.txt", "w") as f:
        for word in words:
            spelled = " ".join(word + "|")
            f.write(f"{word}\t{spelled}\n")
227 |
def text_to_kenlm(
|
228 |
_text_file,
|
229 |
_order,
|
|
|
244 |
raise gr.Error("Please add an order.")
|
245 |
|
246 |
gr.Info("Started to make the model, wait...")
|
247 |
+
|
248 |
results = []
|
249 |
|
250 |
# Read the file
|
|
|
255 |
line = line.lower()
|
256 |
results.append(line)
|
257 |
|
258 |
+
generate_files(results)
|
259 |
+
|
260 |
# Write to intermediate file
|
261 |
+
intermediate_file = "/tmp/intermediate.txt"
|
262 |
with open(intermediate_file, "w") as f:
|
263 |
f.write(" ".join(results))
|
264 |
|
265 |
# Commands to run in the container
|
266 |
cmd = (
|
267 |
+
f"{kenlm_bin}/lmplz --temp_prefix {app_dir} --memory 90% --text {intermediate_file} --arpa /tmp/my_model.arpa -o {_order} --prune {_arpa_prune} --discount_fallback",
|
268 |
)
|
269 |
print(subprocess.run(cmd, shell=True))
|
270 |
|
|
|
309 |
)
|
310 |
)
|
311 |
|
312 |
+
generate_files(vocab_str.split("\n"))
|
313 |
+
|
314 |
if _do_quantize:
|
315 |
file_name_quantized = (
|
316 |
f"/tmp/my_model-{_binary_type}-{_topk_words}-words.bin"
|
|
|
328 |
print(subprocess.run(cmd, shell=True))
|
329 |
|
330 |
gr.Success("Model created.")
|
331 |
+
|
332 |
+
model_file = gr.DownloadButton(
|
333 |
+
value=Path(file_name), label=f"Download: {file_name}"
|
334 |
+
)
|
335 |
+
|
336 |
+
vocab_file = gr.DownloadButton(
|
337 |
+
value=Path("/tmp/model_vocab.txt"),
|
338 |
+
label="Created model_vocab.txt",
|
339 |
+
)
|
340 |
+
|
341 |
+
lexicon_file = gr.DownloadButton(
|
342 |
+
value=Path("/tmp/model_lexicon.txt"),
|
343 |
+
label="Created model_lexicon.txt",
|
344 |
+
)
|
345 |
+
|
346 |
+
tokens_file = gr.DownloadButton(
|
347 |
+
value=Path("/tmp/model_tokens.txt"),
|
348 |
+
label="Created model_tokens.txt",
|
349 |
+
)
|
350 |
+
|
351 |
+
return [model_file, vocab_file, lexicon_file, tokens_file]
|
352 |
|
353 |
|
354 |
with gr.Blocks(
|
|
|
450 |
value=False,
|
451 |
)
|
452 |
|
453 |
+
with gr.Column():
|
454 |
+
kenlm_model = gr.DownloadButton(
|
455 |
+
label="Created KenLM model",
|
456 |
+
)
|
457 |
+
|
458 |
+
vocab_file = gr.DownloadButton(
|
459 |
+
label="Created model_vocab.txt",
|
460 |
+
)
|
461 |
+
|
462 |
+
lexicon_file = gr.DownloadButton(
|
463 |
+
label="Created model_lexicon.txt",
|
464 |
+
)
|
465 |
+
|
466 |
+
tokens_file = gr.DownloadButton(
|
467 |
+
label="Created model_tokens.txt",
|
468 |
+
)
|
469 |
|
470 |
gr.Button("Create").click(
|
471 |
text_to_kenlm,
|
|
|
482 |
topk_words,
|
483 |
do_limit_topk,
|
484 |
],
|
485 |
+
outputs=[kenlm_model, vocab_file, lexicon_file, tokens_file],
|
486 |
)
|
487 |
|
488 |
with gr.Row():
|