Update app.py
Browse files
app.py
CHANGED
@@ -261,37 +261,38 @@ def text_to_kenlm(
|
|
261 |
with open(intermediate_file, "w") as f:
|
262 |
f.write(" ".join(results))
|
263 |
|
264 |
-
# Commands to run in the container
|
265 |
-
cmd = (
|
266 |
-
f"{kenlm_bin}/lmplz -T /tmp -S 80% --text {intermediate_file} --arpa /tmp/my_model.arpa -o {_order} --prune {_arpa_prune} --discount_fallback",
|
267 |
-
)
|
268 |
-
r = subprocess.run(cmd, shell=True)
|
269 |
-
print(r)
|
270 |
-
if r.returncode != 0:
|
271 |
-
raise gr.Error("Failed to create the model.")
|
272 |
-
|
273 |
file_name = "/tmp/my_model.arpa"
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
|
296 |
if _do_limit_topk:
|
297 |
file_name = f"/tmp/my_model-{_topk_words}-words.arpa"
|
@@ -302,7 +303,7 @@ def text_to_kenlm(
|
|
302 |
[
|
303 |
os.path.join(kenlm_bin, "filter"),
|
304 |
"single",
|
305 |
-
"model:{}".format(
|
306 |
file_name,
|
307 |
],
|
308 |
input=vocab_str.encode("utf-8"),
|
@@ -330,7 +331,7 @@ def text_to_kenlm(
|
|
330 |
if _do_quantize:
|
331 |
file_name = f"/tmp/my_model-{_binary_type}.bin"
|
332 |
|
333 |
-
cmd = f"{kenlm_bin}/build_binary -a {_binary_a_bits} -b {_binary_b_bits} -q {_binary_q_bits} -v {_binary_type} {
|
334 |
r = subprocess.run(cmd, shell=True)
|
335 |
print(r)
|
336 |
if r.returncode != 0:
|
|
|
261 |
with open(intermediate_file, "w") as f:
|
262 |
f.write(" ".join(results))
|
263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
264 |
file_name = "/tmp/my_model.arpa"
|
265 |
+
|
266 |
+
# Commands to run in the container
|
267 |
+
if not _do_limit_topk:
|
268 |
+
cmd = (
|
269 |
+
f"{kenlm_bin}/lmplz -T /tmp -S 80% --text {intermediate_file} --arpa /tmp/my_model.arpa -o {_order} --prune {_arpa_prune} --discount_fallback",
|
270 |
+
)
|
271 |
+
r = subprocess.run(cmd, shell=True)
|
272 |
+
print(r)
|
273 |
+
if r.returncode != 0:
|
274 |
+
raise gr.Error("Failed to create the model.")
|
275 |
+
|
276 |
+
file_name_fixed = "/tmp/my_model_correct.arpa"
|
277 |
+
|
278 |
+
# Fix the ARPA file: bump the unigram count by one and add a </s> entry copied from the first <s> line
|
279 |
+
with (
|
280 |
+
open(file_name, "r") as read_file,
|
281 |
+
open(file_name_fixed, "w") as write_file,
|
282 |
+
):
|
283 |
+
has_added_eos = False
|
284 |
+
for line in read_file:
|
285 |
+
if not has_added_eos and "ngram 1=" in line:
|
286 |
+
count = line.strip().split("=")[-1]
|
287 |
+
write_file.write(line.replace(f"{count}", f"{int(count) + 1}"))
|
288 |
+
elif not has_added_eos and "<s>" in line:
|
289 |
+
write_file.write(line)
|
290 |
+
write_file.write(line.replace("<s>", "</s>"))
|
291 |
+
has_added_eos = True
|
292 |
+
else:
|
293 |
+
write_file.write(line)
|
294 |
+
# Use the fixed ARPA file as file_name from here on
|
295 |
+
file_name = file_name_fixed
|
296 |
|
297 |
if _do_limit_topk:
|
298 |
file_name = f"/tmp/my_model-{_topk_words}-words.arpa"
|
|
|
303 |
[
|
304 |
os.path.join(kenlm_bin, "filter"),
|
305 |
"single",
|
306 |
+
"model:{}".format(file_name),
|
307 |
file_name,
|
308 |
],
|
309 |
input=vocab_str.encode("utf-8"),
|
|
|
331 |
if _do_quantize:
|
332 |
file_name = f"/tmp/my_model-{_binary_type}.bin"
|
333 |
|
334 |
+
cmd = f"{kenlm_bin}/build_binary -a {_binary_a_bits} -b {_binary_b_bits} -q {_binary_q_bits} -v {_binary_type} {file_name} {file_name}"
|
335 |
r = subprocess.run(cmd, shell=True)
|
336 |
print(r)
|
337 |
if r.returncode != 0:
|